import sys
from typing import Any, List

import statsmodels.api as sm

# `PandasDataFrame` and `StatsmodelsMixedLM` are type aliases defined
# elsewhere in the host project.


def train(self, features_indep_df: PandasDataFrame, feature_target: List,
          model_labels: List = [0, 1], **kwargs: Any) -> StatsmodelsMixedLM:
    """Perform the training, using the Mixed Linear Model.

    :param features_indep_df: the independent features, which are inputted into the model.
    :param feature_target: the target feature, which is being estimated.
    :param model_labels: the target labels (default [0, 1]).
    :param kwargs: any other arguments that the selected reader may accept.
    :return: the trained model.
    """
    self._logger.debug("Train " + __name__)
    if 'groups' not in kwargs.keys():
        self._logger.error(__name__ + " - function argument is missing: 'groups'.")
        sys.exit()

    groups = features_indep_df[kwargs['groups']]
    exog = features_indep_df.drop(kwargs['groups'], axis=1)
    exog['Intercept'] = 1
    model_train = sm.MixedLM(endog=feature_target, exog=exog, groups=groups,
                             exog_re=exog['Intercept'])
    model_train = model_train.fit()
    print(model_train.summary())
    return model_train
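# A self-contained sketch of the model that train() sets up, on toy data.
# The column names ('age', 'site') and the simulated data are illustrative
# assumptions, not from the original source.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
toy = pd.DataFrame({'age': rng.normal(50, 10, 200),
                    'site': rng.integers(0, 5, 200)})
target = 0.05 * toy['age'] + rng.normal(0, 1, 200)

# Mirrors the body of train(): one grouping column becomes `groups`,
# the rest become the fixed-effects design with an added intercept,
# and the intercept is also used as the random effect.
groups = toy['site']
exog = toy.drop('site', axis=1)
exog['Intercept'] = 1
fit = sm.MixedLM(endog=target, exog=exog, groups=groups,
                 exog_re=exog['Intercept']).fit()
print(fit.summary())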
def run_mm(trunc_data, out_data_array, exog_vars, groupVar, i):
    print(i)
    try:
        out_data_array = sm.MixedLM(trunc_data, exog_vars, groupVar).fit().resid
    except ValueError:
        print("Error %d" % i)
        out_data_array = np.zeros((len(exog_vars)))
    return out_data_array
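# A self-contained usage sketch for run_mm on simulated data; the shapes,
# names, and the per-column driver loop are assumptions about how the
# function is called, not from the original source.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
n_obs, n_voxels = 120, 3
exog_vars = sm.add_constant(rng.normal(size=(n_obs, 2)))   # design with intercept
groupVar = np.repeat(np.arange(6), n_obs // 6)             # 6 hypothetical subjects
data_matrix = rng.normal(size=(n_obs, n_voxels))           # one column per voxel

residuals = np.zeros_like(data_matrix)
for i in range(n_voxels):
    residuals[:, i] = run_mm(data_matrix[:, i], residuals[:, i],
                             exog_vars, groupVar, i)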
def run_one_lmm(genotypes, phenotypes, groups):
    try:
        intercept = np.ones(genotypes.size)
        genotypes = genotypes.copy()
        x = np.stack([intercept, genotypes], axis=1)
        # Return the p-value for the genotype coefficient
        return sm.MixedLM(phenotypes, x, groups).fit().pvalues[1]
    except np.linalg.LinAlgError:
        # Could not fit the model; return NaN
        return float('nan')
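# A usage sketch scanning several SNPs with run_one_lmm on simulated data.
# The genotype-matrix layout (samples x SNPs) and the family grouping are
# assumptions, not from the original source.
import numpy as np

rng = np.random.default_rng(2)
n_samples, n_snps = 200, 5
G = rng.integers(0, 3, size=(n_samples, n_snps)).astype(float)  # 0/1/2 allele counts
pheno = rng.normal(size=n_samples)
fam = np.repeat(np.arange(20), n_samples // 20)                 # 20 family groups

pvals = np.array([run_one_lmm(G[:, j], pheno, fam) for j in range(n_snps)])
print(pvals)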
def fit(self, y, X):
    """Fit the model.

    Args:
        y (pandas.DataFrame): The vector of endogenous variable.
        X (pandas.DataFrame): The matrix of exogenous variables.

    """
    # Retrieving the data
    y, X, groups = self._prepare_data(y, X)

    # Creating the MixedLM model from StatsModels and fitting it
    model = sm.MixedLM(y, X, groups)
    try:
        fitted = model.fit(reml=self._reml)
    except np.linalg.LinAlgError as e:
        raise StatsError(str(e))

    parameters = fitted.params.index

    # Results about the model fit
    out = {
        "MODEL": {
            "log_likelihood": fitted.llf,
            "nobs": X.shape[0],
            "random_effects": self._format_re(fitted.random_effects),
        },
    }

    # Getting the confidence intervals
    conf_ints = fitted.conf_int()

    for param in parameters:
        # If GWAS, check that inference could be done on the SNP
        if param == "SNPs" and np.isnan(fitted.pvalues[param]):
            raise StatsError("Inference did not converge.")

        out[param] = {
            "coef": fitted.params[param],
            "std_err": fitted.bse[param],
            "lower_ci": conf_ints.loc[param, 0],
            "upper_ci": conf_ints.loc[param, 1],
            "z_value": fitted.tvalues[param],
            "p_value": fitted.pvalues[param],
        }

    return out
def crude_mixedML2(df_merged, x_feature, y_feature, covars):
    # TODO: Replace covars variable with actual selection of individual features
    df_merged = df_merged.replace(-9, np.nan).replace('-9', np.nan) \
                         .replace(999, np.nan).replace(888, np.nan)

    split_covars = covars.split('|')
    print(split_covars)

    data = add_confound(df_merged, x_feature, y_feature, split_covars)
    data['intercept'] = 1
    print(data.columns)

    X = data[[x for x in data.columns if x != y_feature and x != 'CohortType']]
    Y = data[y_feature]

    if X.shape[0] > 2:
        reg = sm.MixedLM(Y, X, groups=data["CohortType"],
                         exog_re=X["intercept"]).fit()
        ret = reg.summary()
    else:
        ret = 'error'

    fit_string = y_feature + '~'
    for x in X.columns:
        fit_string += ' + ' + str(x)
    fit_string = fit_string.replace('~ +', '~') + ' + (1|CohortType)'

    header = '<div> <b> Linear Mixed Model with Random Intercept </b> </div>'
    header += '<div> <b> Number samples: </b> ' + str(X.shape[0]) + '</div>'
    header += '<div> <b> Model: </b>' + fit_string + '</div>'
    header += '<div> <b> Group: </b> CohortType '

    if ret == 'error':
        # too few samples to fit the model; return the header with the error flag
        return header + 'error'

    htmls = header + ret.tables[0].to_html() + ret.tables[1].to_html()
    return htmls
def run_per_voxel(df, from_regress, labels):
    y_predicted_all = np.zeros((df.shape[0], ))
    kf = KFold(n_splits=5, shuffle=True)

    data = pd.concat([df, from_regress], axis=1)
    data = data.dropna()
    indices = list(data.index)

    # reset valid indices
    from_regress = from_regress.loc[indices, ].reset_index(drop=True)
    df = df.loc[indices, ].reset_index(drop=True)

    for train_index, test_index in kf.split(df):
        # prepare data
        training_X = from_regress.loc[train_index, ].reset_index(drop=True)
        training_y = df.loc[train_index, ]['activations'].reset_index(drop=True)
        training_y_groups = df.loc[train_index, ]['subject_number'].reset_index(drop=True)

        testing_X = from_regress.loc[test_index, ].reset_index(drop=True)
        testing_y = df.loc[test_index, ]['activations'].reset_index(drop=True)

        md = sm.MixedLM(endog=training_y, exog=training_X,
                        groups=training_y_groups, exog_re=training_X)
        mdf = md.fit()
        print(mdf.summary())

        y_hat_test = mdf.predict(testing_X)
        print("PREDICTION")
        print(y_hat_test[:10])
        y_predicted_all[test_index] = y_hat_test

    y_true = df['activations']
    print("PREDICTED SHAPE")
    print(y_predicted_all.shape)
    print(y_predicted_all[:10])
    print("TRUE SHAPE")
    print(y_true.shape)
    print(y_true[:10])

    # root-mean-squared error between predictions and observed activations
    rmse = np.sqrt(np.mean((y_predicted_all - y_true) ** 2))
    print("RMSE: " + str(rmse))
    return rmse.astype(np.float32)
def processData(df_original, reg_mlme=True):
    """
    Description: core processing of the data. It is divided into two main steps:
        step 1 applies a VAR to the fixed effects of each actor;
        step 2 applies an LMEM to the whole dataset and learns 5 different
        models, one for each of the labels.
    Input: dataframe transformed with the whole history
    Output: dataframe with the forecast for each participant.
    """
    df_flat = df_original.reset_index()

    # Actors definition
    # count participants and remove user no. 10
    actors = df_flat.actorId.unique().tolist()
    if 10 in actors:
        actors.remove(10)  # remove user no. 10 (insufficient info)

    # Attributes definition
    # categoricals = ['MainActivity', 'lat', 'lng', 'weatherId']
    activities = list(df_original.iloc[:, 19:].columns.values)
    random_effects = activities + ['Steps']
    fixed_effects = ['pressure', 'temp', 'humidity', 'hr_min', 'hr_avc',
                     'hr_mean', 'hr_std', 'hr_max', 'timeframe']
    labels = ['Abilities', 'Challenge', 'Productivity', 'Stress', 'Flow']

    # Dataframes to remember the min, max, mean, and std for each user
    target_min = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_max = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_mean = pd.DataFrame(np.nan, index=actors, columns=labels)
    target_std = pd.DataFrame(np.nan, index=actors, columns=labels)

    # STEP 1) VAR on fixed effects
    # -------------------------------
    window = 5  # window to predict
    df_future = pd.DataFrame()  # prepare the future dataframe
    var_attributes = [item for item in fixed_effects if item not in ['timeframe']]

    for user in actors:
        print("7.1 ----- Forecasting actor ARLearn" + str(user))
        df_user = df_original.xs(user, level='actorId')
        df = df_user[var_attributes]
        VARres = VARprocess(df)
        forecasts = VARforecast(df, VARres, window)
        forecasts['actorId'] = user
        forecasts['timeframe'] = forecasts.index.hour
        df_future = df_future.append(forecasts)

        # also record min, max, mean, and std per target
        for target in labels:
            target_min[target][user] = min(df_user[target])
            target_max[target][user] = max(df_user[target])
            target_mean[target][user] = df_user[target].mean()
            target_std[target][user] = df_user[target].std()

    # add intercept term
    df_future['Intercept'] = 1
    df_future = df_future.reset_index().set_index(['index', 'actorId']).sort_index()
    # ------------------------------- end VAR

    # STEP 2) Linear Mixed Effect Model
    # -------------------------------
    data = df_flat
    data['intercept'] = 1  # set the intercept term
    LMEM_models = []  # create a list of models, for multi output
    exog = data[fixed_effects + ['intercept']]  # the attributes from which to predict
    exog_re = data[random_effects]  # random effects
    groups = data['actorId']  # group definition

    # Training phase: five models, one per label
    for target in labels:
        endog = data[target]  # endogenous, i.e. the values we want to predict
        if (reg_mlme == False) and os.path.exists('model_' + target + '.pickle'):
            LMEM_results = pickle.load(open('model_' + target + '.pickle', 'rb'))
            LMEM_models.append(LMEM_results)
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                LMEM_model = sm.MixedLM(endog, exog, groups=groups, exog_re=exog_re)
                LMEM_results = LMEM_model.fit()
            LMEM_results.save('model_' + target + '.pickle', remove_data=False)
            LMEM_models.append(LMEM_results)

    # Coefficient importance: average the fixed-effect parameters over the models
    coeff = pd.DataFrame(index=range(0, len(exog.T)),
                         data={'coefficients': 0.0}, dtype='float').coefficients
    for i in range(0, len(coeff)):
        for j in range(0, len(LMEM_models)):
            coeff[i] = coeff[i] + LMEM_models[j].fe_params[i]
    coeff = coeff / len(LMEM_models)

    # Test phase for each of the five models
    df = df_future.reset_index()
    exog = df[fixed_effects]
    exog['intercept'] = 1
    for i in range(0, len(labels)):
        t = labels[i]
        df[t] = LMEM_models[i].predict(exog)
        # normalization per user: (x_max - x_min) * (x_i / 100) + x_min
        for u in actors:
            actual = df[df['actorId'] == u][t]
            rindex = df[df['actorId'] == u][t].index
            df.loc[rindex, t] = (target_max[t][u] - target_min[t][u]) * (actual / 100) + target_min[t][u]
        df[t] = df[t].astype('int')

    df = df.rename(columns={'index': 'timestamp'})
    return df
# This is one of the example data sets provided in the LMER R library.
# The outcome variable is the size of the tree, and the covariate used here
# is a time value. The data are grouped by tree.

data = sm.datasets.get_rdataset("Sitka", "MASS").data
endog = data["size"]
data["Intercept"] = 1
exog = data[["Intercept", "Time"]]

# Here is the statsmodels LME fit for a basic model with a random
# intercept. We are passing the endog and exog data directly to the LME
# init function as arrays. Also note that exog_re is specified explicitly
# in argument 4 as a random intercept (although this would also be the
# default if it were not specified).

md = sm.MixedLM(endog, exog, groups=data["tree"], exog_re=exog["Intercept"])
mdf = md.fit()
print(mdf.summary())

# Here is the same model fit in R using LMER:

# ```ipython
# %R
# data(Sitka, package="MASS")
# print(summary(lmer("size ~ Time + (1 | tree)", data=Sitka)))
# ```

# ```
# Linear mixed model fit by REML ['lmerMod']
# Formula: size ~ Time + (1 | tree)
#    Data: Sitka
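# For comparison, the same random-intercept model can be specified through
# statsmodels' formula interface; this is an equivalent alternative to the
# array-based call above, using the documented smf.mixedlm API.
import statsmodels.formula.api as smf

md_f = smf.mixedlm("size ~ Time", data, groups=data["tree"])
print(md_f.fit().summary())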
def causal_simulation(path, start_date, f_start_date, datafile="dataset_full.csv",
                      govpolicyfile="gov_dates_mandates.csv", num_date_omit=0,
                      print_graph=True):
    # NOTE: `horizon` is assumed to be defined at module scope.
    data = pd.read_csv(path + "/" + datafile)
    start_dt = datetime.strptime(start_date, '%m/%d/%y').strftime('%Y-%m-%d')
    print(start_dt)

    dateval = pd.date_range(start_dt, periods=horizon + 180).tolist()
    dates = pd.DataFrame({'dateval': dateval})
    dates['dateval'] = dates['dateval'].apply(
        lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d'))
    data['dateval'] = data['date'].apply(
        lambda x: datetime.strptime(str(x), '%Y%m%d').strftime('%Y-%m-%d'))

    if num_date_omit > 0:
        temp_start_date = datetime.strptime(f_start_date, '%m/%d/%y') - timedelta(days=30)
        temp_start_date = int(temp_start_date.strftime('%Y%m%d'))
        print(temp_start_date)
        temp = data.loc[(data['confirmed'].isna()) & (data['date'] > temp_start_date),
                        ['location_name', 'confirmed', 'dateval']] \
                   .sort_values(by='dateval').reset_index(drop=True)
        temp_start_date = temp.loc[0, 'dateval']
        temp_start_training_date = (datetime.strptime(temp_start_date, '%Y-%m-%d')
                                    - timedelta(days=num_date_omit)).strftime('%Y-%m-%d')
        print(temp_start_training_date)
        temp_start_simulation_date = (datetime.strptime(temp_start_date, '%Y-%m-%d')
                                      - timedelta(days=num_date_omit - 1)).strftime('%Y-%m-%d')

    data['state'] = data['province_state']
    data = psql.sqldf("""
        select province_state, country_region as country, date, confirmed,
               recovered, death, population, TAVG/10 as TAVG,
               a1.location_name, a1.dateval,
               country_region ||'-'|| state as state,
               case when TAVG<=0 then 1 else 0 end as is_freezing,
               case when TAVG>0 and TAVG/10<20 then 1 else 0 end as is_cold,
               case when TAVG>=20 and TAVG/10<35 then 1 else 0 end as is_warm,
               case when TAVG>=35 then 1 else 0 end as is_hot,
               case when TAVG>=20 then 1 else 0 end as temp_th,
               case when julianday(a1.dateval)>julianday('2020-03-20') then 1 else 0 end as gov_action
        from data a1
        """).drop_duplicates()
    data['Intercept'] = 1.0
    data = data[(data['dateval'] >= start_dt)]
    data['holdout'] = np.where(
        (data['dateval'] >= datetime.strptime(f_start_date, '%m/%d/%y').strftime('%Y-%m-%d')), 1, 0)
    print(data)
    data_save = data.copy()

    # data smoothing to correct irregular data issues, like dropped cumulative values
    data1 = pd.DataFrame()
    for state in data['state'].drop_duplicates():
        dat = data[(data['state'] == state)].sort_values(by=['dateval'])
        if len(dat['dateval']) > 1:
            dat = dat.fillna(0)
            dat = dat.loc[dat['confirmed'].ne(0.0).idxmax():]
            rho_data = dat[dat['confirmed'] > 0].sort_values(by=['dateval'])
            rho_data = rho_data[0:30]
            # accumulate the positive day-over-day increments of each series
            zz1 = zz2 = zz3 = 0.0
            for s in range(len(rho_data['dateval'])):
                if s > 0:
                    if rho_data['confirmed'].values[s] - rho_data['confirmed'].values[s - 1] > 0:
                        zz1 += rho_data['confirmed'].values[s] - rho_data['confirmed'].values[s - 1]
                    if rho_data['recovered'].values[s] - rho_data['recovered'].values[s - 1] > 0:
                        zz2 += rho_data['recovered'].values[s] - rho_data['recovered'].values[s - 1]
                    if rho_data['death'].values[s] - rho_data['death'].values[s - 1] > 0:
                        zz3 += rho_data['death'].values[s] - rho_data['death'].values[s - 1]
            rho = 0.0
            if (zz2 + zz3) > 0.0:
                rho = (zz1 + zz2 + zz3) / (zz2 + zz3)
            print("R_0 for " + state + " : " + str(rho))

            dat['lag_confirmed'] = 0.0
            dat['lag_recovered'] = 0.0
            dat['lag_death'] = 0.0
            dat['lag_removed'] = 0.0
            dat['d_recovered'] = 0.0
            dat['d_death'] = 0.0
            dat['d_removed'] = 0.0
            N = dat['population'].values[0]
            dat['R_0'] = rho
            dat['removed'] = dat['death'] + dat['recovered']
            for t in range(len(dat)):
                if t > 0:
                    dat['lag_confirmed'].values[t] = dat['confirmed'].values[t - 1]
                    dat['lag_removed'].values[t] = dat['removed'].values[t - 1]
                    dat['lag_recovered'].values[t] = dat['recovered'].values[t - 1]
                    dat['lag_death'].values[t] = dat['death'].values[t - 1]
                    dat['d_recovered'].values[t] = dat['recovered'].values[t] - dat['recovered'].values[t - 1]
                    dat['d_death'].values[t] = dat['death'].values[t] - dat['death'].values[t - 1]
                    dat['d_removed'].values[t] = dat['removed'].values[t] - dat['removed'].values[t - 1]
            data1 = data1.append(dat, ignore_index=True)

    data = data1.copy()
    rb = np.mean(data[data['R_0'] > 0]['R_0'])
    data['R_0'] = np.where(data['R_0'] == 0, rb, data['R_0'])
    data = data.fillna(0)
    data.to_csv("output/simulation_output/input_data.csv")

    train_cols = ['dateval', 'Intercept', 'state', 'TAVG', 'gov_action',
                  'is_freezing', 'is_cold', 'is_warm', 'is_hot',
                  'lag_confirmed', 'lag_death', 'lag_recovered',
                  'd_death', 'd_recovered', 'd_removed', 'removed']
    if num_date_omit > 0:
        data_train = data[(data['removed'] > 0) &
                          ((data['holdout'] == 0) |
                           (data['dateval'] <= temp_start_training_date))][train_cols]
        print(temp_start_training_date)
        print(max(data_train['dateval']))
    else:
        data_train = data[(data['removed'] > 0) & (data['holdout'] == 0)][train_cols]

    endog = data_train['d_removed']
    exog = data_train[['Intercept', 'gov_action', 'TAVG', 'lag_confirmed']]
    model = sm.MixedLM(endog, exog,
                       exog_re=exog[['Intercept', 'lag_confirmed']],
                       groups=data_train["state"])
    po_results = model.fit()
    print(po_results.summary())

    # Get the per-state random-effect coefficients
    k = []
    v1 = []
    v2 = []
    v3 = []
    d = po_results.random_effects
    for i in d:
        my_str = ''.join((ch if ch in '0123456789.-' else ' ') for ch in str(d[i]))
        numbers = [float(tok) for tok in my_str.split()]
        v1 = v1 + [str(d[i]).split(" ")[0]]
        if str(numbers[0]).strip() == '':
            v2 = v2 + [0.00]
            v3 = v3 + [0.0]
        else:
            v2 = v2 + [numbers[0]]
            v3 = v3 + [numbers[1]]
        k = k + [i]

    r_combined = pd.DataFrame({'state': k, 'coef_name': v1,
                               'coef_value': v2, 're_lag_confirmed': v3})
    r_combined['fe_Intercept'] = po_results.fe_params['Intercept']
    r_combined['Intercept'] = r_combined['fe_Intercept'] + r_combined['coef_value']
    r_combined['lag_confirmed'] = po_results.fe_params['lag_confirmed'] + r_combined["re_lag_confirmed"]
    r_combined['gov_action'] = po_results.fe_params['gov_action']
    r_combined['TAVG'] = po_results.fe_params['TAVG']
    r_combined = r_combined.fillna(0.0)
    r_combined.to_csv("output/simulation_output/recover_coefs.csv")
    mean_beta = np.mean(r_combined[r_combined['lag_confirmed'] > 0]['lag_confirmed'])
    r_combined['lag_confirmed'] = np.where(r_combined['lag_confirmed'] < 0,
                                           mean_beta, r_combined['lag_confirmed'])

    # Get prediction and bias
    t_dat = generate_dataset(data, r_combined)
    if num_date_omit > 0:
        t_dat = t_dat[(t_dat['removed'] > 0) &
                      ((t_dat['holdout'] == 0) |
                       (t_dat['dateval'] <= temp_start_training_date))]
    else:
        t_dat = t_dat[(t_dat['removed'] > 0) & (t_dat['holdout'] == 0)]
    pred_on_train = runSimulator(data1=t_dat, coefsdfR=r_combined,
                                 sir_names=['susceptible', 'confirmed', 'death', 'removed'],
                                 xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
                                 horizon1=60, date_gov_adjust=0,
                                 print_graph=print_graph)

    # == Adjust prediction of removed with bias
    pred_on_train['bias_removed'] = pred_on_train['pred_removed'] - pred_on_train['removed']
    mean_bias = pred_on_train.groupby('location_name')['bias_removed'].mean().reset_index()
    mean_bias.to_csv('output/simulation_output/bias.csv')
    loc_list = set(pred_on_train['location_name'])
    for loc in loc_list:
        bias = mean_bias.loc[mean_bias['location_name'] == loc, 'bias_removed'].iloc[0]
        if bias > 0:
            pred_on_train.loc[pred_on_train['location_name'] == loc, 'pred_removed'] -= bias
        elif bias < 0:
            pred_on_train.loc[pred_on_train['location_name'] == loc, 'pred_removed'] += bias
        loc_name = "".join(c for c in loc if c.isalnum())
        temp_for_plot = pred_on_train.loc[pred_on_train['location_name'] == loc,
                                          ['dateval', 'removed', 'pred_removed']]
        plt.figure(figsize=(12, 12))
        fig = temp_for_plot.plot(x='dateval', y=['removed', 'pred_removed'],
                                 rot=45, ax=plt.gca()).get_figure()
        fig.savefig(os.path.join('output/covid_plot/actual_pred/covid_plot_compare_' + loc_name + '.png'))
        fig.clf()

    pred_on_train['pred_removed'] = np.where(pred_on_train['pred_removed'] < 0, 0,
                                             pred_on_train['pred_removed'])
    pred_on_train.to_csv('output/simulation_output/adjusted_pred_on_train.csv')
    print(pred_on_train)

    # Adjust r_combined with bias
    mean_bias2 = mean_bias.copy()
    r_combined2 = r_combined.copy()
    mean_bias2['location_name'] = mean_bias2['location_name'].str.replace('[^a-zA-Z]', '')
    r_combined2['state'] = r_combined2['state'].str.replace('[^a-zA-Z]', '')
    loc_list = set(mean_bias2['location_name'])
    for loc in loc_list:
        bias = mean_bias2.loc[mean_bias2['location_name'] == loc, 'bias_removed'].iloc[0]
        if bias > 0:
            r_combined2.loc[r_combined2['state'] == loc, 'Intercept'] -= bias
        elif bias < 0:
            r_combined2.loc[r_combined2['state'] == loc, 'Intercept'] += bias
    r_combined = r_combined2

    # Build the per-state simulation inputs
    tti = 0
    states = data['state'].drop_duplicates().str.replace('[^a-zA-Z]', '')
    data2 = data.copy()
    data2['state'] = data2['state'].str.replace('[^a-zA-Z]', '')
    data3 = pd.DataFrame()
    for s in states:
        rc = r_combined[r_combined['state'] == s]
        dat = data2[data2['state'] == s]
        if len(dat) > 0 and len(rc) > 0:
            print(s)
            dat = dat.sort_values(by=['dateval'])
            beta = rc['lag_confirmed'].values[0]
            N = dat['population'].values[0]
            dat['susceptible'] = np.where(dat['holdout'] == 0, N + 0.0, 0.0)
            alpha = dat['R_0'].values[0] * beta
            dat['alpha'] = alpha
            if tti == 0:
                data3 = dat
            else:
                data3 = data3.append(dat, ignore_index=True)
            tti = tti + 1
    print(data3)
    print(data3['state'].drop_duplicates())

    # Run the simulator before and after the government-intervention adjustment
    date_start_sim = 20200510
    if num_date_omit > 0:
        sim_data = data3[(data3['holdout'] == 1) &
                         (data3['dateval'] >= temp_start_simulation_date)]
        print(temp_start_simulation_date)
        print(min(sim_data['dateval']))
    else:
        sim_data = data3[(data3['holdout'] == 1)]

    sim_data_output_after = runSimulator(
        data1=sim_data, coefsdfR=r_combined,
        sir_names=['susceptible', 'confirmed', 'death', 'removed'],
        xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
        horizon1=60, date_gov_adjust=date_start_sim, print_graph=print_graph)
    sim_data_output_after.to_csv("output/simulation_output/simulations_after_adjust_at_"
                                 + str(date_start_sim) + "_omitlastD_" + str(num_date_omit) + ".csv")

    sim_data_output_before = runSimulator(
        data1=sim_data, coefsdfR=r_combined,
        sir_names=['susceptible', 'confirmed', 'death', 'removed'],
        xnamesr=['Intercept', 'gov_action', 'TAVG', 'lag_confirmed'],
        horizon1=60, date_gov_adjust=0, print_graph=print_graph)
    sim_data_output_before.to_csv("output/simulation_output/simulations_before_adjust_at_"
                                  + str(date_start_sim) + "_omitlastD_" + str(num_date_omit) + ".csv")

    sim_data_compare = sim_data_output_after.merge(
        sim_data_output_before,
        on=['index', 'province_state', 'country', 'date', 'dateval', 'location_name'],
        suffixes=('_after', '_before'))
    sim_data_compare['diff_susceptible'] = sim_data_compare['pred_susceptible_after'] - sim_data_compare['pred_susceptible_before']
    sim_data_compare['diff_confirmed'] = sim_data_compare['pred_confirmed_after'] - sim_data_compare['pred_confirmed_before']
    sim_data_compare['diff_removed'] = sim_data_compare['pred_removed_after'] - sim_data_compare['pred_removed_before']
    sim_data_compare = sim_data_compare.loc[:, ['province_state', 'country', 'date', 'dateval',
                                                'location_name', 'pred_susceptible_after',
                                                'pred_confirmed_after', 'pred_removed_after',
                                                'pred_susceptible_before', 'pred_confirmed_before',
                                                'pred_removed_before', 'diff_susceptible',
                                                'diff_confirmed', 'diff_removed']]
    sim_data_compare.to_csv("output/simulation_output/simulations_compare" + str(date_start_sim)
                            + "_omitlastD_" + str(num_date_omit) + ".csv")

    if print_graph == True:
        for location in sim_data_compare['location_name'].drop_duplicates():
            dat = sim_data_compare[(sim_data_compare['location_name'] == location)].sort_values(by=['date'])
            # plot results
            plt.figure(figsize=(15, 10))
            xtick_locator = AutoDateLocator()
            xtick_formatter = AutoDateFormatter(xtick_locator)
            date_list = pd.to_datetime(dat['dateval'])
            ax = plt.axes()
            ax.xaxis.set_major_locator(xtick_locator)
            ax.xaxis.set_major_formatter(xtick_formatter)
            plt.plot(date_list, dat['diff_susceptible'], 'b-')
            plt.plot(date_list, dat['diff_confirmed'], 'r--')
            plt.plot(date_list, dat['diff_removed'], 'g--')
            plt.xlabel('Time')
            plt.ylabel('Populations')
            plt.title('Compare Before/After Gov. Intervention Adjust at '
                      + ''.join(e for e in location if e.isalnum())
                      + ' : ' + str(date_start_sim))
            plt.legend(['Diff Susceptible', 'Diff Confirmed', 'Diff Removed'])
            plt.savefig(os.path.join('output/covid_plot/covid_plot_compare_' + "omitlastD_"
                                     + str(num_date_omit)
                                     + ''.join(e for e in location if e.isalnum())
                                     + '_' + str(date_start_sim) + '.png'))
            plt.clf()
            plt.close()
def run(opts):
    indexCol = opts.indexcolumns[0]

    # read csv(s)
    num_csv = len(opts.inputcsv)
    pdCSV = pd.read_csv(opts.inputcsv[0], delimiter=',', index_col=[indexCol])
    if num_csv > 1:
        for i in range(int(num_csv - 1)):
            tempCSV = pd.read_csv(opts.inputcsv[int(i + 1)], delimiter=',',
                                  index_col=[indexCol])
            pdCSV = pd.concat([pdCSV, tempCSV], axis=1, join_axes=[pdCSV.index])

    # interaction variables
    if opts.interactionvars:
        # standardize each variable appearing in an interaction term
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for scale_var in interaction_vars:
                var_temp = scalearr(pdCSV[scale_var])
                var_tempname = '%s_std' % scale_var
                if var_tempname not in opts.exogenousvariables:
                    pdCSV[var_tempname] = var_temp
                    opts.exogenousvariables.append(var_tempname)
        # build the product terms from the standardized variables
        for int_terms in opts.interactionvars:
            interaction_vars = int_terms.split("*")
            for i, scale_var in enumerate(interaction_vars):
                if i == 0:
                    int_temp = pdCSV['%s_std' % scale_var]
                    int_tempname = '%s' % scale_var
                else:
                    int_temp = int_temp * pdCSV['%s_std' % scale_var]
                    int_tempname = int_tempname + '.X.' + scale_var
            if int_tempname not in opts.exogenousvariables:
                pdCSV[int_tempname] = int_temp
                opts.exogenousvariables.append(int_tempname)
            int_temp = None
        print(opts.exogenousvariables)

    # output column/variable names
    if opts.outputcolumnnames:
        for counter, roi in enumerate(pdCSV.columns):
            print("[%d] : %s" % (counter, roi))
        quit()

    # set grouping variables
    if opts.groupingvariable:
        if len(opts.groupingvariable) > 1:
            pdCSV = russiandolls(opts.groupingvariable, pdCSV)
            groupVar = 'group_list'
        else:
            groupVar = opts.groupingvariable[0]

    # stats functions
    if opts.outstats:
        if not opts.statsmodel:
            print("A statistical model must be specified. -m {model}")
            quit()
        if not opts.range:
            print("Range must be specified. -r {start} {stop}")
            quit()
        elif len(opts.range) != 2:
            print("Range must have start and stop. -r {start} {stop}")
            quit()
        else:
            roi_names = []
            t_values = []
            p_values = []
            icc_values = []
        if not opts.exogenousvariables:
            print("The exogenous (independent) variables must be specified. "
                  "e.g., -exog pred1 pred2 age")
            quit()

        if opts.mediation:
            medvars = ['%s' % opts.mediation[1], '%s' % opts.mediation[2]]
            exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
            # build null array
            pdCSV = omitmissing(pdDF=pdCSV,
                                endog_range=opts.range,
                                exogenous=strip_ones(exog_vars),
                                groups=medvars)
            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    groups=opts.groupingvariable)
            # rebuild exog_vars with correct length
            exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                        opts.scaleexog == True)
            leftvar = pdCSV[opts.mediation[1]]
            rightvar = pdCSV[opts.mediation[2]]
            y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]

            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                # ############### MM mediation ################
                t_valuesA = []
                t_valuesB = []
                if opts.mediation[0] == 'I':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((leftvar, rightvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_A,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesA.append(mdl_fit.tvalues[1])
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_B,
                                             pdCSV[groupVar]).fit()
                        t_valuesB.append(mdl_fit.tvalues[1])
                elif opts.mediation[0] == 'M':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_A,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesA.append(mdl_fit.tvalues[1])
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], EXOG_B,
                                             pdCSV[groupVar]).fit()
                        t_valuesB.append(mdl_fit.tvalues[1])
                else:
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    # path A
                    mdl_fit = sm.MixedLM(rightvar, EXOG_A, pdCSV[groupVar]).fit()
                    t_valuesA = mdl_fit.tvalues[1]
                    # path B
                    for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                        mdl_fit = sm.MixedLM(pdCSV[pdCSV.columns[i]], exog_vars,
                                             pdCSV[groupVar]).fit()
                        roi_names.append(pdCSV.columns[i])
                        t_valuesB.append(mdl_fit.tvalues[1])
                z_values = special_calc_sobelz(np.array(t_valuesA),
                                               np.array(t_valuesB), alg="aroian")
                p_values = norm.sf(abs(z_values))
                p_FDR = multipletests(p_values, method='fdr_bh')[1]
            else:
                # ############### LM mediation ################
                if opts.mediation[0] == 'I':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((leftvar, rightvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = full_glm_results(y, EXOG_A, only_tvals=True)[1, :]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                elif opts.mediation[0] == 'M':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = full_glm_results(y, EXOG_A, only_tvals=True)[1, :]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                elif opts.mediation[0] == 'Y':
                    EXOG_A = sm.add_constant(
                        np.column_stack((leftvar, strip_ones(exog_vars))))
                    EXOG_B = np.column_stack((rightvar, leftvar))
                    EXOG_B = sm.add_constant(
                        np.column_stack((EXOG_B, strip_ones(exog_vars))))
                    y = pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1]
                    # path A
                    t_valuesA = sm.OLS(rightvar, EXOG_A).fit().tvalues[1]
                    # path B
                    t_valuesB = full_glm_results(y, EXOG_B, only_tvals=True)[1, :]
                else:
                    print("Error: Invalid mediation type.")
                    quit()
                z_values = special_calc_sobelz(np.array(t_valuesA),
                                               np.array(t_valuesB), alg="aroian")
                p_values = norm.sf(abs(z_values))
                p_FDR = multipletests(p_values, method='fdr_bh')[1]

            if opts.permutation:
                if opts.groupingvariable:
                    p_FWER = run_permutations_med(
                        endog_arr=y, exog_vars=exog_vars,
                        medtype=opts.mediation[0], leftvar=leftvar,
                        rightvar=rightvar, num_perm=int(opts.permutation[0]),
                        stat_arr=z_values, uniq_groups=pdCSV[groupVar],
                        return_permutations=True)
                else:
                    p_FWER = run_permutations_med(
                        endog_arr=y, exog_vars=exog_vars,
                        medtype=opts.mediation[0], leftvar=leftvar,
                        rightvar=rightvar, num_perm=int(opts.permutation[0]),
                        stat_arr=z_values, uniq_groups=None,
                        return_permutations=True)

            roi_names = []
            for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                roi_names.append(pdCSV.columns[i])

            columnnames = ['Zval', 'pval', 'pFDR']
            columndata = np.column_stack((z_values, p_values))
            columndata = np.column_stack((columndata, p_FDR))
            if opts.permutation:
                columnnames.append('pFWER')
                columndata = np.column_stack((columndata, p_FWER))
            pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                 columns=columnnames)
            pd_DF.to_csv(opts.outstats[0], index_label='ROI')
        else:
            if opts.statsmodel == 'mixedmodel' or opts.statsmodel == 'mm':
                # ############### MIXED MODEL ################
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                # build null array
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    exogenous=strip_ones(exog_vars),
                                    groups=opts.groupingvariable)
                # rebuild exog_vars with correct length
                if opts.scaleexogwithingroup:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True,
                                                scale_groups=pdCSV[groupVar])
                else:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True)
                exog_re = None
                if opts.exogintercept:
                    exog_re = dmatrix("1+%s" % opts.exogintercept[0], pdCSV)
                for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                    mdl_fit = sm.MixedLM(endog=pdCSV[pdCSV.columns[i]],
                                         exog=exog_vars,
                                         groups=pdCSV[groupVar],
                                         exog_re=exog_re).fit()
                    roi_names.append(pdCSV.columns[i])
                    t_values.append(mdl_fit.tvalues[1:])
                    p_values.append(mdl_fit.pvalues[1:])
                    icc_values.append(
                        np.array(mdl_fit.cov_re / (mdl_fit.cov_re + mdl_fit.scale)))
                    if opts.plotresids:
                        os.system('mkdir -p resid_plots')
                        plot_residuals(residual=mdl_fit.resid,
                                       fitted=mdl_fit.fittedvalues,
                                       basename=('%s_mm_%s' % (str(i).zfill(4),
                                                               pdCSV.columns[i])),
                                       outdir='resid_plots/')
                p_values = np.array(p_values)
                t_values = np.array(t_values)
                p_FDR = np.zeros_like(p_values)
                p_values[np.isnan(p_values)] = 1
                for col in range(p_FDR.shape[1]):
                    p_FDR[:, col] = multipletests(p_values[:, col],
                                                  method='fdr_bh')[1]
                columnnames = []
                for colname in opts.exogenousvariables:
                    columnnames.append('tval_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('tval_re1')
                    columnnames.append('tval_re1Xre2')
                    columnnames.append('tval_re2')
                else:
                    columnnames.append('tval_groupRE')
                for colname in opts.exogenousvariables:
                    columnnames.append('pval_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('pval_re1')
                    columnnames.append('pval_re1Xre2')
                    columnnames.append('pval_re2')
                else:
                    columnnames.append('pval_groupRE')
                for colname in opts.exogenousvariables:
                    columnnames.append('pFDR_%s' % colname)
                if opts.exogintercept:
                    columnnames.append('pFDR_re1')
                    columnnames.append('pFDR_re1Xre2')
                    columnnames.append('pFDR_re2')
                else:
                    columnnames.append('pFDR_groupRE')
                if not opts.exogintercept:
                    columnnames.append('ICC_groupRE')
                columndata = np.column_stack((t_values, p_values))
                columndata = np.column_stack((columndata, p_FDR))
                if not opts.exogintercept:
                    columndata = np.column_stack(
                        (columndata, np.array(icc_values).flatten()))
                pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                     columns=columnnames)
                pd_DF.to_csv(opts.outstats[0], index_label='ROI')
            else:
                # ############### LINEAR MODEL ################
                exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV)
                # build null array
                pdCSV = omitmissing(pdDF=pdCSV,
                                    endog_range=opts.range,
                                    exogenous=strip_ones(exog_vars))
                # rebuild exog_vars with correct length
                if opts.scaleexogwithingroup:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True,
                                                scale_groups=pdCSV[groupVar])
                else:
                    exog_vars = create_exog_mat(opts.exogenousvariables, pdCSV,
                                                opts.scaleexog == True)
                y = np.array(pdCSV.iloc[:, int(opts.range[0]):int(opts.range[1]) + 1])
                if opts.plotresids:
                    f_values, t_values, p_values, R2, R2_adj, resids, fitted = \
                        full_glm_results(y, exog_vars, return_resids=True)
                else:
                    np.savetxt('temp_int.csv',
                               orthog_columns(strip_ones(exog_vars)),
                               delimiter=',')
                    f_values, t_values, p_values, R2, R2_adj = \
                        full_glm_results(y, exog_vars)
                if opts.permutation:
                    if opts.groupingvariable:
                        p_FWER = run_permutations(
                            endog_arr=y, exog_vars=exog_vars,
                            num_perm=int(opts.permutation[0]),
                            stat_arr=t_values, uniq_groups=pdCSV[groupVar],
                            return_permutations=True)
                    else:
                        p_FWER = run_permutations(
                            endog_arr=y, exog_vars=exog_vars,
                            num_perm=int(opts.permutation[0]),
                            stat_arr=t_values, uniq_groups=None,
                            return_permutations=True)
                    p_FWER = p_FWER[1:, :].T
                t_values = t_values[1:, :].T  # ignore intercept
                p_values = p_values[1:, :].T  # ignore intercept
                roi_names = []
                for i in range(int(opts.range[0]), int(opts.range[1]) + 1):
                    roi_names.append(pdCSV.columns[i])
                p_FDR = np.zeros_like(p_values)
                p_values[np.isnan(p_values)] = 1
                for col in range(p_FDR.shape[1]):
                    p_FDR[:, col] = multipletests(p_values[:, col],
                                                  method='fdr_bh')[1]
                columnnames = ['Fvalue', 'R2', 'R2adj']
                for colname in opts.exogenousvariables:
                    columnnames.append('tval_%s' % colname)
                for colname in opts.exogenousvariables:
                    columnnames.append('pval_%s' % colname)
                for colname in opts.exogenousvariables:
                    columnnames.append('pFDR_%s' % colname)
                columndata = np.column_stack((f_values[:, np.newaxis], R2))
                columndata = np.column_stack((columndata, R2_adj))
                columndata = np.column_stack((columndata, t_values))
                columndata = np.column_stack((columndata, p_values))
                columndata = np.column_stack((columndata, p_FDR))
                if opts.permutation:
                    for colname in opts.exogenousvariables:
                        columnnames.append('pFWER_%s' % colname)
                    columndata = np.column_stack((columndata, p_FWER))
                pd_DF = pd.DataFrame(data=columndata, index=roi_names,
                                     columns=columnnames)
                pd_DF.to_csv(opts.outstats[0], index_label='ROI')
                if opts.plotresids:
                    os.system('mkdir -p resid_plots')
                    for i, roi in enumerate(np.array(roi_names)):
                        plot_residuals(
                            residual=resids[:, i], fitted=fitted[:, i],
                            basename=('%s_lm_%s' % (str(i + int(opts.range[0])).zfill(4), roi)),
                            outdir='resid_plots/')

    if opts.savecsv:
        pdCSV.to_csv(opts.savecsv[0])
vc = {'x': '0 + x'}
md = smf.mixedlm("y ~ 1 + x", test_df, groups=test_df["unit"],
                 vc_formula=vc, re_formula="~ 1")  # random intercept
mdf = md.fit()
mdf.summary()
mdf.scale

reffs = mdf.random_effects
smf_b_i = [reffs[i][1] for i in range(1, 31)]
smf_beta_i = mdf.params['x'] + smf_b_i

md2 = sm.MixedLM(test_df["y"], test_df[["Intercept", "x"]],
                 groups=test_df["unit"],
                 exog_re=test_df[["Intercept", "x"]])
mdf2 = md2.fit()
mdf2.summary()

# Stage 2: trying to get it to work with Keras
enc = OneHotEncoder()
enc.fit(test_df['unit'].values.reshape(-1, 1))
unit_onehot = enc.transform(test_df['unit'].values.reshape(-1, 1))
unit_onehot = unit_onehot.toarray()               # random intercepts design
unit_x = np.dot(np.diag(test_df['x']), unit_onehot)  # random coefs design
n_units = unit_onehot.shape[1]
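# A plausible continuation for the Keras stage (an assumption, not from the
# original source): concatenate the one-hot random-intercept columns and the
# per-unit slope columns into a single random-effects design matrix that a
# downstream network could consume alongside the fixed-effects design.
Z = np.hstack([unit_onehot, unit_x])            # (n_obs, 2 * n_units) random-effects design
X_fixed = test_df[["Intercept", "x"]].values    # fixed-effects design
print(Z.shape, X_fixed.shape)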
    return mat, colnames

# Then we set up the variance components using the VCSpec class.

vcm = df.groupby("group1").apply(f).to_list()
mats = [x[0] for x in vcm]
colnames = [x[1] for x in vcm]
names = ["group2"]
vcs = VCSpec(names, [colnames], [mats])

# Finally we fit the model. It can be seen that the results of the
# two fits are identical.

oo = np.ones(df.shape[0])
model2 = sm.MixedLM(df.y, oo, exog_re=oo, groups=df.group1, exog_vc=vcs)
result2 = model2.fit()
print(result2.summary())

# ## Crossed analysis
#
# In a crossed analysis, the levels of one group can occur in any
# combination with the levels of the other group. The groups in
# Statsmodels MixedLM are always nested, but it is possible to fit a
# crossed model by having only one group, and specifying all random
# effects as variance components. Many, but not all, crossed models
# can be fit in this way. The function below generates a crossed data
# set with two levels of random structure.

def generate_crossed(n_group1=100,
csvfile = '..\\TreeNobXdate.csv'
col = 0  # use the first column from the file
SurveyYear = 2005
MaxYear = 1980
MinYear = 1875

# Generate an age-frequency, oldest ages first
data = GetAgeFreq(csvfile, col=col, SurveyYear=SurveyYear,
                  MaxYear=MaxYear, MinYear=MinYear)
nyear = len(data)

# Add a column of ones to represent the constant
yb = np.array([np.array([int(1), int(t)]) for t in range(nyear)])

glm = sm.GLM(data, yb, family=sm.families.Poisson(link=sm.families.links.Log()))
print(glm.fit().summary())
print('#############')

groups = [t for t in range(nyear)]
# Note: sm.MixedLM fits a linear (Gaussian) mixed model; it does not accept
# a GLM family or link, so none is passed here.
MLM = sm.MixedLM(data, yb, groups=groups)
print(MLM.fit().summary())
print('#############')
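# If a Poisson mixed model was actually intended above, statsmodels provides
# an approximate Bayesian version. A minimal sketch follows; the choice of a
# random intercept per decade is an assumption made purely for illustration.
from statsmodels.genmod.bayes_mixed_glm import PoissonBayesMixedGLM

decade = np.arange(nyear) // 10
n_dec = decade.max() + 1
exog_vc = (decade[:, None] == np.arange(n_dec)[None, :]).astype(float)  # indicator per decade
ident = np.zeros(n_dec, dtype=int)  # all columns share one variance parameter

pois_mm = PoissonBayesMixedGLM(data, yb, exog_vc, ident)
print(pois_mm.fit_vb().summary())  # variational Bayes fit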
def mixed_linear_modeling(df, x='bin', bic_diff=10, df_sims=None, colors=None):
    fig = plt.figure(figsize=(1.1 * len(df['variable'].unique()), 1.5))
    plt_nr = 1
    for param in df['variable'].unique():
        data = df.loc[df['variable'] == param, :]
        ax = fig.add_subplot(1, len(df['variable'].unique()), plt_nr)
        kwargs = {'linewidths': 0, 'markeredgewidth': 0.5,
                  'markeredgecolor': 'black', 'ecolor': 'black'}
        if ('level' in data.columns) & ~(x == 'level'):
            sns.pointplot(x=x, y='value', hue='level', units='subj_idx',
                          join=False, ci=66, scale=0.50, errwidth=1,
                          palette='Greys', data=data, zorder=1, **kwargs)
        else:
            sns.pointplot(x=x, y='value', units='subj_idx', join=False,
                          ci=66, scale=0.66, errwidth=1, color='grey',
                          data=data, zorder=1, **kwargs)
        if param == 'rt':
            plt.ylim(data['value'].mean() - 0.1, data['value'].mean() + 0.1)

        if len(data[x].unique()) > 2:
            # variables:
            data['intercept'] = 1
            data.loc[:, '{}_^2'.format(x)] = np.array(data.loc[:, x] ** 2)
            endog = data.loc[:, 'value'].astype(float)
            if ('level' in data.columns) & ~(x == 'level'):
                exog1 = data.loc[:, ['intercept', 'level', x]].astype(float)
                exog2 = data.loc[:, ['intercept', 'level', x,
                                     '{}_^2'.format(x)]].astype(float)
            else:
                exog1 = data.loc[:, ['intercept', x]].astype(float)
                exog2 = data.loc[:, ['intercept', x,
                                     '{}_^2'.format(x)]].astype(float)

            # model comparison: linear vs. quadratic effect of x, with random
            # slopes if both random-effects fits converge, otherwise with
            # random intercepts only
            try:
                md1 = sm.MixedLM(endog, exog1, data.loc[:, 'subj_idx'], exog_re=exog1)
                mdf1 = md1.fit(reml=False)
                md2 = sm.MixedLM(endog, exog2, data.loc[:, 'subj_idx'], exog_re=exog2)
                mdf2 = md2.fit(reml=False)
                if mdf1.converged & mdf2.converged:
                    random = True
                else:
                    md1 = sm.MixedLM(endog, exog1, data.loc[:, 'subj_idx'])
                    mdf1 = md1.fit(reml=False)
                    md2 = sm.MixedLM(endog, exog2, data.loc[:, 'subj_idx'])
                    mdf2 = md2.fit(reml=False)
                    random = False
                if (mdf1.bic - mdf2.bic) > bic_diff:
                    exog = exog2.copy()
                else:
                    exog = exog1.copy()
                # refit with REML:
                if random:
                    mdf = sm.MixedLM(endog, exog, groups=data.loc[:, 'subj_idx'],
                                     exog_re=exog).fit()
                else:
                    mdf = sm.MixedLM(endog, exog, groups=data.loc[:, 'subj_idx']).fit()
                print(mdf.summary())

                xx = np.sort(np.array([p.get_data()[0][0] for p in ax.lines]))
                if ('level' in data.columns) & ~(x == 'level'):
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = np.concatenate([
                            mdf.params['intercept']
                            + (np.array(exog.groupby('level').mean().index) * mdf.params['level'])
                            + (b * mdf.params[x])
                            + ((b ** 2) * mdf.params['{}_^2'.format(x)])
                            for b in np.array(exog.groupby(x).mean().index)])
                        plt.title('p = {}\np1 = {}\np2 = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)), size=6)
                    else:
                        yy = np.concatenate([
                            mdf.params['intercept']
                            + (np.array(exog.groupby('level').mean().index) * mdf.params['level'])
                            + (b * mdf.params[x])
                            for b in np.array(exog.groupby(x).mean().index)])
                        plt.title('p = {}\np = {}'.format(
                            round(mdf.pvalues['level'], 3),
                            round(mdf.pvalues[x], 3)), size=6)
                    step = len(exog.groupby('level').mean().index)
                    for v in exog.groupby('level').mean().index:
                        plt.plot(xx[int(v)::step], yy[int(v)::step],
                                 lw=1, color='black')
                else:
                    if (mdf1.bic - mdf2.bic) > bic_diff:
                        yy = (mdf.params['intercept']
                              + (np.array(exog.groupby(x).mean().index) * mdf.params[x])
                              + ((np.array(exog.groupby(x).mean().index) ** 2)
                                 * mdf.params['{}_^2'.format(x)]))
                        plt.title('p1 = {}\np2 = {}'.format(
                            round(mdf.pvalues[x], 3),
                            round(mdf.pvalues['{}_^2'.format(x)], 3)), size=6)
                    else:
                        yy = (mdf.params['intercept']
                              + (np.array(exog.groupby(x).mean().index) * mdf.params[x]))
                        plt.title('p = {}'.format(round(mdf.pvalues[x], 3)), size=6)
                    plt.plot(xx, yy, lw=1, color='black')
            except Exception:
                pass
        else:
            t, p = sp.stats.ttest_rel(data.loc[data[x] == 0, 'value'],
                                      data.loc[data[x] == 1, 'value'])
            plt.title('p = {}'.format(round(p, 3)), size=6)

        if df_sims is not None:
            if ('level' in data.columns) & ~(x == 'level'):
                for df_sim, color in zip(df_sims, colors):
                    sns.pointplot(x=x, y='value', hue='level',
                                  palette=['blue' for _ in range(len(data['level'].unique()))],
                                  join=False, ci=None, markers='x', scale=0.66,
                                  data=df_sim.loc[df['variable'] == param, :],
                                  zorder=100)
            else:
                for df_sim, color in zip(df_sims, colors):
                    sns.pointplot(x=x, y='value', color='blue', join=False,
                                  ci=None, markers='x', scale=0.66,
                                  data=df_sim.loc[df['variable'] == param, :],
                                  zorder=100)
        try:
            plt.gca().get_legend().remove()
        except Exception:
            pass
        plt.xticks(ax.get_xticks(), list(np.array(ax.get_xticks(), dtype=int)))
        plt.ylabel(param)
        plt_nr += 1
    sns.despine(offset=2, trim=True)
    plt.tight_layout()
    return fig
# Fit the mixed regression model.
# cf. http://statsmodels.sourceforge.net/devel/mixed_linear.html
# An alternative specification with a random time slope was also considered:
# md = sm.MixedLM(y_train, X_train, exog_re=X_train_fake_time,
#                 groups=X_train_fake_song_ids, use_sqrt=True)
md = sm.MixedLM(y_train, X_train, groups=X_train_fake_song_ids, use_sqrt=True)
mdf = md.fit()
print(mdf.summary())

# predict each song separately and append the predictions
pred = list()
for ind_song in range(nb_test_song):
    deb = ind_song * NUM_FRAMES
    fin = deb + NUM_FRAMES
    pred_song = mdf.predict(X_test[deb:fin, :])
    pred.append(pred_song)
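# A hypothetical evaluation step (an assumption, not from the original
# source): concatenate the per-song predictions and score them against the
# held-out targets, assuming y_test aligns frame-wise with the predictions.
import numpy as np

pred_all = np.concatenate(pred)
rmse = np.sqrt(np.mean((pred_all - y_test) ** 2))
print("test RMSE:", rmse)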