def main_function(args):
    """
    Function to calculate BIC or AIC of model

    Input: a single 5-element tuple
        (individ, X_df, df, objective_str, trgt) holding
        individ       - individual defining the model parameters to be
                        used; a 0 at position i excludes the i-th column
                        of X_df from the model,
        X_df          - dataframe with independent variables,
        df            - dataframe with all data,
        objective_str - metric to be used ('bic' or 'aic'),
        trgt          - dependent variable string
    Output: tuple (objective value, individ). The objective value is NaN
        when objective_str is neither 'aic' nor 'bic'. The individual is
        returned unchanged.

    The arguments arrive packed in one tuple (the original signature
    relied on tuple-parameter unpacking, which is a syntax error on
    Python 3), so they are unpacked explicitly below.
    """
    individ, X_df, df, objective_str, trgt = args

    #----------------------------------------------------------------------
    # Import necessary modules
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP

    #----------------------------------------------------------------------
    # Keep only the variables that are part of the individual's genome
    vars_lst = list(X_df.columns)  # Get list of the available variables
    # Positions whose gene is 0 are dropped; np.asarray makes the
    # comparison elementwise even when the individual is a plain list
    dropped = set(np.where(np.asarray(individ) == 0)[0].tolist())
    vars_lst2 = [name for pos, name in enumerate(vars_lst)
                 if pos not in dropped]

    #----------------------------------------------------------------------
    # Fit model in R
    # Create formula to be used in R
    formula = 'as.ordered(' + trgt + ') ~ ' + "+".join(vars_lst2)

    # Transform Pandas dataframe to R
    rdf = pandas2ri.py2ri(df)

    # Define R function as string
    string = """
    mdl_func <- function(formula,df) {
        library(VGAM)
        mdl1=vglm(formula,family=propodds, data=df)
        ll=logLik(mdl1)
        return(ll)
    }
    """
    ord_ll = STAP(string, "ord_ll")

    # Calculate AIC and BIC based on LogLikelihood (ll_)
    try:
        ll_ = ord_ll.mdl_func(formula, rdf)
        ll_ = ll_[0]
    # In case LogLikelihood calculation fails, fall back to a fixed
    # log-likelihood of -1000.0. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    except Exception:
        ll_ = -1000.0

    # k counts the selected variables only
    k = float(len(vars_lst2))
    n = float(len(df))
    aic_ = (2.0 * k) - (2 * ll_)
    bic_ = (np.log(n) * k) - (2 * ll_)

    # Return AIC or BIC depending on used choice
    if objective_str == 'aic':
        obj_ = aic_
    elif objective_str == 'bic':
        obj_ = bic_
    else:
        obj_ = np.nan

    # Return optimisation metric
    return obj_, individ
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Function to fit final model and extract modelling statistics

    Input: model variables as a list, dataframe holding all the data,
        dependent variable, confidence level for reporting statistics
        i.e. 0.95 for 95%
    Output: dataframe with model coefficients and statistics. It is indexed
        by the coefficient names returned from R and holds the columns
        returned by R (except 'z value' and 'Pr(>|z|)', which are dropped)
        plus 'Low <level>%', 'High <level>%', 'P Value' and 't Value'.

    Confidence limits and p-values are computed here from a t-distribution
    with (number of rows in df - number of fitted coefficients) degrees of
    freedom.
    """
    #----------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats

    #----------------------------------------------------------------------
    # Fit R model
    # Set R function as string to fit model and return results
    string_ord_mdl = """
    mdl_func <- function(formula,df) {
        library(VGAM)
        mdl1=vglm(formula,family=propodds, data=df)
        ll=logLik(mdl1)
        coefficients_df=coef(summary(mdl1))
        coefficient_cols=colnames(coefficients_df)
        coefficient_rows=rownames(coefficients_df)
        output<-list(ll,coefficients_df,coefficient_cols,coefficient_rows)
        return(output)
    }
    """

    # Transform pandas dataframe to R format
    rdf = pandas2ri.py2ri(df)

    # Set R formula as string using the model parameters and dependent
    # variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)

    # Define R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")

    # Fit model; output_R is [logLik, coefficient table, column names,
    # row names] as assembled by the R function above
    output_R = ord_ll.mdl_func(formula, rdf)

    # Extract data and place them in Pandas dataframe
    coeff_df = pandas2ri.ri2py_dataframe(output_R[1])
    coeff_df.columns = list(output_R[2])
    coeff_df.index = list(output_R[3])

    #----------------------------------------------------------------------
    # Calculate statistics
    # Number of parameters
    n_vars = len(coeff_df)

    # Degrees for freedom for t-distribution
    deg_free = len(df) - n_vars

    # Calculate alpha value from confidence interval
    alpha_ = 1.0 - ci_level

    # t_critical statistic for desired confidence interval; it does not
    # depend on the coefficient, so it is computed once
    t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)

    # Coefficient values and standard errors from the R model fit data
    estimates = np.asarray(coeff_df['Estimate'], dtype=float)
    std_errors = np.asarray(coeff_df['Std. Error'], dtype=float)

    # Low - high confidence interval limits
    margin = t_critical * std_errors
    low_arr = estimates - margin
    high_arr = estimates + margin

    # t statistic calculation to get p-value
    t_value_arr = estimates / std_errors

    # Two-sided p-value. The survival function is used instead of
    # 1 - cdf so that very small p-values are not rounded to exactly 0
    p_val_arr = 2.0 * stats.t.sf(np.abs(t_value_arr), deg_free)

    # Set arrays to dataframe columns
    level_label = str((1.0 - alpha_) * 100) + '%'
    coeff_df['Low ' + level_label] = low_arr
    coeff_df['High ' + level_label] = high_arr
    coeff_df['P Value'] = p_val_arr
    coeff_df['t Value'] = t_value_arr

    # Delete statistics of R model fit referring to normal distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)

    # Return dataframe with model fit coefficients and statistics
    return coeff_df