Ejemplo n.º 1
0
def main_function(args):
    """
    Function to calculate BIC or AIC of model
    Input: a single tuple (individ, X_df, df, objective_str, trgt) holding
    the individual defining model parameters to be used (a 0 entry switches
    the variable at that position off), dataframe with independent
    variables, dataframe with all data, metric to be used ('bic' or 'aic'),
    dependent variable string
    Output: tuple (objective value, individ). The objective value is NaN
    when objective_str is neither 'aic' nor 'bic'; the individual is
    returned unchanged.
    """
    # Tuple parameters in the signature are a SyntaxError on Python 3
    # (PEP 3113), so the single tuple argument is unpacked here instead.
    # Callers keep passing one tuple exactly as before.
    individ, X_df, df, objective_str, trgt = args
    #----------------------------------------------------------------------
    # Import necessary modules
    import numpy as np
    # An unknown metric yields NaN whatever the fit returns, so skip the
    # expensive R model fit altogether
    if objective_str not in ('aic', 'bic'):
        return np.nan, individ
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    #----------------------------------------------------------------------
    # Remove variables from dataframe that are not part of the individual's genome
    # np.asarray makes the comparison element-wise for list-like genomes too
    # (a plain list compared with 0 is just False, which removed nothing)
    dropped = set(np.where(np.asarray(individ) == 0)[0].tolist())
    vars_lst2 = [var for pos, var in enumerate(X_df.columns)
                 if pos not in dropped]
    #----------------------------------------------------------------------
    # Fit model in R
    # Create formula to be used in R
    formula = 'as.ordered(' + trgt + ') ~ ' + "+".join(vars_lst2)
    # Transform Pandas dataframe to R
    # NOTE(review): py2ri is the rpy2 2.x name; newer rpy2 releases appear to
    # have renamed it - confirm against the installed rpy2 version
    rdf = pandas2ri.py2ri(df)
    # Define R function as string
    string = """
    mdl_func <- function(formula,df) {
            library(VGAM)
            mdl1=vglm(formula,family=propodds, data=df)  
            ll=logLik(mdl1)    	
        return(ll)
    }
    """
    ord_ll = STAP(string, "ord_ll")
    # Get LogLikelihood (ll_) of the fitted model
    try:
        ll_ = ord_ll.mdl_func(formula, rdf)[0]
    # In case LogLikelihood calculation fails, use a very poor value so the
    # individual is penalised. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    except Exception:
        ll_ = -1000.0
    k = float(len(vars_lst2))
    n = float(len(df))
    # Calculate AIC or BIC depending on used choice
    if objective_str == 'aic':
        obj_ = (2.0 * k) - (2 * ll_)
    else:
        obj_ = (np.log(n) * k) - (2 * ll_)
    # Return optimisation metric
    return obj_, individ
Ejemplo n.º 2
0
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Function to fit final model and extract modelling statistics
    Input: model variables as a list, dataframe holding all the data, 
    dependent variable, confidence level for reporting statistics i.e. 0.95 for 95% 
    Output: dataframe with model coefficients and statistics. It has one row
    per coefficient reported by R; R's 'z value' and 'Pr(>|z|)' columns are
    replaced by 'Low <level>%', 'High <level>%', 'P Value' and 't Value'
    columns computed from a t-distribution with
    len(df) - (number of coefficients) degrees of freedom.
    """
    #----------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats
    #----------------------------------------------------------------------
    # Fit R model
    # Set R function as string to fit model and return results
    string_ord_mdl = """
    mdl_func <- function(formula,df) {
    	library(VGAM)
    	mdl1=vglm(formula,family=propodds, data=df)
    
    	ll=logLik(mdl1)
        coefficients_df=coef(summary(mdl1))
        coefficient_cols=colnames(coefficients_df)
        coefficient_rows=rownames(coefficients_df)
    	output<-list(ll,coefficients_df,coefficient_cols,coefficient_rows)
        return(output)
    }
        """
    # Transform pandas dataframe to R format
    # NOTE(review): py2ri / ri2py_dataframe are rpy2 2.x names - confirm
    # against the installed rpy2 version
    rdf = pandas2ri.py2ri(df)
    # Set R formula as string using the model parameters and dependent variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)
    # Define R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")
    # Fit model
    output_R = ord_ll.mdl_func(formula, rdf)
    # Extract data and place them in Pandas dataframe
    # (output_R holds: log-likelihood, coefficient table, column names, row names)
    coeff_df = pandas2ri.ri2py_dataframe(output_R[1])
    coeff_df.columns = list(output_R[2])
    coeff_df.index = list(output_R[3])
    #----------------------------------------------------------------------
    # Calculate statistics
    # Degrees for freedom for t-distribution: observations minus parameters
    deg_free = len(df) - len(coeff_df)
    # Calculate alpha value from confidence interval
    alpha_ = 1.0 - ci_level
    # t_critical is the same for every coefficient, so compute it only once
    t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)
    # Coefficient values and their standard errors from R model fit data
    estimates = np.asarray(coeff_df['Estimate'], dtype=float)
    std_errors = np.asarray(coeff_df['Std. Error'], dtype=float)
    # Half-width of the confidence interval and t statistic per coefficient
    half_width = t_critical * std_errors
    t_values = estimates / std_errors
    # Set arrays to dataframe columns
    level_label = str((1.0 - alpha_) * 100) + '%'
    coeff_df['Low ' + level_label] = estimates - half_width
    coeff_df['High ' + level_label] = estimates + half_width
    # Two-sided p-value; the survival function is used instead of 1 - cdf
    # so that very small p-values do not round to exactly 0
    coeff_df['P Value'] = 2.0 * stats.t.sf(np.abs(t_values), deg_free)
    coeff_df['t Value'] = t_values
    # Delete statistics of R model fit referring to normal distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)
    # Return dataframe with model fit coefficients and statistics
    return coeff_df