def _calc_var(df: pyspark.sql.DataFrame, label_col: str) -> pd.DataFrame:
    r"""Calculate variance for each column that isn't the label_col.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        df where rows are observations, all columns except `label_col`
        are predictors.
    label_col : str

    Returns
    -------
    s_var_df : pd.DataFrame
        pandas dataframe where predictors are the index and the columns
        are the per-group variances.

    Raises
    ------
    UncaughtExceptions

    Notes
    -----
    """
    pred_cols = [x for x in df.columns if x != label_col]
    s_var_df = df.groupby(label_col).agg(
        {x: 'variance' for x in pred_cols}).toPandas().transpose()
    s_var_df = s_var_df.reset_index()
    # strip the "variance(...)" wrapper Spark puts around aggregated column names
    s_var_df['index'] = s_var_df['index'].str.replace(
        r'\)', '', regex=True).str.replace(r'variance\(', '', regex=True)
    s_var_df = s_var_df.set_index('index')
    s_var_df.columns = ["var_{0}".format(x) for x in s_var_df.columns]
    # drop the row belonging to the label column itself, keep only predictors
    s_var_df = s_var_df.loc[s_var_df.index != label_col, :]
    return s_var_df
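
# A minimal, hypothetical usage sketch for _calc_var. The SparkSession setup,
# the toy rows, and the column names ('label', 'age', 'income') are
# illustrative assumptions and not part of this module.
def _example_calc_var_usage():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    toy_df = spark.createDataFrame(
        [(0, 1.0, 10.0), (0, 2.0, 12.0), (1, 3.0, 9.0), (1, 5.0, 11.0)],
        ["label", "age", "income"])
    var_df = _calc_var(toy_df, label_col="label")
    # index holds the predictor names ('age', 'income'); one variance column
    # per label group, e.g. 'var_0' and 'var_1' (group-to-column order is not
    # guaranteed by the groupby)
    print(var_df)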
def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""Observe impact of treatment on response variable.

    Currently the response must be binary. If the df is small enough, return
    the naive difference in response mean grouped by label. Otherwise do an
    additional regression on the response col with the label as a predictor
    and use its coefficient as a measure of its impact. Binning and
    dimensionality reduction will occur if necessary to do an effective
    regression.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    response_col : str
    prob_mod : mlc.Model
        propensity model, mostly used to keep track of features_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate` - `treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality

    Notes
    -----
    """
    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()
    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have less than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have less than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening")

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = response_list[0][
        "avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = response_list[1][
        "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f} control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference")
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")

    # choose fewer features if appropriate to prevent overfit; round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictors {np:,} is less than existing {ep:,}, "
            "reducing dimensionality".format(np=num_preds, ep=len(pred_cols)))
        kwargs = {'df': df, 'label_col': label_col,
                  'binned_features_col': features_col, 'ncols': num_preds}
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    # regress the response on the predictors plus the treatment label
    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r, outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)

    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))
    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias adjusted response is {ar:.2f}".format(ar=adjusted_response))

    return treatment_rate, control_rate, adjusted_response
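
# A minimal, hypothetical usage sketch for impact(). It assumes a matched
# DataFrame with numeric predictor columns 'age' and 'income', a binary
# treatment column 'label', and a binary outcome column 'converted'; the
# propensity model is a plain LogisticRegression fit on those predictors.
# All names here are illustrative assumptions, not part of this module.
def _example_impact_usage(matched_df: pyspark.sql.DataFrame
                          ) -> Tuple[float, float, float]:
    assembler = mlf.VectorAssembler(inputCols=["age", "income"],
                                    outputCol="features")
    assembled_df = assembler.transform(matched_df)
    # fit a propensity model; impact() only reads its labelCol/featuresCol
    prop_model = mlc.LogisticRegression(
        featuresCol="features", labelCol="label").fit(assembled_df)
    treatment_rate, control_rate, adjusted = impact(
        assembled_df, response_col="converted", prob_mod=prop_model)
    logging.getLogger(__name__).info(
        "naive diff {nd:.3f}, adjusted {ad:.3f}".format(
            nd=control_rate - treatment_rate, ad=adjusted))
    return treatment_rate, control_rate, adjusted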