def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe the impact of treatment on the response variable

    currently the response must be binary. if the df is small enough, return
    the naive difference in response means grouped by label. otherwise run an
    additional regression on the response col with the label as a predictor
    and use its coefficient as the measure of impact. binning and
    dimensionality reduction will occur if necessary to do an effective
    regression.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    response_col : str
    prob_mod : mlc.Model
        propensity model, mostly used to keep track of feature_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate` - `treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when the number of rows is less than `MINIMUM_POS_COUNT` * 2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality

    Notes
    -----
    """
    _persist_if_unpersisted(df)
    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()
    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "have less than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have less than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening")

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = response_list[0][
        "avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = response_list[1][
        "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f} control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference")
        return treatment_rate, control_rate, (control_rate - treatment_rate)
    logging.getLogger(__name__).info("additional bias adjustment possible")

    # choose fewer features if appropriate to prevent overfit. round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictors {np:,} is less than existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    # regress the response on the predictors plus the treatment label; the
    # label coefficient measures the treatment's effect on the response
    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r, outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)

    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))
    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias adjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
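

# Illustrative sketch (not part of the original module): the bias adjustment in
# `impact` reduces to adjusted_response = control_rate * (1 - exp(beta_label)),
# where beta_label is the fitted logistic-regression coefficient on the label
# column. A tiny standalone check of that arithmetic; the default numbers are
# made up purely for demonstration.
def _example_bias_adjustment(control_rate=0.10, label_coefficient=-0.25):
    # exp(-0.25) ~= 0.779, so the adjusted response is ~0.10 * 0.221 ~= 0.022
    return control_rate * (1 - math.exp(label_coefficient))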
def _transform(df: DataFrame,
               prob_mod: mlc.Model,
               method: Optional[str],
               metric: Optional[str],
               match_kwargs: Optional[dict] = None) -> Tuple[DataFrame, dict]:
    r"""execute one propensity match transform

    based on the input args, grab the match col and run it through the chosen
    algorithm to produce matched populations.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Dataframe with the population in question. Must have the featuresCol
        and labelCol used in `prob_mod`
    prob_mod : mlc.Model
        the model predicting the probability that the row is in class 1 of
        the label col.
    method : {'auto', 'quantile', 'assignment'}
        how matching occurs. 'auto' selects according to the number of rows
        specified in config as `SMALL_MATCH_THRESHOLD`.
        'quantile' does stratified sampling on predicted probability. It
        guarantees similar population sizes and may drop some treatments
        non-symmetrically in order to fulfill that guarantee. match_info
        contains 'scale', the proportion of treatment users that were used,
        and 'dropped', the proportion of the sample dropped asymmetrically.
        The algorithm tries to balance sample size against bias when
        deciding scale and dropped.
    metric : {'probability'}
        the quantity being matched. Currently only predicted probability is
        supported, but more may be added in the future
    match_kwargs : dict, optional
        additional kwargs for the match algorithm.

    Returns
    -------
    df : pyspark.sql.DataFrame
        df with only the matched populations (so don't overwrite your parent
        dataframe if you need it!)
    match_info : dict
        information about that particular match, depending on the algorithm
        chosen.

    Raises
    ------
    UncaughtExceptions
    NotImplementedError
        illegal values for `method` or `metric`.

    See Also
    --------
    _get_metric
    _match

    Notes
    -----
    """
    # interpret input args: only support quantile or assignment matching right now
    if match_kwargs is None:
        match_kwargs = {}

    logging.getLogger(__name__).info(
        "method is {method}".format(method=str(method)))
    if method is None:
        method = 'auto'
        logging.getLogger(__name__).info("assigning default arg 'auto'")
    elif method not in ['assignment', 'quantile', 'auto']:
        logging.getLogger(__name__).critical("invalid method argument")
        raise NotImplementedError(
            "method {method} not implemented".format(method=method))

    if method == 'auto':
        label_col = prob_mod.getOrDefault('labelCol')
        _persist_if_unpersisted(df)
        pos_count = df.where(F.col(label_col) == 1).count()
        neg_count = df.where(F.col(label_col) == 0).count()
        if ((pos_count**2) * neg_count) <= SMALL_MATCH_THRESHOLD:
            method = 'assignment'
            logging.getLogger(__name__).info("auto method is assignment")
        else:
            method = 'quantile'
            logging.getLogger(__name__).info("auto method is quantile")

    logging.getLogger(__name__).info(
        "metric is {metric}".format(metric=str(metric)))
    if metric is None:
        metric = 'probability'
        logging.getLogger(__name__).info(
            "assigning default metric 'probability'")
    elif metric not in ['probability']:
        logging.getLogger(__name__).critical("invalid metric argument")
        raise NotImplementedError(
            "metric {metric} not implemented".format(metric=metric))

    # step 1: calculate match metric
    df, metric_col = _get_metric(df, prob_mod, metric)
    # step 2: match
    df, match_info = _match(df, prob_mod, method, metric_col, match_kwargs)
    return df, match_info
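

# Illustrative sketch (not part of the original module): one way `_transform` might
# be chained with `impact`. `scored_df`, `propensity_model`, and the 'converted'
# response column are hypothetical placeholders supplied by the caller; `scored_df`
# is assumed to contain the featuresCol and labelCol that `propensity_model` was
# fit with.
def _example_transform_usage(scored_df, propensity_model):
    # match the populations first, then measure impact on the matched df
    matched_df, match_info = _transform(
        df=scored_df, prob_mod=propensity_model, method='auto', metric='probability')
    logging.getLogger(__name__).info("match info: {mi}".format(mi=match_info))
    return impact(df=matched_df, response_col='converted', prob_mod=propensity_model)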
def _assignment_match(df: DataFrame, prob_mod: mlc.Model,
                      metric_col: str) -> Tuple[DataFrame, dict]:
    r"""match treatments to controls 1:1

    Use the Hungarian/Munkres algorithm on `metric_col` (typically
    probability) to find, for the treatments, the controls with the least
    cost - the distance between a treatment's metric and its control's metric.

    Parameters
    ----------
    df : DataFrame
        dataframe in question, must have the input columns specified by prob_mod
    prob_mod : mlc.Model
        propensity predicting model, used here mostly to grab the
        label/feature columns. metric col should have been constructed by
        another method prior
    metric_col : str
        the column of values to be matched on

    Returns
    -------
    df
        new dataframe with just the matched population
    match_info : dict
        dict of potentially handy metrics from the matching process
            scaled : proportion of treatments used in matching
            init_t_mean : mean metric val of treatment candidates
            init_c_can_mean : mean metric val of control candidates
            init_t_count : number of treatments in input
            init_c_can_count : number of control candidates
            adj_t_count : number of treatments after adjusting the population
                size to accommodate the difference in probability distribution
                between treatment and control
            total_cost : total cost of the assignment
            average_cost : average cost per matched pair
            dropped : fraction of treatments dropped

    Raises
    ------
    ValueError
        when the treatment population is too small and/or unbalanced to
        produce a good match
    UncaughtExceptions

    See Also
    --------
    _adjust_balance
    _make_cost_matrix
    _execute_assignment_match
    _get_assigned_rows

    Notes
    -----
    In order to produce a good match when the probability distributions are
    significantly different (i.e. treatment is significantly right shifted),
    the control candidate pop needs to be much greater. _adjust_balance
    achieves this by taking at most
    num_control_candidates / (treatment_mean / control_can_mean) treatments,
    which progressively shrinks the treatment pop as the right shift becomes
    greater. However, this creates the danger of being left with a very small
    treatment population from which conclusions shouldn't be drawn.
    Additionally, this method was devised here rather than taken from a peer
    reviewed paper, a point which merits further investigation before it is
    considered canon.
    """
    label_col = prob_mod.getOrDefault('labelCol')
    t_df = df.where(F.col(label_col) == 1)
    c_can_df = df.where(F.col(label_col) == 0)

    t_adjusted_df, c_can_adjusted_df, match_info = _adjust_balance(
        t_df, c_can_df, metric_col)

    t_vals = t_adjusted_df.select(metric_col).toPandas()
    c_can_vals = c_can_adjusted_df.select(metric_col).toPandas()
    cost_matrix = _make_cost_matrix(t_vals=t_vals, c_can_vals=c_can_vals)
    c_ind, t_ind, total_cost, average_cost = _execute_assignment_match(cost_matrix)
    match_info['total_cost'] = total_cost
    match_info['average_cost'] = average_cost

    df = _get_assigned_rows(t_ind=t_ind,
                            t_df=t_adjusted_df,
                            c_ind=c_ind,
                            c_can_df=c_can_adjusted_df)
    logging.getLogger(__name__).info(
        "matched df size is {n:,}".format(n=df.count()))
    match_info['dropped'] = 0
    return df, match_info
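

# Illustrative sketch (not part of the original module): the Hungarian/Munkres step
# described above, demonstrated on a tiny in-memory example with
# scipy.optimize.linear_sum_assignment. The cost here is the absolute distance
# between a treatment's metric and a control candidate's metric; that cost
# definition is an assumption made only for illustration, not a claim about what
# `_make_cost_matrix` / `_execute_assignment_match` do internally.
def _example_hungarian_match(t_metrics=(0.62, 0.35), c_can_metrics=(0.30, 0.60, 0.70)):
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    # rows are treatments, columns are control candidates
    cost_matrix = np.abs(
        np.asarray(t_metrics)[:, None] - np.asarray(c_can_metrics)[None, :])
    t_ind, c_ind = linear_sum_assignment(cost_matrix)
    total_cost = cost_matrix[t_ind, c_ind].sum()
    # each treatment index is paired with the control candidate index that
    # minimizes the summed distance across all pairs
    return list(zip(t_ind.tolist(), c_ind.tolist())), float(total_cost)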
def _quantile_match(df: DataFrame,
                    prob_mod: mlc.Model,
                    metric_col: str,
                    ntile: int = 10,
                    quantile_error_scale: int = 5,
                    sample_size: int = 10**5) -> Tuple[DataFrame, dict]:
    r"""match by stratified sampling on probability bins, guaranteeing similar populations

    may scale the treatment curve down and drop treatments unevenly to uphold
    that guarantee

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_mod : pyspark.ml.classification.Model
    metric_col : str
        name of the col to be matched
    ntile : int
        how many buckets to make out of the metric col before stratified
        sampling.
        defaults to 10
    quantile_error_scale : Union[int, float]
        error tolerance for calculating the boundaries of the ntiles. the
        relativeError passed to approxQuantile is calculated as
        1/ntile/quantile_error_scale; in other words, 1/quantile_error_scale
        is how much error is acceptable as a fraction of the bin size. be
        cognizant of ntile and this value, as passing a small relativeError
        can increase compute time dramatically.
        defaults to 5
    sample_size : Optional[int]
        size of the sample used to calculate quantile bin boundaries. no
        sampling if None (not recommended).
        defaults to 10**5

    Returns
    -------
    df : pyspark.sql.DataFrame
        df with only the matched populations
    match_info : dict
        contains 'scale' and 'dropped'. scale describes what proportion of
        the treatment group was used; dropped describes what proportion of
        the treatment group, after scaling, was dropped due to inadequate
        control candidates

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    _make_quantile_match_col
    _execute_quantile_match

    Notes
    -----
    """
    logging.getLogger(__name__).info(
        "starting _quantile_match with args ntile={ntile}, quantile_error_scale={qes}, "
        "sample_size={sn}".format(ntile=ntile, qes=quantile_error_scale, sn=sample_size))
    label_col = prob_mod.getOrDefault('labelCol')
    df, match_col = _make_quantile_match_col(df, metric_col, label_col, ntile,
                                             quantile_error_scale, sample_size)
    df, match_info = _execute_quantile_match(df, match_col, label_col)
    match_info['type'] = 'quantile'
    return df, match_info
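

# Illustrative sketch (not part of the original module): how the quantile bin
# boundaries described in the docstring could be computed with
# DataFrame.approxQuantile, using relativeError = 1 / ntile / quantile_error_scale
# as stated above. This is an assumption-laden illustration of the bucketing idea,
# not a claim about what `_make_quantile_match_col` does internally.
def _example_quantile_boundaries(df, metric_col, ntile=10, quantile_error_scale=5):
    # interior cut points, e.g. 0.1, 0.2, ..., 0.9 for ntile=10
    probabilities = [i / ntile for i in range(1, ntile)]
    relative_error = 1.0 / ntile / quantile_error_scale
    return df.approxQuantile(metric_col, probabilities, relative_error)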