Example 1

# Assumed module-level imports; the constants MINIMUM_POS_COUNT, NAIVE_THRESHOLD_COUNT,
# SAMPLES_PER_FEATURE and SMALL_MATCH_THRESHOLD, plus helpers such as
# _persist_if_unpersisted, _get_pred_cols, reduce_dimensionality, _get_metric, _match,
# _adjust_balance, etc., are defined elsewhere in the source module.
import logging
import math
from typing import Optional, Tuple

import pyspark
import pyspark.ml.classification as mlc
import pyspark.ml.feature as mlf
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe the impact of treatment on a response variable

    currently the response must be binary.
    if the df is small enough, return the naive difference in mean response
    grouped by label. otherwise, run an additional regression on the response
    col with the label as a predictor and use its coefficient as a measure
    of its impact. binning and dimensionality reduction will occur if
    necessary to do an effective regression

    Parameters
    ----------
    df: pyspark.sql.DataFrame
    response_col: str
    prob_mod: mlc.Model
        propensity model, mostly used to keep track of feature_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate`-`treatment_rate` or may have a further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality

    Notes
    -----

    """

    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()

    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have fewer than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have fewer than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    for row in response_list:
        naive_response_dict[row[label_col]] = row[
            "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f}   control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")
    # choose fewer features if appropriate to prevent overfit. round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictor count {np:,} is less than the existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)
    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))

    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
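    # exp(coefficient) is the odds ratio the adjusted model attributes to the treatment
    # label; control_rate * (1 - exp(coef)) approximates control_rate - treatment_rate
    # with the other predictors held fixed (the odds ratio is treated as a rate ratio),
    # so the sign convention matches the naive difference returned above.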
    logging.getLogger(__name__).info(
        "bias adjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
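
# Hypothetical usage sketch (column names and the data source are assumptions; `impact`
# also relies on module-level constants such as MINIMUM_POS_COUNT defined elsewhere):
#
#   assembler = mlf.VectorAssembler(inputCols=["age", "tenure"], outputCol="features")
#   users = assembler.transform(spark.read.parquet("users.parquet"))
#   prob_mod = mlc.LogisticRegression(featuresCol="features", labelCol="treated").fit(users)
#   treatment_rate, control_rate, adjusted = impact(users, "converted", prob_mod)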


def _transform(df: DataFrame,
               prob_mod: mlc.Model,
               method: Optional[str],
               metric: Optional[str],
               match_kwargs: Optional[dict] = None) -> Tuple[DataFrame, dict]:
    r""" execute one propensity match transform

    based on the input args, grab the match col and run it through the chosen
    algorithm to produce matched populations.


    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Dataframe with population in question. Must have featureCol and
        labelCol used in `prob_mod`
    prob_mod : mlc.Model
        the model predicting the probability that the row is in class 1
        in the label col.
    method : {'auto', 'quantile', 'assignment'}
        how matching occurs. auto will select based on the population size
        relative to `SMALL_MATCH_THRESHOLD` specified in config

        Assignment does 1:1 matching via the Hungarian/Munkres algorithm
        (see `_assignment_match`).

        Quantile does stratified sampling on predicted probability.
        It guarantees similar population sizes and may drop some treatments
        non-symmetrically in order to fulfill that guarantee. match_info
        contains 'scale', what proportion of treatment users were used, and
        'dropped', the proportion of the sample dropped asymmetrically. The
        algorithm tries to maintain a balance between sample size and
        bias in deciding scale and dropped
    metric : {'probability'}
        the quantity being matched on. currently only predicted probability
        is supported, but more may be added in the future
    match_kwargs : dict, optional
        additional kwargs for match algorithm.


    Returns
    -------
    df : pyspark.sql.DataFrame
        df with only the matched populations (so don't overwrite your parent
        dataframe if you need it!)
    match_info : dict
        information about that particular match depending on the algorithm
        chosen.

    Raises
    ------
    UncaughtExceptions
    NotImplementedError
        when `method` or `metric` has a value that is not implemented.

    See Also
    --------
    _get_metric
    _match

    Notes
    -----

    """

    # interpret input args:
    # only support quantile or assignment matching right now
    if match_kwargs is None:
        match_kwargs = {}

    logging.getLogger(__name__).info(
        "method is {method}".format(method=str(method)))

    if method is None:
        method = 'auto'
        logging.getLogger(__name__).info("assigning default arg 'auto'")
    elif method not in ['assignment', 'quantile', 'auto']:
        logging.getLogger(__name__).critical("invalid method argument")
        raise NotImplementedError(
            "method {method} not implemented".format(method=method))
    if method == 'auto':
        label_col = prob_mod.getOrDefault('labelCol')

        _persist_if_unpersisted(df)
        pos_count = df.where(F.col(label_col) == 1).count()
        neg_count = df.where(F.col(label_col) == 0).count()
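        # pos_count**2 * neg_count is a rough proxy for the cost of solving the
        # 1:1 assignment problem on a pos_count x neg_count cost matrix; only
        # pick assignment matching when that cost looks small enough.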
        if ((pos_count**2) * neg_count) <= SMALL_MATCH_THRESHOLD:
            method = 'assignment'
            logging.getLogger(__name__).info("auto method is assignment")
        else:
            method = 'quantile'
            logging.getLogger(__name__).info("auto method is quantile")

    logging.getLogger(__name__).info(
        "metric is {metric}".format(metric=str(metric)))
    if metric is None:
        metric = 'probability'
        logging.getLogger(__name__).info(
            "assigning default metric 'probability'")
    elif metric not in ['probability']:
        logging.getLogger(__name__).critical("invalid metric argument")
        raise NotImplementedError(
            "metric {metric} not implemented".format(metric=metric))

    # step 1 calculate match metric
    df, metric_col = _get_metric(df, prob_mod, metric)
    # step 2 match
    df, match_info = _match(df, prob_mod, method, metric_col, match_kwargs)

    return df, match_info
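
# Hypothetical usage sketch (`df` and `prob_mod` as described in the docstring above):
#   matched_df, match_info = _transform(df, prob_mod, method='auto', metric='probability')
# matched_df then holds only the matched rows; match_info describes the match performed.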


def _assignment_match(df: DataFrame, prob_mod: mlc.Model,
                      metric_col: str) -> Tuple[DataFrame, dict]:
    r"""match treatment to controls 1:1

    Use the Hungarian/Munkres algorithm on `metric_col` (typically probability)
    to find controls for your treatments with the least cost - the distance
    between a treatment's metric and its control's metric

    Parameters
    ----------
    df: DataFrame
        dataframe in question, must have input columns specified by
        prob_mod
    prob_mod: mlc.Model
        propensity predicting model. used here mostly to grab label/feature
        columns. metric col should have been constructed by another method
        beforehand
    metric_col: str
        the column of values to be matched on
    Returns
    -------
    df
        new dataframe with just the matched population
    match_info: dict
        dict of potentially handy metrics from the matching process
            scaled: proportion of treatments used in matching
            init_t_mean: mean treatment val of treatment candidates
            init_c_can_mean: mean metric val of control candidates
            init_t_count: number of treatments in input
            init_c_can_count: number of control candidates
            adj_t_count: number of treatments after adjusting population
                size to accommodate the difference in probability distribution
                between treatment and control.
            total_cost : total cost of the assignment, i.e. the sum of
                metric distances over matched pairs
            average_cost : total_cost divided by the number of matched pairs
            dropped : fraction of treatments dropped asymmetrically

    Raises
    ------
    ValueError
        when the treatment population is too small and/or unbalanced
        to produce a good match
    UncaughtExceptions

    See Also
    --------
    _adjust_balance
    _make_cost_matrix
    _execute_assignment_match
    _get_assigned_rows

    Notes
    -----
    In order to produce a good matching, if the probability distributions
    are significantly different (i.e. treatment is significantly right
    shifted), the control candidate pop needs to be much greater.
    _adjust_balance achieves this by taking at most
    num_control_candidates/(treatment_mean/control_can_mean) treatments. this
    will progressively make the treatment pop smaller as the right
    shift becomes greater. However, this creates the danger of being left
    with a very small treatment population from which conclusions shouldn't
    be drawn. additionally, this method was devised here rather than taken
    from a peer-reviewed paper. This is a point which merits further
    investigation before it is considered canon.

    """

    label_col = prob_mod.getOrDefault('labelCol')
    t_df = df.where(F.col(label_col) == 1)
    c_can_df = df.where(F.col(label_col) == 0)

    t_adjusted_df, c_can_adjusted_df, match_info = _adjust_balance(
        t_df, c_can_df, metric_col)
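    # worked example of the balance heuristic described in the Notes (hypothetical
    # numbers): with 50,000 control candidates, treatment_mean=0.6 and
    # control_can_mean=0.3, at most 50,000 / (0.6 / 0.3) = 25,000 treatments are kept.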

    t_vals = t_adjusted_df.select(metric_col).toPandas()
    c_can_vals = c_can_adjusted_df.select(metric_col).toPandas()

    cost_matrix = _make_cost_matrix(
        t_vals=t_vals,
        c_can_vals=c_can_vals,
    )

    c_ind, t_ind, total_cost, average_cost = _execute_assignment_match(
        cost_matrix)
    match_info['total_cost'] = total_cost
    match_info['average_cost'] = average_cost
    df = _get_assigned_rows(t_ind=t_ind,
                            t_df=t_adjusted_df,
                            c_ind=c_ind,
                            c_can_df=c_can_adjusted_df)

    logging.getLogger(__name__).info(
        "matched df size is {n:,}".format(n=df.count()))
    match_info['dropped'] = 0

    return df, match_info
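

# A minimal standalone sketch of the assignment step, assuming the cost of pairing a
# treatment with a control candidate is the absolute difference of their metric values
# and that scipy is available. The module's own _make_cost_matrix /
# _execute_assignment_match helpers are not shown; this only illustrates the
# Hungarian/Munkres idea used above.
import numpy as np
from scipy.optimize import linear_sum_assignment

example_t_vals = np.array([0.62, 0.48, 0.55])            # treatment metric values (illustrative)
example_c_can_vals = np.array([0.40, 0.50, 0.61, 0.58])  # control candidate metric values

example_cost_matrix = np.abs(example_t_vals[:, None] - example_c_can_vals[None, :])
example_t_ind, example_c_ind = linear_sum_assignment(example_cost_matrix)  # optimal 1:1 pairing
example_total_cost = example_cost_matrix[example_t_ind, example_c_ind].sum()
example_average_cost = example_total_cost / len(example_t_ind)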


def _quantile_match(df: DataFrame,
                    prob_mod: mlc.Model,
                    metric_col: str,
                    ntile: int = 10,
                    quantile_error_scale: int = 5,
                    sample_size: int = 10**5) -> Tuple[DataFrame, dict]:
    r"""match by stratified sampling on probability bins. guarantee similar
    populations.

    match by stratified sampling on probability bins. guarantees similar
    populations. may scale the treatment curve down and drop treatments
    unevenly to uphold that guarantee

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_mod : pyspark.ml.classification.Model
    metric_col : str
        name of col to be matched
    ntile : int
        how many buckets to make out of the metric col and then
        stratify sample
        defaults to 10
    quantile_error_scale: Union[int, float]
        error tolerance for calculating boundaries for ntiles
        relativeError passed to approxQuantile is calculated as
        1/ntile/quantile_error_scale
        in other words, 1/quantile_error_scale is how much error is ok
        as a fraction of the bin size
        be cognizant of ntile and this value, as passing a small
        relativeError can increase compute time dramatically
        defaults to 5
    sample_size: Optional[int]
        size of the sample used to calculate quantile bin boundaries
        no sampling occurs if None, which is not recommended
        defaults to 10**5


    Returns
    -------
    df
        df with only the matched populations
    match_info : dict
        contains scale, dropped, and type
        scale describes what proportion of the treatment group was used,
        and dropped describes what proportion of the treatment group, after
        scaling, was dropped due to inadequate control candidates

    Raises
    ------
    UncaughtExceptions

    See Also
    --------
    _make_quantile_match_col
    _execute_quantile_match

    Notes
    -----

    """
    logging.getLogger(__name__).info(
        "starting _quantile_match with args ntile={ntile}, quantile_error_scale={qes}, "
        "sample_size={sn}".format(ntile=ntile,
                                  qes=quantile_error_scale,
                                  sn=sample_size))

    label_col = prob_mod.getOrDefault('labelCol')
    df, match_col = _make_quantile_match_col(df, metric_col, label_col, ntile,
                                             quantile_error_scale, sample_size)
    df, match_info = _execute_quantile_match(df, match_col, label_col)
    match_info['type'] = 'quantile'

    return df, match_info
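

# A minimal sketch of the bin-then-stratified-sample idea behind quantile matching,
# using only public pyspark APIs. The column names ('prob', 'label'), the toy data and
# the per-bin sampling scheme are assumptions for illustration; the module's own
# _make_quantile_match_col / _execute_quantile_match helpers are not reproduced here.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
example_df = spark.createDataFrame(
    [(float(i % 100) / 100, int(i % 4 == 0)) for i in range(1000)],
    ["prob", "label"])

ntile, quantile_error_scale = 10, 5
relative_error = 1.0 / ntile / quantile_error_scale
splits = sorted(set(
    [float("-inf")]
    + example_df.approxQuantile("prob", [i / ntile for i in range(1, ntile)], relative_error)
    + [float("inf")]))
binned = Bucketizer(splits=splits, inputCol="prob", outputCol="prob_bin").transform(example_df)

# within each bin keep all treatments and sample controls down to roughly the treatment count
t_counts = {r["prob_bin"]: r["count"]
            for r in binned.where(F.col("label") == 1).groupBy("prob_bin").count().collect()}
c_counts = {r["prob_bin"]: r["count"]
            for r in binned.where(F.col("label") == 0).groupBy("prob_bin").count().collect()}
fractions = {b: min(1.0, float(t_counts.get(b, 0)) / c) for b, c in c_counts.items()}
matched = (binned.where(F.col("label") == 1)
           .unionByName(binned.where(F.col("label") == 0)
                        .sampleBy("prob_bin", fractions, seed=42)))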