Exemple #1
0
def impute_data(training_data: DataFrame,
                testing_data: DataFrame) -> Tuple[DataFrame, DataFrame]:
    """Fill missing values in both frames with statistics learned from training data

    Every fill value is derived from the non-null training values of a column
    and then applied to that column in both frames, so nothing computed from
    the testing data is ever used for imputation.

    Categorical columns are filled with the result of ``get_mode`` (defined
    elsewhere; presumably the most frequent value -- confirm against its
    definition). Numeric columns are filled with the median (``np.median``).

    NOTE: both input frames are modified in place (their columns are
    reassigned); the returned objects are the same objects that were passed in.

    Inputs
        training_data: DataFrame
            training data; must contain every imputed column
        testing_data: DataFrame
            testing data; must contain every imputed column

    Returns
        training_data: imputations based on the training data only
        testing_data: imputations based on the training data only

    Raises
        KeyError: if an imputed column is missing from either frame
    """
    # columns filled with get_mode(...) of the training values
    mode_columns = (
        "CODE_GENDER",
        "NAME_TYPE_SUITE",
    )
    # columns filled with the median of the training values
    median_columns = (
        "OBS_30_CNT_SOCIAL_CIRCLE",
        "DEF_30_CNT_SOCIAL_CIRCLE",
        "OBS_60_CNT_SOCIAL_CIRCLE",
        "DEF_60_CNT_SOCIAL_CIRCLE",
        "AMT_ANNUITY",
        "ANNUITY_INCOME_RATIO",
        "DAYS_LAST_PHONE_CHANGE",
    )

    # Learn every fill value once, from the raw training data, before any
    # frame is touched. This makes it explicit that the statistics come from
    # observed training values rather than from already-imputed ones.
    fill_values = {}
    for column in mode_columns:
        fill_values[column] = get_mode(training_data[column].dropna())
    for column in median_columns:
        fill_values[column] = np.median(training_data[column].dropna())

    # Apply the same training-derived values to training first, then testing.
    for frame in (training_data, testing_data):
        for column, value in fill_values.items():
            frame[column] = frame[column].fillna(value)

    return training_data, testing_data