Beispiel #1
0
                'values': [train[col].max()]
            }
            for col in continuous + discrete
        })
        # BINNER_CONFIG = [{ col: {'bins': 3} for col in continuous + discrete },
        #     # { col: {'bins': 5} for col in continuous + discrete },
        #     { col: {'bins': 7} for col in continuous + discrete },
        #     { col: {'values': [train[col].max()]} for col in continuous + discrete }]
        top_cont = [
            'LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF',
            'GarageArea', 'TotalBsmtSF', 'GrLivArea'
        ]
        BOX_COX = BOX_COX_HO

    elif dataset == 'heart':
        data, labels, continuous, discrete, dummy, categorical, target, missing = get_heart(
            missing=MISSING)
        train = data.drop(target, axis=1)
        cv = KFold(5, shuffle=True, random_state=0)
        scorer = error_rate
        predictors = [
            LogisticRegression(),
            SVC(),
            RandomForestClassifier(),
            DecisionTreeClassifier(),
            KNeighborsClassifier(n_neighbors=5),
        ]
        # BINNER_CONFIG = { col: {'bins': 3} for col in continuous + discrete }
        binner = CustomBinner(
            {col: {
                'bins': 3
            }
Beispiel #2
0
def get_test_config_heart(missing=True):
    """Assemble the test-set evaluation configuration for the heart dataset.

    Args:
        missing: Forwarded unchanged as the ``missing`` argument of both
            ``get_heart`` calls (``test=False`` and ``test=True``).

    Returns:
        A 7-tuple ``(data, test, test_labels, scorer, model, params, target)``:

        * ``data`` -- first value returned by ``get_heart(test=False, ...)``.
        * ``test`` -- first value returned by ``get_heart(test=True, ...)``
          with the ``target`` column dropped.
        * ``test_labels`` -- second value returned by
          ``get_heart(test=True, ...)``.
        * ``scorer`` -- the ``error_rate`` callable.
        * ``model`` -- the pipeline skeleton; most steps are ``None``
          placeholders.
        * ``params`` -- maps a configuration name to a dict with the keys
          ``'params'``, ``'score'`` and ``'std'``.
        * ``target`` -- seventh value returned by
          ``get_heart(test=False, ...)``.
    """
    # get_heart returns eight values; only three are used here.  The last one
    # used to be unpacked into ``missing``, which clobbered the argument and
    # leaked the returned object into the test-split call below.
    data, _, _, _, _, categorical, target, _ = get_heart(
        test=False, missing=missing)
    test_data, test_labels = get_heart(test=True, missing=missing)[0:2]
    test = test_data.drop(target, axis=1)
    scorer = error_rate

    # The same four columns appear throughout the configurations in two
    # different orders.  Both orders are preserved, and every estimator gets
    # its own list copy so no list object is shared between estimators.
    imputed_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']
    numeric_cols = ['chol', 'thalach', 'oldpeak', 'trestbps']
    nan_flag_cols = [col + '_nan' for col in imputed_cols]

    model = Pipeline([
        ('onehot', CustomOneHotEncoder(columns=categorical)),
        ('clipper', None),
        ('binner', None),
        ('binner2', None),
        ('simple_imputer', None),
        ('zero_filler', ZeroFiller()),  # just in case there are any left
        ('main_imputer', None),
        ('dropper', FeatureDropper(drop=[])),
        ('poly', None),
        ('combinations', None),
        ('boxcox', None),
        ('scaler', None),
        ('reduce_dim', None),
        ('predictor', None),
    ])

    # Keys inside each 'params' dict are pipeline step names (or
    # ``step__attribute``), i.e. they look like Pipeline.set_params arguments.
    # NOTE(review): 'score' / 'std' appear to be results recorded from an
    # earlier model search -- confirm against the experiment logs.
    params = {
        'DecisionTreeClassifier_base': {
            'params': {
                'predictor': DecisionTreeClassifier(max_depth=None),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=list(imputed_cols),
                    median=[],
                    nan_flag=[],
                    zero=[]),
            },
            'score': 0.1974390243902439,
            'std': 0.0756691348271984,
        },
        'KNeighborsClassifier_base': {
            'params': {
                'predictor': KNeighborsClassifier(n_neighbors=7),
                'scaler': RobustScaler(),
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=list(imputed_cols),
                    nan_flag=[],
                    zero=[]),
            },
            'score': 0.1924390243902439,
            'std': 0.04087385740197896,
        },
        'LogisticRegression_base': {
            'params': {
                'predictor': LogisticRegression(),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=list(imputed_cols),
                    median=[],
                    nan_flag=[],
                    zero=[]),
            },
            'score': 0.1825609756097561,
            'std': 0.04141604180434009,
        },

        # Disabled reference configuration (XGBClassifier baseline):
        #   'XGBClassifier_base': {'params': {'predictor': XGBClassifier(
        #              base_score=0.5,
        #         colsample_bytree=0.8, learning_rate=0.07,
        #         max_depth=7, n_estimators=200,),
        #  'scaler': None,
        #  'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[], nan_flag=[],
        #           zero=['trestbps', 'chol', 'thalach', 'oldpeak'])},
        # 'score': 0.16743902439024388,
        # 'std': 0.04176646782554455},
        'DecisionTreeClassifier_best': {
            'params': {
                'binner2': CustomBinner(
                    configuration={
                        col: {'bins': 3}
                        for col in numeric_cols + ['age', 'slope', 'ca']
                    }),
                'boxcox': None,
                'clipper': OutliersClipper(columns=list(numeric_cols)),
                'combinations': FeatureProduct(columns=list(numeric_cols)),
                'dropper__drop': list(nan_flag_cols),
                'main_imputer': HotDeckFullImputer(
                    col_k_pairs=[(col, None) for col in imputed_cols],
                    default_k=7),
                'poly': None,
                'predictor': DecisionTreeClassifier(max_depth=4),
                'reduce_dim': None,
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=[],
                    nan_flag=list(imputed_cols),
                    zero=list(imputed_cols)),
            },
            'score': 0.14780487804878048,
            'std': 0.03090350740255695,
        },
        'LogisticRegression_best': {
            'params': {
                'binner2': None,
                'boxcox': BoxCoxTransformer(lambdas_per_column={
                    'chol': 0,
                    'thalach': 2,
                    'trestbps': 0,
                }),
                'clipper': OutliersClipper(columns=list(numeric_cols)),
                'combinations': None,
                'dropper__drop': [],
                'main_imputer': ModelBasedFullImputer(
                    columns=list(imputed_cols),
                    model=LinearRegression()),
                'poly': PolynomialsAdder(
                    powers_per_column={col: [2] for col in numeric_cols}),
                'predictor': LogisticRegression(),
                'reduce_dim': PCA(n_components=10),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=list(imputed_cols),
                    median=[],
                    nan_flag=list(imputed_cols),
                    zero=[]),
            },
            'score': 0.14280487804878048,
            'std': 0.03915450868377355,
        },
        # Disabled reference configuration (XGBClassifier best):
        #   'XGBClassifier_best': {'params':
        #       {
        #           'binner2': CustomBinner(configuration={'chol': {'bins': 3}, 'thalach': {'bins': 3}, 'oldpeak': {'bins': 3}, 'trestbps': {'bins': 3}, 'age': {'bins': 3}, 'slope': {'bins': 3}, 'ca': {'bins': 3}},
        #                 drop=False, nan=False),
        #          'boxcox': BoxCoxTransformer(lambdas_per_column={'chol': 0, 'thalach': 2, 'trestbps': 0}),
        #          'clipper': OutliersClipper(columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
        #          'combinations': None,
        #          'dropper__drop': ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
        #          'main_imputer': HotDeckFullImputer(col_k_pairs=[('trestbps', None), ('chol', None), ('thalach', None), ('oldpeak', None)],
        #                    default_k=7),
        #          'poly': None,
        #          'predictor': XGBClassifier(
        #              base_score=0.5,
        #              colsample_bytree=0.8, learning_rate=0.07,
        #                 max_depth=7, n_estimators=200,),
        #          'reduce_dim': SelectFromModel(estimator=LogisticRegression(C=0.999, penalty='l1',
        #                                                                     )),
        #          'scaler': None,
        #          'simple_imputer': FillNaTransformer(from_dict={},
        #                   mean=['trestbps', 'chol', 'thalach', 'oldpeak'], median=[],
        #                   nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'], zero=[])},
        # 'score': 0.15243902439024387,
        # 'std': 0.04655758333858798}
    }

    return data, test, test_labels, scorer, model, params, target