Example #1
def scale_onehot(df, target):
    """Perform basic scaling and one-hot encoding"""

    features = df.drop(target, axis=1)

    categorical_cols = [[f] for f in features.select_dtypes('object').columns]
    categorical_pipe = gen_features(columns=categorical_cols,
                                    classes=[{
                                        'class': SimpleImputer,
                                        'strategy': 'constant',
                                        'fill_value': 'Na'
                                    }, OneHotEncoder])

    numerical_cols = [[f] for f in features.select_dtypes('number').columns]
    numerical_pipe = gen_features(columns=numerical_cols,
                                  classes=[SimpleImputer, StandardScaler])

    mapper = DataFrameMapper(categorical_pipe + numerical_pipe, df_out=True)

    X = mapper.fit_transform(df)

    y = df[target]

    target_names = sorted(y.unique())

    y = pd.Categorical(y, categories=target_names, ordered=True)
    y = y.codes

    return X, y, target_names
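
A minimal usage sketch (not part of the original snippet): it assumes the imports below and a toy DataFrame. Depending on your scikit-learn version, OneHotEncoder may need sparse_output=False for df_out=True to return a dense frame.

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper, gen_features

toy = pd.DataFrame({
    'color': ['red', 'blue', None, 'red'],  # object column: imputed with 'Na', then one-hot encoded
    'size': [1.0, 2.0, None, 4.0],          # numeric column: mean-imputed, then scaled
    'label': ['no', 'yes', 'no', 'yes'],
})
X, y, target_names = scale_onehot(toy, 'label')
# X: transformed feature DataFrame; y: integer-coded target; target_names: ['no', 'yes']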
Example #2
def make_pipeline_model(numeric_feature,
                        category_feature,
                        estimator,
                        X=None,
                        y=None):
    '''
    Build a pipeline from the given categorical and numeric features and the
    specified estimator. If a dataset is supplied, the pipeline is fitted on
    it; the (possibly fitted) pipeline model is returned.
    numeric_feature: list of numeric feature names
    category_feature: list of categorical feature names
    X: feature data, passed as a pandas.DataFrame
    y: target data, passed as a pandas.Series

    return:
    pipeline_model
    '''
    feature_def = gen_features(
        columns=category_feature,
        classes=[CategoricalDomain, CategoricalImputer, LabelBinarizer])
    mapper_numerical = DataFrameMapper([(numeric_feature, [
        ContinuousDomain(),
        SimpleImputer(strategy='mean'),
        StandardScaler()
    ])])
    mapper_category = DataFrameMapper(feature_def)
    mapper = FeatureUnion([('mapper_numerical', mapper_numerical),
                           ('mapper_category', mapper_category)])
    pipeline_model = PMMLPipeline([('mapper', mapper),
                                   ('classifier', estimator)])
    if X is not None and y is not None:
        pipeline_model.fit(X, y)
    return pipeline_model
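
A hypothetical call site for make_pipeline_model (the toy columns and the PMML export step are assumptions, not part of the original):

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml import sklearn2pmml  # exporting a PMMLPipeline requires a local JVM

df = pd.DataFrame({'age': [22.0, None, 35.0, 41.0],
                   'fare': [7.3, 71.8, None, 8.1],
                   'sex': ['m', 'f', 'f', 'm'],
                   'survived': [0, 1, 1, 0]})
pipeline = make_pipeline_model(numeric_feature=['age', 'fare'],
                               category_feature=['sex'],
                               estimator=DecisionTreeClassifier(),
                               X=df.drop('survived', axis=1),
                               y=df['survived'])
sklearn2pmml(pipeline, 'model.pmml')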
Example #3
def clean_df(df, id_cols=None):
    from sklearn_pandas import gen_features, DataFrameMapper, CategoricalImputer

    print("Imputation of numeric columns")
    if id_cols:
        df.set_index(keys=id_cols, inplace=True)

    df_numeric = df.select_dtypes(include=[int, float])
    df_non_num = df.select_dtypes(exclude=[int, float])

    num_imp_train = gen_features(columns=df_numeric.columns,
                                 classes=[CategoricalImputer])

    # num_std_scale = gen_features(
    #     columns= df_numeric.columns,
    #     classes = [StandardScaler]
    # )

    num_map_train = DataFrameMapper(num_imp_train, df_out=True, input_df=True)
    # num_scale_map = DataFrameMapper(num_std_scale,df_out=True,input_df=True)
    print("Train Dataset numeric Impute")
    df_new = num_map_train.fit_transform(df_numeric)
    # print("Scaling Data")
    # df_new = num_scale_map.fit_transform(df_new)
    df_new = df_new.merge(df_non_num, left_index=True, right_index=True)
    print("Imputation with Mode Complete")
    return df_new
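
A short, hypothetical call to clean_df (CategoricalImputer only ships with sklearn-pandas 1.x, so this sketch assumes that version):

import pandas as pd

toy = pd.DataFrame({'id': [1, 2, 3],
                    'age': [25.0, None, 25.0],
                    'city': ['NY', 'LA', None]})
cleaned = clean_df(toy, id_cols=['id'])  # NaN in 'age' is filled with the column mode (25.0)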
Example #4
def create_mapper_sklearn_pandas_contrib(categorical_features,
                                         numeric_features):
    from sklearn_pandas import DataFrameMapper, gen_features

    categorical_maps = gen_features(columns=[[feature]
                                            for feature in categorical_features
                                            ],
                                   classes=[{
                                       'class': OneHotEncoder,
                                       'dtype': np.float32,
                                       'sparse': False,
                                       'handle_unknown': 'ignore'
                                   }])
    numeric_maps = gen_features(columns=[[feature]
                                         for feature in numeric_features],
                                classes=[StandardScaler])
    return DataFrameMapper(categorical_maps + numeric_maps, default=None)
Example #5
def scale_data(data):
    feature_def = gen_features(columns=data.columns.values.reshape(
        (-1, 1)).tolist(),
                               classes=[sklearn.preprocessing.StandardScaler])
    mapper = DataFrameMapper(feature_def, df_out=True)

    transformed_data = mapper.fit_transform(data)

    return mapper, transformed_data
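
For completeness, a small assumed example of calling scale_data and reusing the fitted mapper on new rows:

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
mapper, scaled = scale_data(df)
new_scaled = mapper.transform(pd.DataFrame({'a': [4.0], 'b': [40.0]}))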
Example #6
    def transform_data(self, df, runtime_label):
        # pop() removes the label column from df in place,
        # so df_features ends up being df without the label
        df_labels = df.pop(runtime_label)
        df_features = df

        # Define which features are going to be transformed to a range of 0 to 1 (continuous)
        nfeats = gen_features(
            columns=[[i] for i in list(df_features.select_dtypes(include=[float]))],
            classes=[sklearn.preprocessing.MinMaxScaler]  
        )

        # Define which features are going to be binarized (categorical)
        # Define which features are going to be binarized (categorical)
        sfeats = gen_features(
            columns=list(df_features.select_dtypes(include=[object])),
            classes=[sklearn.preprocessing.LabelBinarizer]
        )

        # Do the transformations defined above
        mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        df_features = mapper.fit_transform(df_features)

        return df_features, df_labels
Example #7
def feature_union(category_feature, numeric_feature):
    mapper_category = DataFrameMapper(gen_features(
            columns=category_feature,
            # LabelEncoder maps each category to an integer code
            classes=[CategoricalDomain, CategoricalImputer, LabelEncoder]
            ))
    mapper_numerical = DataFrameMapper([
            (numeric_feature, [ContinuousDomain(), SimpleImputer(strategy='mean'), StandardScaler()])
            ])
    pipeline_transformer = FeatureUnion([('mapper_category', mapper_category),
                                         ('mapper_numerical', mapper_numerical)])
    return pipeline_transformer
Example #8
    def build_converter(self):
        """
        Prepares a mapper between Pandas Dataframe and sklearn matrix
        """

        label_encoding = gen_features(columns=self.label_fields,
                                      classes=[LabelBinarizer])

        categorical = gen_features(columns=[[f]
                                            for f in self.categorical_fields],
                                   classes=[{
                                       'class': SimpleImputer,
                                       'strategy': "most_frequent"
                                   }, {
                                       'class': OneHotEncoder,
                                       'sparse': False
                                   }],
                                   suffix="_cat")

        numeric = gen_features(columns=[[t[0]] for t in self.thresholds],
                               classes=[{
                                   'class': SimpleImputer,
                                   'strategy': 'median'
                               }])

        # if any boolean
        boolean = []
        for f in self.df.columns:
            if self.df[f].dtype == bool and f not in self.label_fields:
                self.df[f] = pd.to_numeric(self.df[f])
                boolean.append(([f], None))

        mapper_X = (DataFrameMapper(numeric) if self.only_numeric
                    else DataFrameMapper(categorical + boolean + numeric))
        mapper_y = DataFrameMapper(label_encoding)
        return mapper_X, mapper_y
Example #9
    def fit(self, X, y=None):
        self.ncols = []
        self.scols = []
        # print("mapping features")
        for col in X:
            if X[col].dtype == float:
                # print("numerical col: %s" % col)
                self.ncols.append([col])
            else:
                # print("categorical col: %s" % col)
                self.scols.append([col])
        nfeats = gen_features(
            columns=self.ncols,
            classes=[{'class': sklearn.preprocessing.MinMaxScaler}]
        )
        sfeats = gen_features(
            columns=self.scols,
            classes=[{'class': LabelBinarizer2}]
        )
        self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        self.mapper.fit(X)
        # print("features mapped")
        return self
Example #10
def _create_apply_transformers(df):
    from sklearn_pandas import DataFrameMapper
    import category_encoders as ce

    data_raw = df

    obj_cols = data_raw.select_dtypes("object").columns.to_list()

    from sklearn_pandas import gen_features

    feature_def = gen_features(
        columns=obj_cols,
        classes=[{
            "class": ce.OrdinalEncoder,
            "handle_unknown": "return_nan",
            "handle_missing": "return_nan"
        }],
    )

    mapper = DataFrameMapper(feature_def, default=None, df_out=True)

    data_transformed = mapper.fit_transform(data_raw)
    return data_transformed, mapper
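
A hedged usage sketch (toy data assumed) showing why handle_unknown='return_nan' matters when the fitted mapper is applied to new data:

import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'SF'], 'price': [1.0, 2.0, 3.0]})
data_transformed, mapper = _create_apply_transformers(df)
unseen = pd.DataFrame({'city': ['LA', 'Boston'], 'price': [4.0, 5.0]})
mapper.transform(unseen)  # 'Boston' was never seen during fit, so it becomes NaN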
Example #11
TARGET = df_train_flt['isPurchase'].copy()
TRAIN = df_train_flt.drop('isPurchase', axis=1).copy()

# 1. Drop some columns
use_cols = [
    'channelGrouping', 'visitNumber', 'device.deviceCategory',
    'device.isMobile', 'totals.hits', 'totals.newVisits', 'totals.pageviews',
    'trafficSource.isTrueDirect'
]
TRAIN = TRAIN[use_cols]

# Preprocessing
# Pipeline
feature_cat = gen_features(
    columns=['channelGrouping', 'device.deviceCategory'],
    classes=[ModifiedLabelEncoder, OneHotEncoder])
feature_num = gen_features(columns=[['visitNumber'], ['device.isMobile'],
                                    ['totals.hits'], ['totals.newVisits'],
                                    ['totals.pageviews'],
                                    ['trafficSource.isTrueDirect']],
                           classes=[StandardScaler])
mapper = DataFrameMapper(feature_cat + feature_num, input_df=True, df_out=True)
TRAIN_preprocessed = mapper.fit_transform(TRAIN.copy())

# Split train/test sets
X_train, X_test, y_train, y_test = train_test_split(TRAIN_preprocessed,
                                                    TARGET,
                                                    random_state=26)

# Auto-sklearn
Example #12
df_train = loader.read_original_data(table_code='train')
df_test = loader.read_original_data(table_code='test')

# Consider only a subset of columns
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)

USE_COLS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
TARGET = ['Survived']
X_train = df_train[USE_COLS].copy()
y_train = df_train[TARGET].copy().values.reshape(-1, )
X_test = df_test[USE_COLS].copy()

# Preprocessing
feature_cat = gen_features(
    columns=['Pclass', 'Sex', 'Embarked'],
    classes=[CategoricalImputer, ModifiedLabelEncoder, OneHotEncoder])
feature_num = gen_features(columns=[['Age'], ['SibSp'], ['Parch'], ['Fare']],
                           classes=[Imputer, StandardScaler])
mapper = DataFrameMapper(feature_cat + feature_num, input_df=True, df_out=True)

X_train_fit = mapper.fit_transform(X_train.copy())

# Training
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
Example #13
data.columns = [str(c) for c in data.columns]

# Shuffle, Shuffle and Shuffle!
data = data.sample(frac=1)


num_cols = len(data.columns)
X = data.iloc[:, 1:num_cols - 1]
y = data.iloc[:, num_cols - 1]

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

#Preprocess X_train
feature_def = gen_features(
    columns=[[c] for c in X_train.columns[:7]],
    classes=[MinMaxScaler]
)

feature_def += [(pos_col, [LabelBinarizer()])]

svc_preprocessor = DataFrameMapper(feature_def)
X_train = svc_preprocessor.fit_transform(X_train)
svc_preprocessor_fn = os.path.join('../model/tmp/svc_preprocessor.%s.pkl' % (nrows,))
joblib.dump(svc_preprocessor, open(svc_preprocessor_fn, 'wb'))
X_test = svc_preprocessor.transform(X_test)
#####

#Didn't help!!
#X_train, y_train = downsample_negatives(X_train, y_train)

for cv in [1,10,20]:
Example #14
# Consider only a subset of columns
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)
#print(df_train.head())

USE_COLS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
TARGET = ['Survived']
X_train = df_train[USE_COLS].copy()
y_train = df_train[TARGET].copy().values.reshape(-1,)
X_test = df_test[USE_COLS].copy()

# Preprocessing
# 1. 1-hot encode categorical columns
feature_cat = gen_features(columns=['Pclass', 'Sex', 'Embarked'],
                           classes=[CategoricalImputer, {'class': FunctionTransformer,
                                                         'func': pd.get_dummies,
                                                         'validate':False}]
                           )
feature_num = gen_features(columns=[['Age'], ['SibSp'], ['Parch'], ['Fare']],
                           classes=[Imputer, StandardScaler])
'''
mapper = DataFrameMapper([
    ('Sex', [CategoricalImputer(), FunctionTransformer(pd.get_dummies, validate=False)]),
    ('Embarked', [CategoricalImputer(), FunctionTransformer(pd.get_dummies, validate=False)]),
    (['Age', 'SibSp', 'Parch', 'Fare'], [Imputer(), StandardScaler()])
], df_out=True)
'''
mapper = DataFrameMapper(
    feature_cat + feature_num,
    input_df=True, df_out=True)
Example #15
df = df.sort_values([
    'DayOfYear', 'Carrier', 'Origin', 'Dest', 'FlightNum', 'CRSDepTime',
    'CRSArrTime'
], inplace=False).reset_index(drop=True)
# Select subset
training = df[[
    'FlightNum', 'DayOfWeek', 'DayOfMonth', 'Carrier', 'Origin', 'Dest',
    'Distance', 'DepDelay', 'CRSDepTime', 'CRSArrTime', 'DayOfYear'
]]
training.loc[:, ['CRSDepTime', 'CRSArrTime']] = \
    training.loc[:, ['CRSDepTime', 'CRSArrTime']].astype(int)
# Convert categoricals to indicators.
feature_def = gen_features(columns=['Carrier', 'Origin', 'Dest'],
                           classes=[LabelBinarizer])
mapper = DataFrameMapper(feature_def, default=None)
training_vectors = mapper.fit_transform(training)
results_vector = df.ArrDelay.values
df_training = pd.DataFrame(columns=mapper.transformed_names_,
                           data=training_vectors)
# Generate train/test sets.
X_train, X_test, y_train, y_test = train_test_split(training_vectors,
                                                    results_vector,
                                                    test_size=0.1,
                                                    random_state=43)
# Do the regression.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
predicted = regressor.predict(X_test)
# Show results
Example #17
def create_preprocessing_pipeline() -> Pipeline:
    def impute_garage_yr_blt(X, y):
        X.loc[X['GarageYrBlt'].isnull(),
              ['GarageYrBlt']] = X.loc[X['GarageYrBlt'].isnull(), 'YearBuilt']
        return X

    pipeline = Pipeline(steps=[
        ('drop_id', tr.DropColumns(columns_to_drop=['Id'])),
        ('drop_cols_with_missing_values',
         tr.DropColumns(columns_to_drop=[
             'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'LotFrontage'
         ])),
        ('drop_electrical_rows',
         tr.DataFrameFunctionTransformer(
             lambda X, y: X.drop(X.loc[X['Electrical'].isnull()].index))),
        ('impute_garage_columns',
         DataFrameMapper(gen_features(columns=[['GarageType'], [
             'GarageFinish'
         ], ['GarageQual'], ['GarageCond']],
                                      classes=[{
                                          'class': SimpleImputer,
                                          'strategy': 'constant',
                                          'fill_value': 'No'
                                      }]),
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_garage_yr_blt',
         tr.DataFrameFunctionTransformer(impute_garage_yr_blt)),
        ('impute_bsmt_columns',
         DataFrameMapper(gen_features(
             columns=[['BsmtCond'], ['BsmtQual'], ['BsmtExposure'],
                      ['BsmtFinType1'], ['BsmtFinType2']],
             classes=[{
                 'class': SimpleImputer,
                 'strategy': 'constant',
                 'fill_value': 'No'
             }]),
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_mas_vnr_type',
         DataFrameMapper([(
             ['MasVnrType'],
             SimpleImputer(strategy='constant', fill_value='None'))],
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_mas_vnr_area',
         DataFrameMapper([(['MasVnrArea'],
                           SimpleImputer(strategy='constant', fill_value=0))],
                         input_df=True,
                         df_out=True,
                         default=None)),

        # partial(...) binds the current `value` at definition time, avoiding
        # the late-binding closure pitfall in the comprehension below
        ('replace_values',
         DataFrameMapper([(
             key,
             tr.SeriesFunctionTransformer(
                 partial(lambda value, col: col.replace(value), value)))
                          for (key, value) in {
                              "MSSubClass": {
                                  20: "SC20",
                                  30: "SC30",
                                  40: "SC40",
                                  45: "SC45",
                                  50: "SC50",
                                  60: "SC60",
                                  70: "SC70",
                                  75: "SC75",
                                  80: "SC80",
                                  85: "SC85",
                                  90: "SC90",
                                  120: "SC120",
                                  150: "SC150",
                                  160: "SC160",
                                  180: "SC180",
                                  190: "SC190"
                              },
                              "MoSold": {
                                  1: "Jan",
                                  2: "Feb",
                                  3: "Mar",
                                  4: "Apr",
                                  5: "May",
                                  6: "Jun",
                                  7: "Jul",
                                  8: "Aug",
                                  9: "Sep",
                                  10: "Oct",
                                  11: "Nov",
                                  12: "Dec"
                              },
                              "BsmtCond": {
                                  "No": 0,
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "BsmtExposure": {
                                  "No": 0,
                                  "Mn": 1,
                                  "Av": 2,
                                  "Gd": 3
                              },
                              "BsmtFinType1": {
                                  "No": 0,
                                  "Unf": 1,
                                  "LwQ": 2,
                                  "Rec": 3,
                                  "BLQ": 4,
                                  "ALQ": 5,
                                  "GLQ": 6
                              },
                              "BsmtFinType2": {
                                  "No": 0,
                                  "Unf": 1,
                                  "LwQ": 2,
                                  "Rec": 3,
                                  "BLQ": 4,
                                  "ALQ": 5,
                                  "GLQ": 6
                              },
                              "BsmtQual": {
                                  "No": 0,
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "ExterCond": {
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "ExterQual": {
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "Functional": {
                                  "Sal": 1,
                                  "Sev": 2,
                                  "Maj2": 3,
                                  "Maj1": 4,
                                  "Mod": 5,
                                  "Min2": 6,
                                  "Min1": 7,
                                  "Typ": 8
                              },
                              "GarageCond": {
                                  "No": 0,
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "GarageQual": {
                                  "No": 0,
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "HeatingQC": {
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "KitchenQual": {
                                  "Po": 1,
                                  "Fa": 2,
                                  "TA": 3,
                                  "Gd": 4,
                                  "Ex": 5
                              },
                              "LandSlope": {
                                  "Sev": 1,
                                  "Mod": 2,
                                  "Gtl": 3
                              },
                              "LotShape": {
                                  "IR3": 1,
                                  "IR2": 2,
                                  "IR1": 3,
                                  "Reg": 4
                              },
                              "PavedDrive": {
                                  "N": 0,
                                  "P": 1,
                                  "Y": 2
                              },
                              "Street": {
                                  "Grvl": 1,
                                  "Pave": 2
                              },
                              "Utilities": {
                                  "ELO": 1,
                                  "NoSeWa": 2,
                                  "NoSewr": 3,
                                  "AllPub": 4
                              }
                          }.items()],
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('drop_suspicious_columns',
         tr.DropColumns(columns_to_drop=[
             'Utilities', 'Street', 'Condition2', 'RoofMatl', 'Heating',
             'KitchenAbvGr', 'PoolArea'
         ]))
    ])
    return pipeline
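
A hypothetical invocation (the tr transformer module and the raw training frame come from the surrounding project):

pipeline = create_preprocessing_pipeline()
df_clean = pipeline.fit_transform(df_train_raw)  # df_train_raw: assumed Ames-housing-style DataFrame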
Example #18
def main():
    # set parameters and retrieve data
    begin_week = 1
    end_week = 11
    window_size = 5

    records = retrieve_spl9_overall_records()
    records = add_week(records, 10, semis_matchups)
    records = add_week(records, end_week, finals_matchups)
    matchups = get_matchups(records)
    info = get_matchups_info(records, matchups, weeks(begin_week, end_week))
    instances = get_instances(info, end_week, window_size)
    train_instances = instances.loc[[*weeks(begin_week, end_week - 1)]]
    predict_instances = instances.loc[to_week_label(end_week)]

    # Create label transformations and attribute normalizations
    attribute_mapper = DataFrameMapper([
        *gen_features(columns=[['cost_x'], ['cost_y']],
                      classes=[{
                          'class': StandardScaler
                      }]),
        *gen_features(columns=append_player_subscripts(
            ['result-{}'.format(w + 1) for w in range(window_size)] +
            ['tier-{}'.format(w + 1) for w in range(window_size)]),
                      classes=[LabelBinarizer]),
        *gen_features(columns=append_player_subscripts(tier_list),
                      classes=[LabelBinarizer]), ('tier', LabelBinarizer())
    ])
    label_mapper = DataFrameMapper([('result', LabelBinarizer())])

    X = attribute_mapper.fit_transform(train_instances.copy())
    y = label_mapper.fit_transform(train_instances.copy()).ravel()
    X_act = attribute_mapper.transform(predict_instances.copy())
    y_act = label_mapper.transform(predict_instances.copy()).ravel()

    print(X.shape)
    print(X_act.shape)

    seed = 2718281828
    validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    with_prob = 'accuracy'
    without_prob = 'accuracy'

    trees_clf = RandomizedSearchCV(ExtraTreesClassifier(random_state=seed),
                                   cv=validation,
                                   n_iter=500,
                                   random_state=seed,
                                   scoring=with_prob,
                                   param_distributions={
                                       'n_estimators':
                                       stats.randint(low=40, high=200),
                                       'max_features':
                                       stats.uniform(loc=0.01, scale=0.99),
                                       'max_depth':
                                       stats.randint(low=1, high=10)
                                   })
    trees_clf.fit(X, y)
    print("Extra Trees:")
    print(trees_clf.best_params_)
    print(trees_clf.best_score_)
    print(accuracy_score(y_act, trees_clf.predict(X_act)))

    grad_clf = RandomizedSearchCV(
        GradientBoostingClassifier(random_state=seed),
        cv=validation,
        n_iter=500,
        random_state=seed,
        scoring=with_prob,
        param_distributions={
            'loss': ['exponential'],
            'learning_rate': stats.uniform(loc=1, scale=3),
            'n_estimators': stats.randint(low=40, high=200),
            'max_depth': stats.randint(low=1, high=10),
            'min_samples_split': stats.randint(low=2, high=13),
            'max_features': stats.uniform(loc=0.01, scale=0.99)
        })
    grad_clf.fit(X, y)
    print("Gradient Boosting:")
    print(grad_clf.best_params_)
    print(grad_clf.best_score_)
    print(accuracy_score(y_act, grad_clf.predict(X_act)))

    forest_clf = RandomizedSearchCV(RandomForestClassifier(random_state=seed),
                                    cv=validation,
                                    n_iter=500,
                                    random_state=seed,
                                    scoring=with_prob,
                                    param_distributions={
                                        'n_estimators':
                                        stats.randint(low=10, high=100),
                                        'max_features':
                                        stats.uniform(loc=0.01, scale=0.99),
                                        'max_depth':
                                        stats.randint(low=1, high=10)
                                    })
    forest_clf.fit(X, y)
    print("Random Forest:")
    print(forest_clf.best_params_)
    print(forest_clf.best_score_)
    print(accuracy_score(y_act, forest_clf.predict(X_act)))

    ada_clf = RandomizedSearchCV(AdaBoostClassifier(DecisionTreeClassifier(),
                                                    random_state=seed),
                                 cv=validation,
                                 n_iter=500,
                                 random_state=seed,
                                 scoring=with_prob,
                                 param_distributions={
                                     'base_estimator__max_depth':
                                     stats.randint(low=1, high=10),
                                     'n_estimators':
                                     stats.randint(low=10, high=100),
                                     'learning_rate':
                                     stats.uniform(loc=1.29, scale=0.06)
                                 })
    ada_clf.fit(X, y)
    print("AdaBoost:")
    print(ada_clf.best_params_)
    print(ada_clf.best_score_)
    print(accuracy_score(y_act, ada_clf.predict(X_act)))

    bagging_clf = RandomizedSearchCV(BaggingClassifier(random_state=seed),
                                     cv=validation,
                                     n_iter=500,
                                     random_state=seed,
                                     scoring=with_prob,
                                     param_distributions={
                                         'n_estimators':
                                         stats.randint(low=57, high=75),
                                         'max_samples':
                                         stats.randint(low=6, high=9)
                                     })
    bagging_clf.fit(X, y)
    print("Bagging:")
    print(bagging_clf.best_params_)
    print(bagging_clf.best_score_)
    print(accuracy_score(y_act, bagging_clf.predict(X_act)))

    svc_clf = RandomizedSearchCV(SVC(random_state=seed),
                                 cv=validation,
                                 n_iter=500,
                                 random_state=seed,
                                 scoring=with_prob,
                                 param_distributions={
                                     'kernel': ['poly'],
                                     'degree': stats.randint(low=2, high=4),
                                     'C': stats.uniform(loc=1, scale=13),
                                     'coef0': stats.uniform(loc=-7, scale=8)
                                 })
    svc_clf.fit(X, y)
    print("SVC:")
    print(svc_clf.best_params_)
    print(svc_clf.best_score_)
    print(accuracy_score(y_act, svc_clf.predict(X_act)))

    nn_clf = RandomizedSearchCV(MLPClassifier(random_state=seed),
                                cv=validation,
                                n_iter=500,
                                random_state=seed,
                                scoring=with_prob,
                                param_distributions={
                                    'activation': ['relu'],
                                    'solver': ['lbfgs'],
                                    'hidden_layer_sizes': [(7, )],
                                    'alpha':
                                    stats.uniform(loc=0.1e-4, scale=1.5e-4)
                                })
    nn_clf.fit(X, y)
    print("Neural Network:")
    print(nn_clf.best_params_)
    print(nn_clf.best_score_)
    print(accuracy_score(y_act, nn_clf.predict(X_act)))

    base_voting_clf = VotingClassifier([
        ('ada', AdaBoostClassifier(DecisionTreeClassifier(random_state=seed))),
        ('svc', SVC(random_state=seed)),
        ('grad', GradientBoostingClassifier(random_state=seed))
    ])
    voting_clf = RandomizedSearchCV(base_voting_clf,
                                    cv=validation,
                                    n_iter=500,
                                    scoring=with_prob,
                                    param_distributions={
                                        'voting': ['hard', 'soft'],
                                        'svc__probability': [True],
                                        'svc__kernel': ['sigmoid'],
                                        'svc__gamma':
                                        stats.uniform(loc=5.1e-2,
                                                      scale=0.1e-2),
                                        'svc__coef0':
                                        stats.uniform(loc=8.5e-2,
                                                      scale=0.2e-2),
                                        'svc__C':
                                        stats.uniform(loc=5.4, scale=0.02),
                                        'ada__base_estimator__max_depth': [1],
                                        'ada__n_estimators':
                                        stats.randint(low=38, high=40),
                                        'ada__learning_rate':
                                        stats.uniform(loc=1.29, scale=0.06)
                                    })
    voting_clf.fit(X, y)
    print("Voting:")
    print(voting_clf.best_params_)
    print(voting_clf.best_score_)
    print(accuracy_score(y_act, voting_clf.predict(X_act)))
Example #19
    def train(self):
        import xgboost
        from baikal import make_step, Step, Input, Model
        from baikal.steps import Stack
        from sklearn_pandas import gen_features
        import custom_transformations as ct
        from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

        # these are the categorical columns in the dataset
        CATEGORICAL_COLUMNS = [
            'KitchenQual',
            'MSSubClass',
            'MSZoning',
            'Street',
            'Alley',
            'LotShape',
            'LandContour',
            'Utilities',
            'LotConfig',
            'LandSlope',
            'Neighborhood',
            'Condition1',
            'Condition2',
            'BldgType',
            'HouseStyle',
            'RoofStyle',
            'RoofMatl',
            'Exterior1st',
            'Exterior2nd',
            'MasVnrType',
            'ExterQual',
            'ExterCond',
            'Foundation',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'Heating',
            'HeatingQC',
            'CentralAir',
            'Functional',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PavedDrive',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'SaleType',
            'SaleCondition',
            'OverallQual',
            'OverallCond',
        ]

        # these columns will be treated as numerical columns
        NUMERICAL_COLUMNS = [
            'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
            'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
            'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

        # These columns have missing values; for each of them we add a missing-indicator variable
        MISSING_INDICATOR = [
            'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
            'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
            'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
            'MiscFeature'
        ]

        ## Categorical Columns for which we want One Hot Encoding
        ONEHOT_COLUMNS = [
            'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
            'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
            'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
            'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
            'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
            'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
            'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
            'MiscFeature', 'SaleType', 'SaleCondition'
        ]

        ## Categorical Columns for which we want to have target encoding
        TARGET_COLUMNS = [
            'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
        ]

        ## Columns that require log transformations
        LOG_COLUMNS = [
            'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
            'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
        ]

        # Define Steps
        ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
        ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
        XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                     class_name='XGBRegressor')
        LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                         class_name='LinearRegression')

        # Define sklearn-pandas transformations. Here I am using gen_features utility to
        # define transformations for individual columns.
        baseProcessing = (
            gen_features(columns=[[x] for x in MISSING_INDICATOR],
                         classes=[{
                             'class': MissingIndicator,
                             'features': 'all',
                             'sparse': False,
                             'error_on_new': False
                         }],
                         prefix='na_') +
            gen_features(
                columns=LOG_COLUMNS,
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }, {
                    'class': FunctionTransformer,
                    'func': np.log1p
                }]) +
            gen_features(
                columns=list(set(NUMERICAL_COLUMNS) - set(LOG_COLUMNS)),
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }],
            ) + [
                # constructing new features -- age of the house
                (['YrSold', 'YearBuilt'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'age'
                }),

                # constructing new feature -- remodeling age
                (['YrSold', 'YearRemodAdd'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'remodel_age'
                }),

                # new feature -- total surface area
                (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                    FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'numerical_TotalArea'
                })
            ])

        # Since CatBoost model can handle categorical data, we don't need to encode categorical variables
        # we will simply impute missing values and let CatBoost model handle categorical data.
        catModelPreprocessing = gen_features(
            columns=CATEGORICAL_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(object).reshape(-1, 1)
            }, {
                'class': SimpleImputer,
                'strategy': 'most_frequent'
            }],
        )

        # for regression and XGBoost, we need to encode categorical variables ourselves.
        # Depending on the cardinality of the variable, I use either one-hot encoding or target encoding.
        regressionModelProcessing = (
            gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                         classes=[{
                             'class': OneHotEncoder,
                             'handle_unknown': 'ignore',
                             'sparse': False
                         }]) + gen_features(columns=[[x]
                                                     for x in TARGET_COLUMNS],
                                            classes=[
                                                {
                                                    'class': TargetEncoder
                                                },
                                                {
                                                    'class': SimpleImputer,
                                                    'strategy': 'mean'
                                                },
                                            ]))

        # Define DAG
        x = Input(name="x")
        y = Input(name='y')

        # Define feature transformations
        d0 = DataFrameMapperStep(baseProcessing,
                                 df_out=True,
                                 name='BasePreprocess')(x, y)
        d1 = DataFrameMapperStep(regressionModelProcessing,
                                 df_out=True,
                                 name='RegressionModelPreprocess')(x, y)
        d2 = DataFrameMapperStep(catModelPreprocessing,
                                 df_out=True,
                                 name='CatModelPreprocess')(x, y)

        # Consolidate features for catboost and elasticnet
        regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
        catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

        # Generate predictions using three different algorithms.
        m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
        m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
        m3 = CatBoostRegressorStep(name='CatBoost',
                                   cat_features=CATEGORICAL_COLUMNS,
                                   iterations=10)(catFeatures, y)

        # combine predictions from the ElasticNet and CatBoost models
        # (the XGBoost step is fit above but not included in the stack)
        combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

        # construct an ensemble model
        ensembleModel = LinearRegressionStep()(combinedPredictions, y)
        model = Model(x, ensembleModel, y)
        model.fit(self.trainDF, self.trainDF['SalePrice'])
        self.artifact = {
            'model.pkl': cloudpickle.dumps(model),
            'environment': {
                'pip': {}
            }
        }
        self.next(self.end)
Example #20
    with open(df_name.replace('.csv', '') + '_gbk.csv', 'w') as f:
        f.write(data)
    df = pd.read_csv(df_name.replace('.csv', '') + '_gbk.csv', encoding='gbk')
    return df
data_4 = read_csv_gbk('算话测试报告-恒普-20200826.part01//算话变量_个人资料置信度_8w.csv')
#data_4 = pd.read_csv(r'C:\Users\徐钦华\Desktop\数据分析项目\算话测试\算话测试报告-恒普-20200826.part01\算话变量_个人资料置信度_8w.csv',
#                     engine='python')
data_5 = pd.read_csv('算话测试报告-恒普-20200826.part01//算话变量_团伙风险识别_8w.csv', engine='python')
label_data = label_data[label_data['数据编号'].notna()]

mapper_2 = DataFrameMapper([('pdls041', LabelBinarizer())], default=None, df_out=True)
data_2 = mapper_2.fit_transform(data_2.copy())
mapper_3 = DataFrameMapper([('z_risk_rate', LabelBinarizer())], default=None, df_out=True)
data_3 = mapper_3.fit_transform(data_3.copy())
feature_def_4 = gen_features(
    columns=[[m] for m in ['xx' + str(i) for i in range(598, 619)] + ['xx1247'] +
             ['xx' + str(i) for i in range(2531, 2538)]],
    classes=[MinMaxScaler, StandardScaler])
mapper_4 = DataFrameMapper(feature_def_4, default=None, input_df=True, df_out=True)
data_4 = np.round(mapper_4.fit_transform(data_4.copy()), 2)
feature_def_5 = gen_features(
    columns=[['xx1174'], ['xx1175'], ['xx1176'], ['xx1183'], ['xx1184'], ['xx1185'],
             ['xx631'], ['xx632'], ['xx633'], ['xx2495'], ['xx2492'], ['xx2538']],
    classes=[MinMaxScaler, StandardScaler])
mapper_5 = DataFrameMapper(feature_def_5, default=None, input_df=True, df_out=True)
data_5 = np.round(mapper_5.fit_transform(data_5.copy()), 2)
for i in ['R01', 'R02', 'R07']:
    data_3[i] = 0
    data_3.loc[(data_3['z_risk_reason'].notna()) & (data_3['z_risk_reason'].str.contains(i)), i] = 1
data_3['R_risk'] = 0  # initialize once, outside the loop, so earlier matches are not overwritten
for i in ['R03', 'R04', 'R05', 'R06']:
    data_3.loc[(data_3['z_risk_reason'].notna()) & (data_3['z_risk_reason'].str.contains(i)), 'R_risk'] = 1
for j in ['S100', 'S200', 'S210', 'S220', 'S221', 'S222', 'S223', 'S224', 'S225']:
    data_3[j] = 0
    data_3.loc[(data_3['z_business_source'].notna()) & (data_3['z_business_source'].str.contains(j)), j] = 1
Example #21
    y_test_df = pd.read_csv(y_test_path)
    y_test = y_test_df.get('Survived')

    y = train_df.get('Survived')

    X = train_df.drop('Survived', axis=1)

    categorical_features = ['Sex', 'Embarked']
    numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    features_def = []
    if categorical_features:
        for feature in categorical_features:
            categorical_feature_def = gen_features(
                columns=[[feature]],
                classes=[
                    {'class': SimpleImputer, 'strategy': 'most_frequent'},
                    {'class': OneHotEncoder, 'handle_unknown': 'ignore'}
                ]
            )
            features_def = features_def + categorical_feature_def

    if numerical_features:
        for feature in numerical_features:
            numerical_feature_def = gen_features(
                columns=[[feature]],
                classes=[
                    {'class': SimpleImputer, 'strategy': 'mean'},
                    {'class': StandardScaler},
                ]
            )
            features_def = features_def + numerical_feature_def
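
The snippet stops before features_def is used; a natural (assumed) continuation wires it into a DataFrameMapper:

mapper = DataFrameMapper(features_def, input_df=True, df_out=True)
X_transformed = mapper.fit_transform(X)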
Example #22
ordinal_features = ['level.mito', 'level.ribo']

numeric_features = X.columns.values.tolist()

for i in boolean_features:
    X[i] = X[i].astype(int)

for i in categorical_features + boolean_features + ordinal_features:
    numeric_features.remove(i)

# Uses extracts from https://github.com/kinir/catboost-with-pipelines/blob/master/sklearn-pandas-catboost.ipynb.

gen_category = gen_features(
    columns=[[i] for i in categorical_features + boolean_features],
    classes=[{
        "class": SimpleImputer,
        "strategy": "most_frequent"
    }, {
        "class": OneHotEncoder
    }])

gen_category_ord_enc = gen_features(
    columns=[[i] for i in categorical_features + boolean_features],
    classes=[{
        "class": SimpleImputer,
        "strategy": "most_frequent"
    }, {
        "class": OrdinalEncoder,
        "dtype": np.int8
    }])

gen_ordinal = gen_features(columns=[[i] for i in ordinal_features],
def map_features(features=[]):
    numerical_def = gen_features(columns=[[c] for c in features],
                                 classes=[{
                                     'class': StandardScaler
                                 }])
    return numerical_def
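
Finally, a hedged sketch of plugging the map_features output into a mapper (column names are hypothetical):

from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper, gen_features

mapper = DataFrameMapper(map_features(['height', 'weight']), df_out=True)
# mapper.fit_transform(df) would standardize the two numeric columns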