def calculate_filter_f1(dataset, filter, injector, rate=0.1):
    # Read the dataset (JSON or ARFF); DATASETS_PATH, le (LabelEncoder) and
    # enc (categorical encoder) are module-level objects.
    if dataset.endswith("json"):
        data = pd.read_json(DATASETS_PATH + dataset)
    elif dataset.endswith("arff"):
        data = arff_io.loadarff(DATASETS_PATH + dataset)
        data = pd.DataFrame(data[0])
    target = data["class"].values
    # Data preprocessing (type transformation)
    if target.dtype == object:
        le.fit(target)
        target = le.transform(target)
    attrs = data.drop("class", axis=1)
    if np.any(attrs.dtypes == object):
        ct = compose.ColumnTransformer(
            transformers=[("encoder", enc, attrs.dtypes == object)],
            remainder="passthrough")
        attrs = ct.fit_transform(attrs)
    attrs = np.array(attrs)
    # Inject label noise, then run the filter on the noisy labels.
    injector = injector(attrs, target, rate)
    injector.generate()
    filter = filter()
    filter = filter(attrs, np.ravel(injector.labels.values))
    real_values = [1 if indx in injector.noise_indx else 0
                   for indx in range(len(target))]
    pred_values = [1 if indx in filter.rem_indx else 0
                   for indx in range(len(target))]
    return [dataset,
            metrics.f1_score(real_values, pred_values, average="micro")]
def _get_model(db, logger):
    """
    Create prediction model.

    The model is defined as a two-step pipeline:

    - one-hot encoder for city, hour, day_of_week and country features,
    - and a simple neural network for regression.

    :param gpudb.GPUdb db: Kinetica DB connection
    :rtype: (int, pipeline.Pipeline, int)
    """
    model_records = db.get_records_and_decode(
        table_name='prediction_model',
        offset=0,
        limit=1,
        options={'sort_by': 'created_on', 'sort_order': 'descending'})
    if len(model_records['records']) > 0:
        logger.info('Model found in DB')
        model = model_records['records'][0]
        classifier = pickle.loads(model['dump'])
        return model['model_id'], classifier, model['created_on']
    else:
        logger.info('No model found in the DB, creating new one from scratch')
        column_transformer = compose.ColumnTransformer([
            ('oh', preprocessing.OneHotEncoder(handle_unknown='ignore'),
             ['city', 'hour', 'day_of_week', 'country']),
            ('scale', preprocessing.MinMaxScaler(),
             ['group_members', 'group_events'])
        ])
        classifier = neural_network.MLPRegressor(
            hidden_layer_sizes=(1500, 750, 375), max_iter=1000, shuffle=True)
        # Wrap both steps into a single Pipeline so the fresh model has the
        # same shape as a model unpickled from the DB (see :rtype: above).
        return 0, pipeline.Pipeline([('prep', column_transformer),
                                     ('mlp', classifier)]), None
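# Hedged usage sketch for _get_model (left commented since it needs a live
# Kinetica connection; `db`, `logger`, `events_df` and `durations` are
# hypothetical names standing in for the surrounding application's objects):
# model_id, model, created_on = _get_model(db, logger)
# model.fit(events_df, durations)        # one-hot + MLP regression pipeline
# predictions = model.predict(events_df)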
def __init__(self, num_features, cat_features):
    self.num_features = num_features
    self.cat_features = cat_features
    self.data = None
    self.fit_flag = False
    self.num_preprocessing = pipeline.Pipeline(steps=[
        ('num', impute.SimpleImputer(strategy='mean'))
    ])
    self.cat_preprocessing_for_catboost = pipeline.Pipeline(steps=[
        ('cat_impute', impute.SimpleImputer(strategy="constant"))
    ])
    # Transformer that fills missing values and transforms numeric features.
    self.features_for_catboost = compose.ColumnTransformer(transformers=[
        ('num_features', self.num_preprocessing, self.num_features),
        ('cat_features', self.cat_preprocessing_for_catboost, self.cat_features)
    ])
    # Final pipeline for data preprocessing.
    self.all_features = pipeline.Pipeline(steps=[
        ('feature', self.features_for_catboost),
        ('data', DataForCatboost(self.num_features, self.cat_features))
    ])
def one_hot_encoder_column_transformer(columns):
    """Transformer that stacks outputs of one-hot encoders for specified columns."""
    return compose.ColumnTransformer([
        (col, preprocessing.OneHotEncoder(), [col]) for col in columns
    ])
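# Hedged usage sketch for the per-column one-hot transformer above; the toy
# DataFrame and its column names are illustrative only.
import pandas as pd

_toy = pd.DataFrame({"color": ["red", "blue", "red"],
                     "size": ["S", "M", "L"]})
_ct = one_hot_encoder_column_transformer(["color", "size"])
print(_ct.fit_transform(_toy).shape)  # (3, 5): 2 color + 3 size categories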
def icu_preprocessing(mfunc):
    return lambda **kwargs: pipeline.Pipeline([
        ('fillna', compose.ColumnTransformer(
            [('nanstring',
              impute.SimpleImputer(strategy='constant', fill_value='NaN'),
              ['admitdiagnosis'])],
            remainder='passthrough')),
        # Have to hackily address the column as index 0 in the second
        # transformer because ColumnTransformer throws out the pandas info.
        ('ohe', compose.ColumnTransformer(
            [('onehot',
              preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'),
              [0])],
            remainder='passthrough')),
        ('impute', impute.SimpleImputer()),
        ('scale', preprocessing.StandardScaler()),
        ('model', mfunc(**kwargs))
    ])
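# Hedged usage sketch: icu_preprocessing wraps any estimator factory; a random
# forest stands in here for the ICU models (illustrative only). Note that
# OneHotEncoder(sparse=...) assumes scikit-learn < 1.4, where the argument had
# not yet been renamed to sparse_output.
from sklearn import ensemble

make_rf = icu_preprocessing(ensemble.RandomForestClassifier)
rf_pipe = make_rf(n_estimators=100)  # kwargs are forwarded to the 'model' step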
def preprocessor(num_feats, cat_feats):
    num_preprocessing = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('scaler', preprocessing.StandardScaler())
    ])
    cat_preprocessing = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant',
                                         fill_value='missing')),
        ('encoder', preprocessing.OrdinalEncoder())
    ])
    return compose.ColumnTransformer(
        transformers=[('num', num_preprocessing, num_feats),
                      ('cat', cat_preprocessing, cat_feats)])
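# Hedged usage sketch on a toy frame (column names are illustrative only):
import pandas as pd

_toy_df = pd.DataFrame({"age": [25.0, None, 40.0],
                        "city": ["NY", None, "LA"]})
print(preprocessor(num_feats=["age"],
                   cat_feats=["city"]).fit_transform(_toy_df))
# -> median-imputed, scaled age next to the ordinal-encoded city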
def compose(self):
    inputs = self.space.get_inputs(self)
    assert all(isinstance(m, PipelineOutput) for m in inputs), \
        'The upstream module of `ColumnTransformer` must be `Pipeline`.'
    transformers = []
    next_module = None
    for p in inputs:
        next_module, (_, transformer) = p.compose()
        transformers.append((p.pipeline_name, transformer, p.columns))
    pv = self.param_values
    ct = compose.ColumnTransformer(transformers, **pv)
    return next_module, (self.name, ct)
def make_preprocessing_step(features, numeric_transformer, categorical_transformer):
    nominal_features = features["nominal"]
    numeric_features = features["numeric"]
    step = ("PREP", compose.ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, nominal_features)
    ]))
    return step
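# Hedged sketch of how the step above is typically consumed; the feature dict,
# transformers and final estimator are illustrative only.
from sklearn import ensemble, impute, pipeline, preprocessing

_features = {"numeric": ["age"], "nominal": ["city"]}
_prep = make_preprocessing_step(
    _features,
    numeric_transformer=impute.SimpleImputer(strategy="median"),
    categorical_transformer=preprocessing.OneHotEncoder(handle_unknown="ignore"))
_model = pipeline.Pipeline([_prep, ("CLF", ensemble.RandomForestClassifier())])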
def generate_model(pred_vars, log_transform=True, one_hot_week=False, method="lm"):
    """
    Generate the model for transforming and predicting.
    ...
    """
    assert method in ['lm', 'poisson'], "method must be one of 'lm' or 'poisson'"
    if log_transform:
        ft = preprocessing.FunctionTransformer(np.log)
    else:
        ft = preprocessing.FunctionTransformer()
    if one_hot_week:
        model_prep = compose.ColumnTransformer(
            [("onehot_categorical", preprocessing.OneHotEncoder(), ["week_num"]),
             ("num_scaler", ft, pred_vars)],
            remainder="drop",
        )
    else:
        model_prep = compose.ColumnTransformer(
            [("num_scaler", ft, pred_vars + ['ca_prop'])],
            remainder="drop",
        )
    if method == 'lm':
        pipe = pipeline.Pipeline([("preprocessor", model_prep),
                                  ("regressor", linear_model.LinearRegression())])
    elif method == 'poisson':
        pipe = pipeline.Pipeline([
            ("preprocessor", model_prep),
            ("regressor", linear_model.PoissonRegressor(alpha=1e-12,
                                                        max_iter=10000))
        ])
    return pipe
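# Hedged usage sketch (the predictor names are illustrative; note that the
# non-one-hot branch appends 'ca_prop' to the scaled columns):
lm_pipe = generate_model(["temperature", "rainfall"], method="lm")
pois_pipe = generate_model(["temperature", "rainfall"], one_hot_week=True,
                           method="poisson")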
def getTransformer(self, **params):
    # numvars = ["blood_pressure", "cholestoral", "max_heart_rate", "age"]
    # cateVars = ["cp", "thal"]
    ct = compose.ColumnTransformer(
        [
            # ("norm", preprocessing.StandardScaler(), self._getIndex(numvars)),
            # ("cate",
            #  preprocessing.OneHotEncoder(handle_unknown="ignore"),
            #  self._getIndex(cateVars)),
        ],
        remainder="passthrough",
    )
    transformer = pipeline.Pipeline([("norm", ct)])
    transformer.set_params(**params)
    return transformer
def get_preprocess_pipeline(feature_columns, categorical_names, numerical_names):
    """
    Creates the preprocessor used to process the data for training.
    This will be combined with the estimator.

    Returns
    -------
    Preprocessor
    """
    numeric_transformer = pipeline.Pipeline([
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('scaler', preprocessing.StandardScaler()),
    ])
    categorical_transformer = pipeline.Pipeline([
        ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore',
                                               sparse=False)),
    ])
    # Boolean masks selecting the numeric / categorical columns out of the
    # full feature list (callers typically pass metadata.FEATURE_COLUMNS,
    # metadata.CATEGORICAL_FEATURES and metadata.NUMERIC_FEATURES).
    boolean_mask = functools.partial(utils.boolean_mask, feature_columns)
    numerical_boolean = boolean_mask(numerical_names)
    categorical_boolean = boolean_mask(categorical_names)
    transform_list = []
    if any(numerical_boolean):
        transform_list.append(('numeric', numeric_transformer,
                               numerical_boolean))
    if any(categorical_boolean):
        transform_list.append(('categorical', categorical_transformer,
                               categorical_boolean))
    preprocessor = compose.ColumnTransformer(transform_list)
    return preprocessor
def _get_preprocessor(
    num_features: List[str], cat_features: List[str]
) -> compose.ColumnTransformer:
    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])
    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant",
                                        fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])
    preprocessor = compose.ColumnTransformer(
        [("num", num_transformer, num_features),
         ("cat", cat_transformer, cat_features)])
    return preprocessor
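# Hedged usage sketch on a toy frame (column names are illustrative only):
import pandas as pd

_demo_df = pd.DataFrame({"income": [30.0, None, 55.0],
                         "region": ["N", "S", None]})
print(_get_preprocessor(["income"], ["region"]).fit_transform(_demo_df))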
def prepare_training_data(df_dict, target, remove, date_fields):
    df_dict_target = {}
    for df_name, df in df_dict.items():
        try:
            df_dict_target[df_name] = df[target]
        except KeyError:
            df_dict_target[df_name] = None
        df = remove_fields(add_date_features(df, date_col_list=date_fields),
                           target, remove)
        df['df_id'] = df_name
        df_dict[df_name] = df
    # Stack all frames so encoders/scalers are fitted on the combined data.
    df_master = pd.concat(df_dict.values(), axis=0, sort=False)
    one_hot_list = []
    scaler_list = []
    for idx, col in enumerate(df_master.columns):
        d_type = type(df_master[col].iloc[0])
        if col not in ['df_id']:
            if d_type in [str]:
                df_master[col] = df_master[col].fillna('N/A')
                one_hot_list.append(idx)
            elif d_type not in [np.datetime64, pd.Timestamp]:
                df_master[col] = df_master[col].fillna(df_master[col].median())
                scaler_list.append(idx)
    for df_name, df in df_dict.items():
        df_dict[df_name] = df_master.loc[df_master['df_id'] == df_name].drop(
            'df_id', axis=1)
    df_master = df_master.drop('df_id', axis=1)
    ct = compose.ColumnTransformer(transformers=[
        ('one_hot_1', preprocessing.OneHotEncoder(sparse=False), one_hot_list),
        ('scaler_1', preprocessing.StandardScaler(), scaler_list)
    ], sparse_threshold=0).fit(df_master)
    # Transform each frame and wrap it as an XGBoost DMatrix.
    for df_name, df in df_dict.items():
        df_dict[df_name] = xgb.DMatrix(data=ct.transform(df), missing=np.nan)
        if df_dict_target[df_name] is not None:
            df_dict[df_name].set_label(df_dict_target[df_name])
    return df_dict
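# Hedged usage sketch (left commented: `train_df`/`valid_df` are hypothetical
# DataFrames, and remove_fields/add_date_features are this module's helpers):
# dmats = prepare_training_data({"train": train_df, "valid": valid_df},
#                               target="price", remove=["id"],
#                               date_fields=["date"])
# booster = xgb.train({"objective": "reg:squarederror"}, dmats["train"],
#                     evals=[(dmats["valid"], "valid")])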
def __init__(self, num_features, cat_features):
    self.num_features = num_features
    self.cat_features = cat_features
    self.data = None
    self.fit_flag = False
    # Pipeline for numeric features.
    self.num_preprocessing = pipeline.Pipeline(steps=[
        ('num', impute.SimpleImputer(strategy='mean')),  # or strategy='constant', fill_value=0
        ('num_scaler', preprocessing.StandardScaler())
    ])
    # Pipeline for categorical features.
    self.cat_preprocessing = pipeline.Pipeline(steps=[
        ('cat', impute.SimpleImputer(strategy='constant')),  # or 'most_frequent'
        ('cat_encoder', preprocessing.OneHotEncoder(handle_unknown='ignore',
                                                    sparse=False))
    ])
    # Transformer that imputes NaNs and preprocesses both feature groups.
    self.data_preprocessing = compose.ColumnTransformer(transformers=[
        ('num_features', self.num_preprocessing, self.num_features),
        ('cat_features', self.cat_preprocessing, self.cat_features)
    ])
def create_pipeline(num_feat, cat_feat, cfg):
    """
    Create and return the model classification pipeline with feature
    encoding/imputation and the model itself.

    :param num_feat: list, numerical feature names
    :param cat_feat: list, categorical feature names
    :param cfg: class, custom configuration class
    :return: sklearn.pipeline.Pipeline, model pipeline
    """
    cat_pipeline = make_pipeline(
        impute.SimpleImputer(strategy='constant', fill_value='NaN'),
        preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore'))
    pre_process_pipeline = make_pipeline(
        transformers.ColumnSelector(columns=cfg.features),
        compose.ColumnTransformer(transformers=[
            ('num_feat',
             impute.SimpleImputer(strategy='constant',
                                  fill_value=cfg.num_imputer),
             num_feat),
            ('cat_feat', cat_pipeline, cat_feat),
        ]),
    )
    pipeline = Pipeline(
        steps=[('preproc', pre_process_pipeline),
               ('xgb', xgb.XGBClassifier(objective='binary:logistic'))])
    return pipeline
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer([
    ("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"),
     categorical_indicator)
])
X = transformer.fit_transform(X)
clf.fit(X, y)

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
task = openml.tasks.get_task(403)

# Build any classifier or pipeline
clf = tree.ExtraTreeClassifier()
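# Hedged continuation sketch: openml-python can evaluate the classifier on the
# task's predefined splits (publishing the run would additionally need an API
# key; see the OpenML docs for openml.runs.run_model_on_task).
run = openml.runs.run_model_on_task(clf, task)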
Xtrain, Xtest, ytrain, ytest = ms.train_test_split(X, y, test_size=0.33,
                                                   random_state=0)

# Cross-validation.
k = 3
cvsplitter = ms.KFold(n_splits=k, shuffle=True, random_state=0)

# Apply a transformation for each column.
transformers = list()
transformers.append(('StandardScaler', pp.StandardScaler(), idxnumerics))
transformers.append(
    ('OneHotEncoder',
     pp.OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore'),
     idxnonnumerics))
ct = sc.ColumnTransformer(transformers, remainder='passthrough')
ct.fit(Xtrain)
Xtrain_transformed = ct.transform(Xtrain)
print('Feature Names: {0}'.format(ct.get_feature_names_out()))

# Use the transformer in a pipeline.
estimators = list()
estimators.append(('ColumnTransformer',
                   sc.ColumnTransformer(transformers, remainder='passthrough')))
estimators.append(('RandomForestClassifier',
                   ensemble.RandomForestClassifier(n_estimators=100,
                                                   max_features=3)))
ppl = pl.Pipeline(estimators)
accuracy = ms.cross_val_score(ppl, Xtrain, ytrain, cv=cvsplitter)
print('Accuracy of pipeline: {0:.2f}'.format(accuracy.mean()))
"mean_oxygen", "std_oxygen", "kurtosis_oxygen", "skewness_oxygen" ] glucose_attr = [ "mean_glucose", "std_glucose", "kurtosis_glucose", "skewness_glucose" ] vztahy_attr = ["relationship", "marital-status"] work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"] edu_attr = ["education", "education-num"] impute_col_transf = compose.ColumnTransformer(transformers=[ ("oxygen_n_glucose_impute", KeepDataFrame(impute.IterativeImputer(max_iter=50)), oxygen_attr + glucose_attr ), ("vztahy_impute", CustomCatImputing(imputer_type="knn"), vztahy_attr), ("work_impute", CustomCatImputing(imputer_type="knn"), work_attr), ("edu_impute", CustomCatImputing(imputer_type="knn"), edu_attr), ("sex_impute", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), ["sex"]), ("age_impute", KeepDataFrame(impute.SimpleImputer()), ["age"]) ]) #tento column transformer sa bude pouzivat v pripade, kedy chceme pouzit v ramci celeho datasetu cisto len simpleimputer most_freq_attr = ["sex"] + edu_attr + work_attr + vztahy_attr mean_attr = ["age"] + oxygen_attr + glucose_attr simple_impute_col_transf = compose.ColumnTransformer(transformers=[( "simple_impute_cat", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), most_freq_attr ), ("simple_impute_num", KeepDataFrame(impute.SimpleImputer()), mean_attr)])
)

for rep in range(REPETITIONS):
    results = []
    for dataset in datasets:
        dataset_info = arff_io.loadarff(config["dataset"]["folder"] + dataset +
                                        ".arff")
        dataset_info = pd.DataFrame(dataset_info[0])
        target = dataset_info["class"].values
        # Data preprocessing (type transformation)
        if target.dtype == object:
            le.fit(target)
            target = le.transform(target)
        attrs_ = dataset_info.drop("class", axis=1)
        if np.any(attrs_.dtypes == object):
            ct = compose.ColumnTransformer(transformers=[
                ("encoder", enc, attrs_.dtypes == object)
            ], remainder="passthrough")
            attrs_ = ct.fit_transform(attrs_)
        try:
            attrs = attrs_.toarray()
        except AttributeError:
            attrs = np.array(attrs_)
        X_train, X_test, y_train, y_test = train_test_split(attrs, target,
                                                            test_size=0.2)
        automl.fit(X_train, y_train, dataset_name=dataset)
        try:
            steps = automl.get_models_with_weights()[0][1].named_steps
            results.append({
                "dataset": dataset,
]

x_train = train_users[vars]
x_val = test_users[vars]
y_train = train_users['target'].map({"Bus": 2, "Car": 2, "Still": 4,
                                     "Train": 1, "Walking": 3})
y_val = test_users['target'].map({"Bus": 2, "Car": 2, "Still": 4,
                                  "Train": 1, "Walking": 3})

##########################################################################
# Strategy for missing data - fill with 0
num_pipe_tree = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy="constant", fill_value=0)),
    # ('Scaler', StandardScaler()),
])
tree_pipe = compose.ColumnTransformer(
    transformers=[
        # ('cats', cat_pipe, cat_vars1),
        ('nums0', num_pipe_tree, vars)],
    # ('numsM', num_pipe_nanM, num_vars_nanM)],
    remainder='drop')
tree_pipes = {model_name: pipeline.make_pipeline(tree_pipe, model)
              for model_name, model in tree_classifiers.items()}

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [],
                        "Adjusted Acc": [], "Adjusted Bal Acc": [],
                        "grouped Acc": [], "Grouped Bal Acc": [], 'Time': []})
print(x_train.isna().sum())
for model_name, model in tree_pipes.items():
    print(f"Working on: {model_name}")
    print(model)
    start_time = time.time()
    model.fit(x_train, y_train)
    if model_name == "VotingClassifier":
        pickle.dump(model, open("SavedModels/2best_model.pickle", 'wb'))
    pred = model.predict(x_val)
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

# Build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)
preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

# Build complete pipeline with feature selection and ML algorithms
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector',
     feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
pipeline_grid = {
    'preprocess__num__imputer__strategy': ['mean', 'median'],
    'pca__n_components': [0.90, 0.95],
    'regressor__n_neighbors': list(range(5, 15))
]
num_features = list(df.drop(cat_features + targets, axis=1).columns)

# The distributions of Y1 and Y2 are both normal. Split data randomly.
train, test = enrich.split_train_test_rand(df, 0.2, 123)

# Define pipelines
numeric_transformer = pipe.Pipeline(steps=[('scaler', skp.StandardScaler())])
categorical_transformer = pipe.Pipeline(
    steps=[('onehot', ce.OneHotEncoder(cols=cat_features,
                                       drop_invariant=True))])

# Create full transformation, including both pipelines
full_transformer = compose.ColumnTransformer([
    ("cat", categorical_transformer, cat_features),
    ("num", numeric_transformer, num_features)
])

# Prepare the data by fitting the full pipeline to the training data, and
# transforming it.
# N.B. You must cast the result back to a DataFrame, because the return value
# is a numpy array.
oh_names = [
    "CatVar0_1", "CatVar0_2", "CatVar0_3", "CatVar0_4", "CatVar1_1",
    "CatVar1_2", "CatVar1_3", "CatVar1_4", "CatVar2_1", "CatVar2_2",
    "CatVar3_1", "CatVar3_2", "CatVar3_3", "CatVar4_1", "CatVar4_2",
    "CatVar4_3", "CatVar5_1", "CatVar5_2", "CatVar5_3", "CatVar5_4",
    "CatVar6_1", "CatVar6_2", "CatVar6_3", "CatVar6_4", "CatVar7_1",
    "CatVar7_2", "CatVar7_3"
]

# Drop targets prior to transformation with the pipeline
x_train = train.drop(targets, axis=1)
def _get_preprocessor(
    num_features: List[str], cat_features: List[str]
) -> compose.ColumnTransformer:
    """
    Returns preprocessing pipeline adapted to specified numerical and
    categorical features.
    """
    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])
    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant",
                                        fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])
    preprocessor = compose.ColumnTransformer(
        [("num", num_transformer, num_features),
         ("cat", cat_transformer, cat_features)])
    return preprocessor


def get_lr_model(
    num_features: List[str], cat_features: List[str], C: float = 1.0
) -> pipeline.Pipeline:
    """
    Returns full pipeline for a logistic regression model with specified
    numerical and categorical features.
    """
    model = pipeline.Pipeline([
        ("pre", _get_preprocessor(num_features, cat_features)),
        ("model", multioutput.MultiOutputClassifier(
            linear_model.LogisticRegression(penalty="l1", C=C, solver="saga")
def fit(self):
    if self.fit_first:
        self.full_X = np.concatenate(self.collect_X, axis=0).astype(np.float32)
        self.full_y = np.concatenate(self.collect_y).astype(np.float32)
        self.collect_X = []
        self.collect_y = []
        gc.collect()
        self.full_X_val = np.concatenate(self.collect_X_val,
                                         axis=0).astype(np.float32)
        self.full_y_val = np.concatenate(self.collect_y_val).astype(np.float32)
        self.collect_X_val = []
        self.collect_y_val = []
        gc.collect()
        self.fit_first = False

        # In case we are MLP or LogReg, transform the matrices.
        if self.logreg_mode or self.mlp_mode:
            self.imputer_1 = skimpute.SimpleImputer(strategy="mean")
            self.imputer_2 = skimpute.SimpleImputer(strategy="mean")
            self.imputer_3 = skimpute.SimpleImputer(strategy="mean")
            self.imputer_cat_1 = skimpute.SimpleImputer(strategy="most_frequent")
            self.imputer_cat_2 = skimpute.SimpleImputer(strategy="most_frequent")
            self.scaler_1 = skpproc.StandardScaler()
            self.scaler_2 = skpproc.StandardScaler()
            self.scaler_3 = skpproc.StandardScaler()
            self.cat_encoder_1 = skpproc.OneHotEncoder(
                categories="auto", sparse=False, handle_unknown="ignore")
            self.cat_encoder_2 = skpproc.OneHotEncoder(
                categories="auto", sparse=False, handle_unknown="ignore")

            # Gather indices of the categorical columns.
            cat_col_idxs = []
            print("Number of cat cols: {}".format(len(self.X_cat_cols)))
            for cat_colname in self.X_cat_cols:
                cat_col_idxs.append(self.X_col_names.index(cat_colname))
            assert self.full_X.shape[1] == 500
            sorted_cat_cols = list(sorted(cat_col_idxs))

            if len(sorted_cat_cols) == 0:
                self.composed_imputer = self.imputer_1
                self.composed_scaler_encoder = self.scaler_1
            elif sorted_cat_cols[1] - sorted_cat_cols[0] == 1:
                # The two categorical columns are adjacent.
                lidx = sorted_cat_cols[0]
                ridx = sorted_cat_cols[1]
                self.composed_imputer = skcompose.ColumnTransformer(
                    [("cont_impute_1", self.imputer_1, np.arange(lidx)),
                     ("cat_impute_1", self.imputer_cat_1, [lidx, ridx]),
                     ("cont_impute_2", self.imputer_2,
                      np.arange(ridx + 1, self.full_X.shape[1]))],
                    sparse_threshold=0)
                self.composed_scaler_encoder = skcompose.ColumnTransformer(
                    [("cont_scale_1", self.scaler_1, np.arange(lidx)),
                     ("cat_encode_1", self.cat_encoder_1, [lidx, ridx]),
                     ("cont_scale_3", self.scaler_2,
                      np.arange(ridx + 1, self.full_X.shape[1]))],
                    sparse_threshold=0)
            else:
                # The two categorical columns are separated by continuous ones.
                lidx = sorted_cat_cols[0]
                ridx = sorted_cat_cols[1]
                self.composed_imputer = skcompose.ColumnTransformer(
                    [("cont_impute_1", self.imputer_1, np.arange(lidx)),
                     ("cat_impute_1", self.imputer_cat_1, [lidx]),
                     ("cont_impute_2", self.imputer_2,
                      np.arange(lidx + 1, ridx)),
                     ("cat_impute_2", self.imputer_cat_2, [ridx]),
                     ("cont_impute_3", self.imputer_3,
                      np.arange(ridx + 1, self.full_X.shape[1]))],
                    sparse_threshold=0)
                self.composed_scaler_encoder = skcompose.ColumnTransformer(
                    [("cont_scale_1", self.scaler_1, np.arange(lidx)),
                     ("cat_encode_1", self.cat_encoder_1, [lidx]),
                     ("cont_scale_2", self.scaler_2, np.arange(lidx + 1, ridx)),
                     ("cat_encode_2", self.cat_encoder_2, [ridx]),
                     ("cont_scale_3", self.scaler_3,
                      np.arange(ridx + 1, self.full_X.shape[1]))],
                    sparse_threshold=0)

            self.full_X[~np.isfinite(self.full_X)] = np.nan
            self.full_X_val[~np.isfinite(self.full_X_val)] = np.nan
            self.full_X = self.composed_imputer.fit_transform(self.full_X)
            self.full_X = self.composed_scaler_encoder.fit_transform(self.full_X)
            self.full_X_val = self.composed_imputer.transform(self.full_X_val)
            self.full_X_val = self.composed_scaler_encoder.transform(
                self.full_X_val)

        if self.verbose:
            print("Training matrix dimension: {}x{}".format(
                self.full_X.shape[0], self.full_X.shape[1]), flush=True)
            print("Validation matrix dimension: {}x{}".format(
                self.full_X_val.shape[0], self.full_X_val.shape[1]), flush=True)

    if self.univariate_test:
        Fstat, _ = skfselect.f_classif(self.full_X, self.full_y)
        sort_idx = list(np.argsort(Fstat))[::-1]
        with open("./features_F_scores.tsv", 'w') as fp:
            print("feat_name\tFscore", file=fp)
            for jdx in sort_idx:
                print("{}\t{}".format(self.X_col_names[jdx], Fstat[jdx]),
                      file=fp)
        sys.exit(0)

    if self.select_features_forward:
        # Greedy forward selection over a fixed candidate set.
        n_vars_to_select = 21
        selected_vars = []
        search_vars = [
            136, 146, 5, "RelDatetime", "Age", 1, 41, 42, 43, 44, 13, 28, 172,
            174, 176, 4, 62, 3, 20, 87, 23
        ]
        assert len(search_vars) == n_vars_to_select
        while len(selected_vars) < n_vars_to_select:
            best_score_round = -np.inf
            best_vid_round = None
            for idx, vid in enumerate(search_vars):
                probe_set = selected_vars + [vid]
                selected_idxs = self.derived_feature_set(probe_set)
                der_X = self.full_X[:, selected_idxs]
                der_X_val = self.full_X_val[:, selected_idxs]
                derived_names = [self.X_col_names[jdx] for jdx in selected_idxs]
                derived_cat_cols = list(
                    set(derived_names).intersection(set(self.X_cat_cols)))
                try:
                    self.ml_model.fit(der_X, self.full_y,
                                      feature_name=derived_names,
                                      eval_set=[(der_X_val, self.full_y_val)],
                                      early_stopping_rounds=20,
                                      verbose=False,
                                      categorical_feature=derived_cat_cols,
                                      eval_metric=custom_auprc_metric)
                except Exception:
                    print("Degenerate variable set: Skipping...")
                    continue
                metrics = self.get_validation_scores(red_idxs=selected_idxs)
                current_auprc = metrics["auprc"]
                if current_auprc > best_score_round:
                    best_vid_round = vid
                    best_score_round = current_auprc
            selected_vars.append(best_vid_round)
            search_vars.remove(best_vid_round)
            print("Feature selection round {}/{} DONE".format(
                len(selected_vars), n_vars_to_select))
            print("Added variable {}, New score AUPRC={:.3f}".format(
                best_vid_round, best_score_round))
        print("Feature selection finalized...")
        sys.exit(0)

    if self.select_features_backward:
        # Greedy backward elimination from the full candidate set.
        search_vars = [
            136, 146, 60, 5, 41, 42, 43, 44, 39, 40, 45, 66, 12, 152, 20, 72,
            15, 64, 65, 160, 1, 168, 135, 61, 14, "PatGroup", "Age", "Height",
            "Surgical", "RelDatetime"
        ]
        selected_vars = search_vars.copy()
        while len(selected_vars) > 0:
            best_score_round = -np.inf
            best_vid_round = None
            for idx, vid in enumerate(selected_vars):
                probe_set = selected_vars.copy()
                probe_set.remove(vid)
                selected_idxs = self.derived_feature_set(probe_set)
                der_X = self.full_X[:, selected_idxs]
                der_X_val = self.full_X_val[:, selected_idxs]
                derived_names = [self.X_col_names[jdx] for jdx in selected_idxs]
                derived_cat_cols = list(
                    set(derived_names).intersection(set(self.X_cat_cols)))
                try:
                    self.ml_model.fit(der_X, self.full_y,
                                      feature_name=derived_names,
                                      eval_set=[(der_X_val, self.full_y_val)],
                                      early_stopping_rounds=10,
                                      verbose=False,
                                      categorical_feature=derived_cat_cols,
                                      eval_metric="auc")
                except Exception:
                    print("Degenerate variable set: Skipping...")
                    continue
                metrics = self.get_validation_scores(red_idxs=selected_idxs)
                current_auprc = metrics["auprc"]
                if current_auprc > best_score_round:
                    best_vid_round = vid
                    best_score_round = current_auprc
            selected_vars.remove(best_vid_round)
            print("Feature selection round {}/{} DONE".format(
                len(search_vars) - len(selected_vars), len(search_vars)))
            print("Removed variable {}, New score AUPRC={:.3f}".format(
                best_vid_round, best_score_round))
        print("Feature selection finalized...")
        sys.exit(0)

    if (not self.use_xgboost and not self.use_catboost
            and not self.decision_tree_mode and not self.logreg_mode
            and not self.mlp_mode):
        self.ml_model.set_params(**{"metric": 'None'})

    cat_idxs = []
    for cidx, feat_name in enumerate(self.X_col_names):
        if feat_name in self.X_cat_cols:
            cat_idxs.append(cidx)

    if self.use_xgboost:
        self.ml_model.fit(self.full_X, self.full_y,
                          eval_set=[(self.full_X_val, self.full_y_val)],
                          eval_metric="logloss",
                          early_stopping_rounds=50,
                          verbose=False)
    elif self.use_catboost:
        catboost_X = pd.DataFrame(self.full_X, columns=self.X_col_names)
        catboost_Xval = pd.DataFrame(self.full_X_val, columns=self.X_col_names)
        for cat_col in self.X_cat_cols:
            catboost_X[cat_col] = catboost_X[cat_col].astype(str)
            catboost_Xval[cat_col] = catboost_Xval[cat_col].astype(str)
        self.ml_model.fit(catboost_X, self.full_y,
                          eval_set=[(catboost_Xval, self.full_y_val)],
                          cat_features=cat_idxs,
                          silent=True,
                          early_stopping_rounds=50)
    else:
        if self.decision_tree_mode:
            self.ml_model.fit(self.full_X, self.full_y,
                              feature_name=self.X_col_names,
                              categorical_feature=self.X_cat_cols,
                              eval_set=[(self.full_X_val, self.full_y_val)],
                              eval_metric=custom_auprc_metric,
                              verbose=False)
        elif self.logreg_mode:
            self.ml_model.fit(self.full_X, self.full_y)
        elif self.mlp_mode:
            self.ml_model.fit(self.full_X, self.full_y)
        else:
            self.ml_model.fit(self.full_X, self.full_y,
                              feature_name=self.X_col_names,
                              categorical_feature=self.X_cat_cols,
                              eval_set=[(self.full_X_val, self.full_y_val)],
                              eval_metric=custom_auprc_metric,
                              early_stopping_rounds=50,
                              verbose=False)
cat_mult_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant',
                                     fill_value='missing')),
    ('One Hot', OneHotEncoder(handle_unknown="ignore")),
])
num_mult_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    # ('Scaler', StandardScaler()),
    # ('Quantile Transform', QuantileTransformer(n_quantiles=100,
    #                                            output_distribution='normal')),
    ('Yeo-Johnson', PowerTransformer(method='yeo-johnson')),
    # ('Box-Cox', PowerTransformer(method='box-cox')),  # all values must be > 0
    # ('Scaler', StandardScaler()),
])
mult_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_mult_pipeline, num_vars),
    ('cat', cat_mult_pipeline, cat_vars),
], remainder='drop')

#### PIPELINES FOR TREES
num_tree_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean')),
])
cat_tree_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant',
                                     fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown="use_encoded_value",
                               unknown_value=np.nan)),
])
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_tree_pipeline, num_vars),
    ('cat', cat_tree_pipeline, cat_vars),
# Build pipelines for categorical data and numeric data
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant',
                                     fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
])
num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('poly', preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])
preprocessing_pipe = compose.ColumnTransformer([("cat", cat_pipe, cat_columns),
                                                ("num", num_pipe, num_columns)])

# Build estimator pipeline
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.ElasticNet(random_state=1))])

# Parameter grid to tune hyper parameters
param_grid = {
    "est__alpha": 0.0 + np.random.random(10) * 0.02,
    "est__l1_ratio": np.linspace(0.0001, 1, 20),
}

# Grid Search estimator
gsearch = model_selection.GridSearchCV(estimator_pipe,
lencoder.fit(X_train[i])
X_train[i] = lencoder.transform(X_train[i])

cat_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer(strategy="most_frequent")),
    # ('label', preprocessing.LabelEncoder())
])
# transformed_data = cat_feature_pipeline.fit_transform(X_train[['ENRL_CERT_NBR']])

num_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer()),
    ('standardscalar', preprocessing.StandardScaler())
])
# transformed_data = num_feature_pipeline.fit_transform(X_train[['TOT_BLNG_AMT']])

feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)
features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector',
      feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)
classifier = tree.DecisionTreeClassifier()

# Build complete pipeline with feature selection and ML algorithms
complete_pipeline = PMMLPipeline([
    ('preprocess', feature_preprocessing),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tree', classifier)])
def factorization_machine_column_transformer(columns):
    return compose.ColumnTransformer([
        (col, preprocessing.OneHotEncoder(), [col]) for col in columns
    ])
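# Hedged design note: a single OneHotEncoder over all `columns` would produce
# the same matrix, but the per-column layout above names each encoder after
# its column, keeping hyper-parameters addressable per column. The column
# names below are hypothetical:
_fm_ct = factorization_machine_column_transformer(["user_id", "item_id"])
_fm_ct.set_params(user_id__handle_unknown="ignore")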
def preprocess(self):
    """
    Preprocesses the data according to specified demands and for the classifiers
    :return: None
    """
    # Display current operation
    # print(" Reading csv, dropping excluded columns, movie duplicates and rows with na values...")
    # Import csv
    data = pd.read_csv(self.file, delimiter=',')
    # Keep all attributes excluding content_rating, movie_imdb_link and plot_keywords
    data.drop(columns=['content_rating', 'movie_imdb_link', 'plot_keywords'],
              inplace=True)
    # Discard entries with any NaN value
    data.dropna(inplace=True)
    # Handle duplicate movie_title values
    data.drop_duplicates(subset='movie_title', keep='first', inplace=True)
    # As movie title is now unique we can discard it
    data.drop(columns=['movie_title'], inplace=True)
    # Utilize the fact that data is not normally distributed
    data['index1'] = data.index
    # Save imdb score as labels & discard the label from data
    self.y = data.pop('imdb_score')
    # Display current operation
    # print(" Turning genres column and the 3 actors to dummy variables...")
    # Turn into dummy variables and discard the original column from data
    genres = data.pop('genres').str.get_dummies()
    # Merge the 3 actors into one column, delete the original columns from
    # data, and turn the result into dummy variables
    actors = (data.pop('actor_1_name') + "|" + data.pop('actor_2_name') + "|" +
              data.pop('actor_3_name')).str.get_dummies()
    # Create column lists for the transformer
    numerical_cols = data.select_dtypes(include='number').columns
    category_cols = data.select_dtypes(exclude='number').columns
    # Convert numerical columns from int64 to float64
    data[numerical_cols] = data[numerical_cols].astype('float64')
    # After creating the column lists, join back the dummy-variable actors and genres
    data = data.join(actors)
    data = data.join(genres)
    # Display current operation
    # print(" Applying Standard Scaler to numerical columns and OneHotEncoder for remaining categorical columns...")
    preprocessor = compose.ColumnTransformer(transformers=[
        ('num', preprocessing.StandardScaler(), numerical_cols),
        ('cat', preprocessing.OneHotEncoder(), category_cols)
    ], remainder="passthrough")
    self.X = preprocessor.fit_transform(data)
    # Display current operation
    # print(" Binarizing Labels...")
    # All labels lower than 7 become 0, 7 and higher become 1
    self.y = preprocessing.Binarizer(threshold=GOODMOVIETHRESHOLD).fit_transform(
        self.y.to_numpy().reshape(-1, 1))
    self.y = np.ravel(self.y)
print('pipeline start')

train_file_a = os.path.join(str(get_project_root()), "experiments",
                            "user_interviews", "adult_simple_train_a.csv")
raw_data_a = pd.read_csv(train_file_a, na_values='?', index_col=0)
train_file_b = os.path.join(str(get_project_root()), "experiments",
                            "user_interviews", "adult_simple_train_b.csv")
raw_data_b = pd.read_csv(train_file_b, na_values='?', index_col=0)
merged_raw_data = raw_data_a.merge(raw_data_b, on="id")

data = merged_raw_data.dropna()
labels = preprocessing.label_binarize(data['income-per-year'],
                                      classes=['>50K', '<=50K'])

column_transformer = compose.ColumnTransformer(transformers=[
    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'),
     ['education', 'workclass']),
    ('numeric', preprocessing.StandardScaler(),
     ['age', 'hours-per-week'])])
adult_income_pipeline = pipeline.Pipeline([
    ('features', column_transformer),
    ('classifier', tree.DecisionTreeClassifier())])

adult_income_pipeline.fit(data, labels)
print('pipeline finished')
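# Hedged follow-up sketch: a quick sanity check of the fitted pipeline on its
# own training frame (training accuracy only, not a validation score).
print(adult_income_pipeline.score(data, labels))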