Example #1
def calculate_filter_f1(dataset, filter, injector, rate=0.1):
    # Reading dataset
    if dataset.endswith("json"):
        data = pd.read_json(DATASETS_PATH + dataset)
    elif dataset.endswith("arff"):
        data = arff_io.loadarff(DATASETS_PATH + dataset)
        data = pd.DataFrame(data[0])
    target = data["class"].values
    # Data preprocessing (type transformation)
    if target.dtype == object:
        le.fit(target)
        target = le.transform(target)
    attrs = data.drop("class", axis=1)
    if np.any(attrs.dtypes == object):
        ct = compose.ColumnTransformer(transformers=[("encoder", enc,
                                                      attrs.dtypes == object)],
                                       remainder="passthrough")
        attrs = ct.fit_transform(attrs)
    attrs = np.array(attrs)

    injector = injector(attrs, target, rate)
    injector.generate()

    # Instantiate the filter, then call it on the noisy labels; the returned
    # object exposes the indices flagged for removal (rem_indx).
    filter = filter()
    filter = filter(attrs, np.ravel(injector.labels.values))
    real_values = [
        1 if indx in injector.noise_indx else 0 for indx in range(len(target))
    ]
    pred_values = [
        1 if indx in filter.rem_indx else 0 for indx in range(len(target))
    ]
    return [
        dataset,
        metrics.f1_score(real_values, pred_values, average="micro")
    ]
Example #2
def _get_model(db, logger):
    """
    Create prediction model.

    The model is defined as a two-step pipeline:
     - one-hot encoder for city, hour, day_of_week and country features,
     - and a simple neural network for regression.

    :param gpudb.GPUdb db: Kinetica DB connection
    :rtype: (int, pipeline.Pipeline, int)
    """

    model_records = db.get_records_and_decode(
        table_name='prediction_model', offset=0, limit=1,
        options={'sort_by': 'created_on', 'sort_order': 'descending'})

    if len(model_records['records']) > 0:
        logger.info('Model found in DB')
        model = model_records['records'][0]
        classifier = pickle.loads(model['dump'])
        return model['model_id'], classifier, model['created_on']
    else:
        logger.info('No model found in the DB, creating new one from scratch')
        column_transformer = compose.ColumnTransformer([
            ('oh', preprocessing.OneHotEncoder(handle_unknown='ignore'), ['city', 'hour', 'day_of_week', 'country']),
            ('do_nothing', preprocessing.MinMaxScaler(), ['group_members', 'group_events'])
        ])
        classifier = neural_network.MLPRegressor(hidden_layer_sizes=(1500, 750, 375), max_iter=1000, shuffle=True)
        return 0, (column_transformer, classifier), None
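Note that in the cold-start branch the function returns the bare (column_transformer, classifier) tuple rather than the pipeline.Pipeline promised by the docstring, so the caller is presumably expected to assemble and fit the pipeline itself. A hedged sketch of that caller-side assembly (the step names are invented):

model_id, model, created_on = _get_model(db, logger)
if isinstance(model, tuple):
    column_transformer, classifier = model
    model = pipeline.Pipeline([('encode', column_transformer),
                               ('regress', classifier)])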
Example #3
    def __init__(self, num_features, cat_features):
        self.num_features = num_features
        self.cat_features = cat_features

        self.data = None
        self.fit_flag = False

        self.num_preprocessing = pipeline.Pipeline(steps=[
            ('num', impute.SimpleImputer(strategy='mean'))
        ])
        self.cat_preprocessing_for_catboost = pipeline.Pipeline(steps=[
            ('cat_impute', impute.SimpleImputer(strategy="constant"))
        ])

        # transformer that fills missing values and converts the numeric features
        self.features_for_catboost = compose.ColumnTransformer(transformers=[
            ('num_features', self.num_preprocessing, self.num_features),
            ('cat_features', self.cat_preprocessing_for_catboost, self.cat_features)
        ])

        # final pipeline for data preprocessing
        self.all_features = pipeline.Pipeline(steps=[
            ('feature', self.features_for_catboost),
            ('data', DataForCatboost(self.num_features, self.cat_features))
        ])
Example #4
def one_hot_encoder_column_transformer(columns):
    """
    transformer that stacks outputs of one-hot encoders for specified columns
    """
    return compose.ColumnTransformer([
        (col, preprocessing.OneHotEncoder(), [col]) for col in columns
    ])
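Since each column gets its own named OneHotEncoder, the resulting output features are prefixed per column. A small usage sketch (the DataFrame and its columns are invented):

import pandas as pd

df = pd.DataFrame({"city": ["a", "b", "a"], "tier": ["x", "x", "y"]})
ct = one_hot_encoder_column_transformer(["city", "tier"])
encoded = ct.fit_transform(df)     # one indicator column per category value
print(ct.get_feature_names_out())  # names prefixed per column (recent scikit-learn)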
Example #5
def icu_preprocessing(mfunc):
    return lambda **kwargs: pipeline.Pipeline([
        ('fillna',
         compose.ColumnTransformer([('nanstring',
                                     impute.SimpleImputer(strategy='constant',
                                                          fill_value='NaN'),
                                     ['admitdiagnosis'])],
                                   remainder='passthrough')),
        # The first ColumnTransformer returns a plain array and discards the
        # pandas column names, so the second transformer has to address the
        # imputed column by position 0.
        ('ohe',
         compose.ColumnTransformer([
             ('onehot',
              preprocessing.OneHotEncoder(sparse=False,
                                          handle_unknown='ignore'), [0])
         ],
                                   remainder='passthrough')),
        ('impute', impute.SimpleImputer()),
        ('scale', preprocessing.StandardScaler()),
        ('model', mfunc(**kwargs))
    ])
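icu_preprocessing wraps a model factory, so the returned lambda builds the whole impute/encode/scale pipeline around whatever estimator it is given. A usage sketch, assuming any scikit-learn classifier class as mfunc:

from sklearn import ensemble

make_model = icu_preprocessing(ensemble.RandomForestClassifier)
model = make_model(n_estimators=100)   # five-step pipeline ending in the forest
# model.fit(X_train, y_train)          # X must contain an 'admitdiagnosis' column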
Example #6
def preprocessor(num_feats, cat_feats):
    num_preprocessing = pipeline.Pipeline(
        steps=[('imputer', impute.SimpleImputer(strategy='median')),
               ('scaler', preprocessing.StandardScaler())])

    cat_preprocessing = pipeline.Pipeline(
        steps=[('imputer',
                impute.SimpleImputer(strategy='constant', fill_value='missing')),
               ('encoder', preprocessing.OrdinalEncoder())])

    return compose.ColumnTransformer(
        transformers=[('num', num_preprocessing, num_feats),
                      ('cat', cat_preprocessing, cat_feats)])
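A usage sketch: the returned ColumnTransformer is typically placed in front of an estimator (the feature lists here are invented):

from sklearn import ensemble, pipeline

model = pipeline.Pipeline(steps=[
    ('prep', preprocessor(['age', 'income'], ['city'])),
    ('clf', ensemble.GradientBoostingClassifier())
])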
Example #7
    def compose(self):
        inputs = self.space.get_inputs(self)
        assert all([
            isinstance(m, PipelineOutput) for m in inputs
        ]), 'The upstream module of `ColumnTransformer` must be `Pipeline`.'
        transformers = []
        next_module = None
        for p in inputs:
            # p.compose() also returns a pipeline name, but the transformer is
            # registered under the name stored on the module itself.
            next_module, (pipeline_name, transformer) = p.compose()
            transformers.append((p.pipeline_name, transformer, p.columns))

        pv = self.param_values
        ct = compose.ColumnTransformer(transformers, **pv)
        return next_module, (self.name, ct)
Example #8
def make_preprocessing_step(features, numeric_transformer,
                            categorical_transformer):

    nominal_features = features["nominal"]
    numeric_features = features["numeric"]

    step = ("PREP",
            compose.ColumnTransformer(transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, nominal_features)
            ]))
    return step
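Because make_preprocessing_step returns a ready (name, transformer) tuple, it can be dropped straight into a Pipeline step list. A sketch with invented transformers and feature dict:

from sklearn import linear_model, pipeline, preprocessing

features = {"numeric": ["age"], "nominal": ["country"]}
clf = pipeline.Pipeline([
    make_preprocessing_step(features,
                            preprocessing.StandardScaler(),
                            preprocessing.OneHotEncoder()),
    ("MODEL", linear_model.LogisticRegression()),
])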
Example #9
def generate_model(pred_vars,
                   log_transform=True,
                   one_hot_week=False,
                   method="lm"):
    """
    Generate the model for transforming and predicting.
    ...
    """
    assert method in ['lm',
                      'poisson'], "method must be one of 'lm' or 'poisson'"
    if log_transform:
        ft = preprocessing.FunctionTransformer(np.log)
    else:
        ft = preprocessing.FunctionTransformer()

    if one_hot_week:
        model_prep = compose.ColumnTransformer(
            [("onehot_categorical", preprocessing.OneHotEncoder(),
              ["week_num"]), ("num_scaler", ft, pred_vars)],
            remainder="drop",
        )
    else:
        model_prep = compose.ColumnTransformer(
            [("num_scaler", ft, pred_vars + ['ca_prop'])],
            remainder="drop",
        )
    if method == 'lm':
        pipe = pipeline.Pipeline([("preprocessor", model_prep),
                                  ("regressor",
                                   linear_model.LinearRegression())])
    elif method == 'poisson':
        pipe = pipeline.Pipeline([
            ("preprocessor", model_prep),
            ("regressor",
             linear_model.PoissonRegressor(alpha=1e-12, max_iter=10000))
        ])
    return pipe
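A usage sketch: with the defaults (log_transform=True, one_hot_week=False) the input frame must contain the predictor columns plus ca_prop, and all of them must be strictly positive because of the log transform. The data below is invented:

import pandas as pd

df = pd.DataFrame({"traffic": [10.0, 20.0, 15.0],
                   "ca_prop": [0.1, 0.3, 0.2]})
y = [12, 25, 16]
pipe = generate_model(["traffic"], method="poisson")
pipe.fit(df, y)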
Example #10
    def getTransformer(self, **params):
        # numvars = ["blood_pressure", "cholestoral", "max_heart_rate", "age"]
        # cateVars = ["cp", "thal"]
        # With every transformer commented out, the ColumnTransformer is a
        # pure passthrough and the pipeline only provides a tuning hook.
        ct = compose.ColumnTransformer(
            [
                # ("norm", preprocessing.StandardScaler(), self._getIndex(numvars)),
                # (
                #     "cate",
                #     preprocessing.OneHotEncoder(handle_unknown="ignore"),
                #     self.getIndex(cateVars),
                # ),
            ],
            remainder="passthrough",
        )
        transformer = pipeline.Pipeline([("norm", ct)])
        transformer.set_params(**params)
        return transformer
Example #11
def get_preprocess_pipeline(feature_columns, categorical_names,
                            numerical_names):
    """
    Creates the preprocessor used to process the data for training. 
    This will be combined with the estimator 
    
    Returns
    -------
       Preprocessor
    """

    numeric_transformer = pipeline.Pipeline([
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('scaler', preprocessing.StandardScaler()),
    ])

    categorical_transformer = pipeline.Pipeline([
        ('onehot',
         preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ])

    # NOTE: these assignments overwrite the function arguments with the
    # module-level metadata definitions, so the parameters are effectively
    # ignored.
    feature_columns = metadata.FEATURE_COLUMNS
    numerical_names = metadata.NUMERIC_FEATURES
    categorical_names = metadata.CATEGORICAL_FEATURES

    boolean_mask = functools.partial(utils.boolean_mask, feature_columns)
    numerical_boolean = boolean_mask(numerical_names)
    categorical_boolean = boolean_mask(categorical_names)

    transform_list = []

    if any(numerical_boolean):
        transform_list.extend([
            ('numeric', numeric_transformer, numerical_boolean),
        ])

    if any(categorical_boolean):
        transform_list.extend([
            ('categorical', categorical_transformer, categorical_boolean),
        ])

    preprocessor = compose.ColumnTransformer(transform_list)

    return preprocessor
Example #12
def _get_preprocessor(
    num_features: List[str], cat_features: List[str]
) -> compose.ColumnTransformer:

    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])

    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])

    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor
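A usage sketch of the preprocessor on its own. Note the numeric branch scales before imputing; that works because StandardScaler disregards NaNs during fit. The frame below is invented:

import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [20.0, 30.0, 40.0],
                   "city": ["a", np.nan, "b"]})
pre = _get_preprocessor(["age"], ["city"])
X = pre.fit_transform(df)  # scaled age + one-hot city with the first level dropped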
Example #13
def prepare_training_data(df_dict, target, remove, date_fields):
    df_dict_target = {}
    for df_name, df in df_dict.items():
        try:
            df_dict_target[df_name] = df[target]
        except KeyError:
            df_dict_target[df_name] = None
        df = remove_fields(add_date_features(df, date_col_list=date_fields),
                           target, remove)
        df['df_id'] = df_name
        df_dict[df_name] = df

    df_master = pd.concat(df_dict.values(), axis=0, sort=False)

    one_hot_list = []
    scaler_list = []
    for idx, col in enumerate(df_master.columns):
        d_type = type(df_master[col].iloc[0])
        if col not in ['df_id']:
            if d_type in [str]:
                df_master[col] = df_master[col].fillna('N/A')
                one_hot_list.append(idx)
            elif d_type not in [np.datetime64, pd.Timestamp]:
                df_master[col] = df_master[col].fillna(df_master[col].median())
                scaler_list.append(idx)

    for df_name, df in df_dict.items():
        df_dict[df_name] = df_master.loc[df_master['df_id'] == df_name].drop(
            'df_id', axis=1)

    df_master = df_master.drop('df_id', axis=1)

    # y is ignored by these transformers, so passing the target column name
    # here has no effect.
    ct = compose.ColumnTransformer(transformers=[
        ('one_hot_1', preprocessing.OneHotEncoder(sparse=False), one_hot_list),
        ('scaler_1', preprocessing.StandardScaler(), scaler_list)
    ],
                                   sparse_threshold=0).fit(df_master, y=target)

    for df_name, df in df_dict.items():
        df_dict[df_name] = xgb.DMatrix(data=ct.transform(df), missing=np.nan)
        if df_dict_target[df_name] is not None:
            df_dict[df_name].set_label(df_dict_target[df_name])
    return df_dict
Example #14
    def __init__(self, num_features, cat_features):
        self.num_features = num_features
        self.cat_features = cat_features

        self.data = None
        self.fit_flag = False

        # pipeline for numeric features
        self.num_preprocessing = pipeline.Pipeline(steps=[
            ('num', impute.SimpleImputer(strategy='mean')),  # or strategy='constant', fill_value=0
            ('num_scaler', preprocessing.StandardScaler())
        ])
        # pipeline for categorical features
        self.cat_preprocessing = pipeline.Pipeline(steps=[
            ('cat', impute.SimpleImputer(strategy='constant')),  # or 'most_frequent'
            ('cat_encoder', preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        # transformer that imputes NaNs and preprocesses the features
        self.data_preprocessing = compose.ColumnTransformer(transformers=[
            ('num_features', self.num_preprocessing, self.num_features),
            ('cat_features', self.cat_preprocessing, self.cat_features)
        ])
Example #15
def create_pipeline(num_feat, cat_feat, cfg):
    """
    Create and return the model classification pipeline with encoding and imputation of feature and model

    :param      num_feat: list
              numerical features name list

    :param      cat_feat:  list
              categorical features name list

    :param      cfg: class
              custom configuration class

    :return:  sklearn.pipeline.Pipeline
            model pipeline
    """

    cat_pipeline = make_pipeline(
        impute.SimpleImputer(strategy='constant', fill_value='NaN'),
        preprocessing.OneHotEncoder(categories='auto',
                                    handle_unknown='ignore'))

    pre_process_pipeline = make_pipeline(
        transformers.ColumnSelector(columns=cfg.features),
        compose.ColumnTransformer(transformers=[
            ('num_feat',
             impute.SimpleImputer(strategy='constant',
                                  fill_value=cfg.num_imputer), num_feat),
            ('cat_feat', cat_pipeline, cat_feat),
        ]),
    )

    model_pipeline = Pipeline(
        steps=[('preproc', pre_process_pipeline),
               ('xgb', xgb.XGBClassifier(objective='binary:logistic'))])

    return model_pipeline
Example #16
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer([
    ("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"),
     categorical_indicator)
])
X = transformer.fit_transform(X)
clf.fit(X, y)

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
task = openml.tasks.get_task(403)

# Build any classifier or pipeline
clf = tree.ExtraTreeClassifier()
Example #17
                                                       y,
                                                       test_size=0.33,
                                                       random_state=0)

    # Cross-validation.
    k = 3
    cvsplitter = ms.KFold(n_splits=k, shuffle=True, random_state=0)

    # Apply a transformation for each column.
    transformers = list()
    transformers.append(('StandardScaler', pp.StandardScaler(), idxnumerics))
    transformers.append(
        ('OneHotEncoder',
         pp.OneHotEncoder(sparse=False, drop='first',
                          handle_unknown='ignore'), idxnonnumerics))
    ct = sc.ColumnTransformer(transformers, remainder='passthrough')
    ct.fit(Xtrain)
    Xtrain_transformed = ct.transform(Xtrain)
    print('Feature Names: {0}'.format(ct.get_feature_names_out()))

    # Use the transformer in a pipeline.
    estimators = list()
    estimators.append(('ColumnTransformer',
                       sc.ColumnTransformer(transformers,
                                            remainder='passthrough')))
    estimators.append(('RandomForestClassifier',
                       ensemble.RandomForestClassifier(n_estimators=100,
                                                       max_features=3)))
    ppl = pl.Pipeline(estimators)
    accuracy = ms.cross_val_score(ppl, Xtrain, ytrain, cv=cvsplitter)
    print('Accuracy of pipeline: {0:.2f}'.format(accuracy.mean()))
Example #18
    "mean_oxygen", "std_oxygen", "kurtosis_oxygen", "skewness_oxygen"
]
glucose_attr = [
    "mean_glucose", "std_glucose", "kurtosis_glucose", "skewness_glucose"
]

vztahy_attr = ["relationship", "marital-status"]
work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"]
edu_attr = ["education", "education-num"]

impute_col_transf = compose.ColumnTransformer(transformers=[
    # IterativeImputer is experimental and requires
    # `from sklearn.experimental import enable_iterative_imputer` first.
    ("oxygen_n_glucose_impute",
     KeepDataFrame(impute.IterativeImputer(max_iter=50)),
     oxygen_attr + glucose_attr),
    ("vztahy_impute", CustomCatImputing(imputer_type="knn"), vztahy_attr),
    ("work_impute", CustomCatImputing(imputer_type="knn"), work_attr),
    ("edu_impute", CustomCatImputing(imputer_type="knn"), edu_attr),
    ("sex_impute",
     KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), ["sex"]),
    ("age_impute", KeepDataFrame(impute.SimpleImputer()), ["age"])
])

# this column transformer is used when we want to apply only a SimpleImputer
# across the whole dataset
most_freq_attr = ["sex"] + edu_attr + work_attr + vztahy_attr
mean_attr = ["age"] + oxygen_attr + glucose_attr

simple_impute_col_transf = compose.ColumnTransformer(transformers=[
    ("simple_impute_cat",
     KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")),
     most_freq_attr),
    ("simple_impute_num", KeepDataFrame(impute.SimpleImputer()), mean_attr)
])
Example #19
)
for rep in range(REPETITIONS):
    results = []
    for dataset in datasets:
        dataset_info = arff_io.loadarff(config["dataset"]["folder"] + dataset +
                                        ".arff")
        dataset_info = pd.DataFrame(dataset_info[0])
        target = dataset_info["class"].values
        # Data preprocessing (type transformation)
        if target.dtype == object:
            le.fit(target)
            target = le.transform(target)
        attrs_ = dataset_info.drop("class", axis=1)
        if np.any(attrs_.dtypes == object):
            ct = compose.ColumnTransformer(transformers=[
                ("encoder", enc, attrs_.dtypes == object)
            ],
                                           remainder="passthrough")
            attrs_ = ct.fit_transform(attrs_)
        try:
            attrs = attrs_.toarray()
        except AttributeError:
            attrs = np.array(attrs_)
        X_train, X_test, y_train, y_test = train_test_split(attrs,
                                                            target,
                                                            test_size=0.2)
        automl.fit(X_train, y_train, dataset_name=dataset)
        try:
            steps = automl.get_models_with_weights()[0][1].named_steps
            results.append({
                "dataset":
                dataset,
Example #20
]
x_train = train_users[vars]
x_val = test_users[vars]
y_train = train_users['target'].map({"Bus":2, "Car":2, "Still": 4, "Train":1, "Walking": 3})
y_val = test_users['target'].map({"Bus":2, "Car":2, "Still": 4, "Train":1, "Walking": 3})

##########################################################################################################################################

## Strategy for missing data - fill with 0
num_pipe_tree = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy="constant", fill_value=0)),
    #('Scaler', StandardScaler()),
])
tree_pipe = compose.ColumnTransformer(transformers=[
    #('cats', cat_pipe, cat_vars1),
    ('nums0', num_pipe_tree, vars),
    #('numsM', num_pipe_nanM, num_vars_nanM),
], remainder='drop')
tree_pipes = {model_name: pipeline.make_pipeline(tree_pipe, model) for model_name, model in tree_classifiers.items()}

results = pd.DataFrame({
    'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Adjusted Acc': [],
    'Adjusted Bal Acc': [], 'grouped Acc': [], 'Grouped Bal Acc': [],
    'Time': []
})

print(x_train.isna().sum())
for model_name, model in tree_pipes.items():
    print(f"Working on: {model_name}")
    print(model)
    start_time = time.time()
    model.fit(x_train, y_train)
    if model_name == "VotingClassifier":
        pickle.dump(model,open("SavedModels/2best_model.pickle", 'wb'))
    pred = model.predict(x_val)
Example #21
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

# build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

# build the complete pipeline with feature selection and ML algorithms
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector',
     feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
pipeline_grid = {
    'preprocess__num__imputer__strategy': ['mean', 'median'],
    'pca__n_components': [0.90, 0.95],
    'regressor__n_neighbors': list(range(5, 15))
Example #22
]
num_features = list(df.drop(cat_features + targets, axis=1).columns)

# The distribution of Y1 and Y2 are both normal. Split data randomly.
train, test = enrich.split_train_test_rand(df, 0.2, 123)

# Define pipelines
numeric_transformer = pipe.Pipeline(steps=[('scaler', skp.StandardScaler())])

categorical_transformer = pipe.Pipeline(
    steps=[('onehot',
            ce.OneHotEncoder(cols=cat_features, drop_invariant=True))])

# Create full transformation, including both pipelines
full_transformer = compose.ColumnTransformer([
    ("cat", categorical_transformer, cat_features),
    ("num", numeric_transformer, num_features)
])

# Prepare the data by fitting the full pipeline to the training data, and transforming it
# N.B. You must cast this back to DataFrame, because the return value is of type numpy array
oh_names = [
    "CatVar0_1", "CatVar0_2", "CatVar0_3", "CatVar0_4", "CatVar1_1",
    "CatVar1_2", "CatVar1_3", "CatVar1_4", "CatVar2_1", "CatVar2_2",
    "CatVar3_1", "CatVar3_2", "CatVar3_3", "CatVar4_1", "CatVar4_2",
    "CatVar4_3", "CatVar5_1", "CatVar5_2", "CatVar5_3", "CatVar5_4",
    "CatVar6_1", "CatVar6_2", "CatVar6_3", "CatVar6_4", "CatVar7_1",
    "CatVar7_2", "CatVar7_3"
]

# drop targets prior to transformation with pipeline
x_train = train.drop(targets, axis=1)
Example #23
    Returns preprocessing pipeline adapted to specified numerical
    and categorical features
    """

    num_transformer = pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("impute", impute.KNNImputer(n_neighbors=10)),
    ])

    cat_transformer = pipeline.Pipeline([
        ("impute", impute.SimpleImputer(strategy="constant", fill_value="missing")),
        ("encode", preprocessing.OneHotEncoder(drop="first")),
    ])

    preprocessor = compose.ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ])
    return preprocessor

def get_lr_model(
    num_features: List[str], cat_features: List[str], C: float = 1.0
) -> pipeline.Pipeline:
    """
    Returns the full pipeline for a logistic regression model with the
    specified numerical and categorical features.
    """

    model = pipeline.Pipeline([
        ("pre", _get_preprocessor(num_features, cat_features)),
        ("model", multioutput.MultiOutputClassifier(
                    linear_model.LogisticRegression(penalty="l1", C=C, solver="saga")
Example #24
    def fit(self):

        if self.fit_first:
            self.full_X = np.concatenate(self.collect_X,
                                         axis=0).astype(np.float32)
            self.full_y = np.concatenate(self.collect_y).astype(np.float32)

            self.collect_X = []
            self.collect_y = []
            gc.collect()

            self.full_X_val = np.concatenate(self.collect_X_val,
                                             axis=0).astype(np.float32)
            self.full_y_val = np.concatenate(self.collect_y_val).astype(
                np.float32)

            self.collect_X_val = []
            self.collect_y_val = []
            gc.collect()
            self.fit_first = False

            # For MLP or logistic regression, impute, scale and encode the matrices first
            if self.logreg_mode or self.mlp_mode:
                self.imputer_1 = skimpute.SimpleImputer(strategy="mean")
                self.imputer_2 = skimpute.SimpleImputer(strategy="mean")
                self.imputer_3 = skimpute.SimpleImputer(strategy="mean")
                self.imputer_cat_1 = skimpute.SimpleImputer(
                    strategy="most_frequent")
                self.imputer_cat_2 = skimpute.SimpleImputer(
                    strategy="most_frequent")
                self.scaler_1 = skpproc.StandardScaler()
                self.scaler_2 = skpproc.StandardScaler()
                self.scaler_3 = skpproc.StandardScaler()
                self.cat_encoder_1 = skpproc.OneHotEncoder(
                    categories="auto", sparse=False, handle_unknown="ignore")
                self.cat_encoder_2 = skpproc.OneHotEncoder(
                    categories="auto", sparse=False, handle_unknown="ignore")

                # Gather indices of the categorical columns
                cat_col_idxs = []
                print("Number of cat cols: {}".format(len(self.X_cat_cols)))
                for cat_colname in self.X_cat_cols:
                    cat_col_idxs.append(self.X_col_names.index(cat_colname))

                assert (self.full_X.shape[1] == 500)
                sorted_cat_cols = list(sorted(cat_col_idxs))

                if len(sorted_cat_cols) == 0:
                    self.composed_imputer = self.imputer_1
                    self.composed_scaler_encoder = self.scaler_1

                # Both branches below assume exactly two categorical columns,
                # either adjacent or separated by continuous columns.
                elif sorted_cat_cols[1] - sorted_cat_cols[0] == 1:
                    lidx = sorted_cat_cols[0]
                    ridx = sorted_cat_cols[1]
                    self.composed_imputer = skcompose.ColumnTransformer(
                        [("cont_impute_1", self.imputer_1, np.arange(lidx)),
                         ("cat_impute_1", self.imputer_cat_1, [lidx, ridx]),
                         ("cont_impute_2", self.imputer_2,
                          np.arange(ridx + 1, self.full_X.shape[1]))],
                        sparse_threshold=0)
                    self.composed_scaler_encoder = skcompose.ColumnTransformer(
                        [("cont_scale_1", self.scaler_1, np.arange(lidx)),
                         ("cat_encode_1", self.cat_encoder_1, [lidx, ridx]),
                         ("cont_scale_3", self.scaler_2,
                          np.arange(ridx + 1, self.full_X.shape[1]))],
                        sparse_threshold=0)

                else:
                    lidx = sorted_cat_cols[0]
                    ridx = sorted_cat_cols[1]
                    self.composed_imputer = skcompose.ColumnTransformer(
                        [("cont_impute_1", self.imputer_1, np.arange(lidx)),
                         ("cat_impute_1", self.imputer_cat_1, [lidx]),
                         ("cont_impute_2", self.imputer_2,
                          np.arange(lidx + 1, ridx)),
                         ("cat_impute_2", self.imputer_cat_2, [ridx]),
                         ("cont_impute_3", self.imputer_3,
                          np.arange(ridx + 1, self.full_X.shape[1]))],
                        sparse_threshold=0)

                    self.composed_scaler_encoder = skcompose.ColumnTransformer(
                        [("cont_scale_1", self.scaler_1, np.arange(lidx)),
                         ("cat_encode_1", self.cat_encoder_1, [lidx]),
                         ("cont_scale_2", self.scaler_2,
                          np.arange(lidx + 1, ridx)),
                         ("cat_encode_2", self.cat_encoder_2, [ridx]),
                         ("cont_scale_3", self.scaler_3,
                          np.arange(ridx + 1, self.full_X.shape[1]))],
                        sparse_threshold=0)

                self.full_X[~np.isfinite(self.full_X)] = np.nan
                self.full_X_val[~np.isfinite(self.full_X_val)] = np.nan
                self.full_X = self.composed_imputer.fit_transform(self.full_X)
                self.full_X = self.composed_scaler_encoder.fit_transform(
                    self.full_X)
                self.full_X_val = self.composed_imputer.transform(
                    self.full_X_val)
                self.full_X_val = self.composed_scaler_encoder.transform(
                    self.full_X_val)

        if self.verbose:
            print("Training matrix dimension: {}x{}".format(
                self.full_X.shape[0], self.full_X.shape[1]),
                  flush=True)
            print("Validation matrix dimension: {}x{}".format(
                self.full_X_val.shape[0], self.full_X_val.shape[1]),
                  flush=True)

        if self.univariate_test:
            Fstat, _ = skfselect.f_classif(self.full_X, self.full_y)
            sort_idx = list(np.argsort(Fstat))[::-1]

            with open("./features_F_scores.tsv", 'w') as fp:
                print("feat_name\tFscore", file=fp)
                for jdx in sort_idx:
                    print("{}\t{}".format(self.X_col_names[jdx], Fstat[jdx]),
                          file=fp)

            sys.exit(0)

        if self.select_features_forward:
            n_vars_to_select = 21
            selected_vars = []
            search_vars = [
                136, 146, 5, "RelDatetime", "Age", 1, 41, 42, 43, 44, 13, 28,
                172, 174, 176, 4, 62, 3, 20, 87, 23
            ]
            assert (len(search_vars) == n_vars_to_select)

            while len(selected_vars) < n_vars_to_select:
                best_score_round = -np.inf
                best_vid_round = None

                for idx, vid in enumerate(search_vars):
                    probe_set = selected_vars + [vid]
                    selected_idxs = self.derived_feature_set(probe_set)
                    der_X = self.full_X[:, selected_idxs]
                    der_X_val = self.full_X_val[:, selected_idxs]
                    derived_names = [
                        self.X_col_names[jdx] for jdx in selected_idxs
                    ]
                    derived_cat_cols = list(
                        set(derived_names).intersection(set(self.X_cat_cols)))
                    try:
                        self.ml_model.fit(der_X,
                                          self.full_y,
                                          feature_name=derived_names,
                                          eval_set=[(der_X_val,
                                                     self.full_y_val)],
                                          early_stopping_rounds=20,
                                          verbose=False,
                                          categorical_feature=derived_cat_cols,
                                          eval_metric=custom_auprc_metric)
                    except Exception:
                        print("Degenerate variable set: Skipping...")
                        continue

                    metrics = self.get_validation_scores(
                        red_idxs=selected_idxs)
                    current_auprc = metrics["auprc"]

                    if current_auprc > best_score_round:
                        best_vid_round = vid
                        best_score_round = current_auprc

                selected_vars.append(best_vid_round)
                search_vars.remove(best_vid_round)
                print("Feature selection round {}/30 DONE".format(
                    len(selected_vars)))
                print("Added variable {}, New score AUPRC={:.3f}".format(
                    best_vid_round, best_score_round))

            print("Feature selection finalized...")
            sys.exit(0)

        if self.select_features_backward:
            search_vars = [
                136, 146, 60, 5, 41, 42, 43, 44, 39, 40, 45, 66, 12, 152, 20,
                72, 15, 64, 65, 160, 1, 168, 135, 61, 14, "PatGroup", "Age",
                "Height", "Surgical", "RelDatetime"
            ]
            selected_vars = search_vars.copy()

            while len(selected_vars) > 0:
                best_score_round = -np.inf
                best_vid_round = None

                for idx, vid in enumerate(selected_vars):
                    probe_set = selected_vars.copy()
                    probe_set.remove(vid)
                    selected_idxs = self.derived_feature_set(probe_set)
                    der_X = self.full_X[:, selected_idxs]
                    der_X_val = self.full_X_val[:, selected_idxs]
                    derived_names = [
                        self.X_col_names[jdx] for jdx in selected_idxs
                    ]
                    derived_cat_cols = list(
                        set(derived_names).intersection(set(self.X_cat_cols)))
                    try:
                        self.ml_model.fit(der_X,
                                          self.full_y,
                                          feature_name=derived_names,
                                          eval_set=[(der_X_val,
                                                     self.full_y_val)],
                                          early_stopping_rounds=10,
                                          verbose=False,
                                          categorical_feature=derived_cat_cols,
                                          eval_metric="auc")
                    except Exception:
                        print("Degenerate variable set: Skipping...")
                        continue

                    metrics = self.get_validation_scores(
                        red_idxs=selected_idxs)
                    current_auprc = metrics["auprc"]

                    if current_auprc > best_score_round:
                        best_vid_round = vid
                        best_score_round = current_auprc

                selected_vars.remove(best_vid_round)
                print("Feature selection round {}/30 DONE".format(
                    len(search_vars) - len(selected_vars)))
                print("Removed variable {}, New score AUPRC={:.3f}".format(
                    best_vid_round, best_score_round))

            print("Feature selection finalized...")
            sys.exit(0)

        if not (self.use_xgboost or self.use_catboost or self.decision_tree_mode
                or self.logreg_mode or self.mlp_mode):
            self.ml_model.set_params(**{"metric": 'None'})

        cat_idxs = []
        for cidx, feat_name in enumerate(self.X_col_names):
            if feat_name in self.X_cat_cols:
                cat_idxs.append(cidx)

        if self.use_xgboost:
            self.ml_model.fit(self.full_X,
                              self.full_y,
                              eval_set=[(self.full_X_val, self.full_y_val)],
                              eval_metric="logloss",
                              early_stopping_rounds=50,
                              verbose=False)
        elif self.use_catboost:
            catboost_X = pd.DataFrame(self.full_X, columns=self.X_col_names)
            catboost_Xval = pd.DataFrame(self.full_X_val,
                                         columns=self.X_col_names)
            for cat_col in self.X_cat_cols:
                catboost_X[cat_col] = catboost_X[cat_col].astype(str)
                catboost_Xval[cat_col] = catboost_Xval[cat_col].astype(str)
            self.ml_model.fit(catboost_X,
                              self.full_y,
                              eval_set=[(catboost_Xval, self.full_y_val)],
                              cat_features=cat_idxs,
                              silent=True,
                              early_stopping_rounds=50)
        else:

            if self.decision_tree_mode:
                self.ml_model.fit(self.full_X,
                                  self.full_y,
                                  feature_name=self.X_col_names,
                                  categorical_feature=self.X_cat_cols,
                                  eval_set=[(self.full_X_val, self.full_y_val)
                                            ],
                                  eval_metric=custom_auprc_metric,
                                  verbose=False)
            elif self.logreg_mode:
                self.ml_model.fit(self.full_X, self.full_y)
            elif self.mlp_mode:
                self.ml_model.fit(self.full_X, self.full_y)
            else:
                self.ml_model.fit(self.full_X,
                                  self.full_y,
                                  feature_name=self.X_col_names,
                                  categorical_feature=self.X_cat_cols,
                                  eval_set=[(self.full_X_val, self.full_y_val)
                                            ],
                                  eval_metric=custom_auprc_metric,
                                  early_stopping_rounds=50,
                                  verbose=False)
Example #25
cat_mult_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('One Hot', OneHotEncoder(handle_unknown="ignore")),
    ])

num_mult_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    #('Scaler', StandardScaler()),
    #('Quantile Transform', QuantileTransformer(n_quantiles=100, output_distribution='normal')),
    ('Yeo-Johnson', PowerTransformer(method='yeo-johnson')),
    #('Box-Cox', PowerTransformer(method='box-cox')),  # all values must be greater than 0
    #('Scaler', StandardScaler()),
    ])

mult_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_mult_pipeline, num_vars),
    ('cat', cat_mult_pipeline, cat_vars),
    ], remainder='drop')

#### PIPELINES FOR TREES
num_tree_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean')),
    ])

cat_tree_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)),
    ])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_tree_pipeline, num_vars),
    ('cat', cat_tree_pipeline, cat_vars),
Example #26
# Build pipelines for categorical data and numeric data
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant',
                                     fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
])

num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('poly', preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])

preprocessing_pipe = compose.ColumnTransformer([("cat", cat_pipe, cat_columns),
                                                ("num", num_pipe, num_columns)
                                                ])

# Build estimator pipeline
estimator_pipe = pipeline.Pipeline([("preprocessing", preprocessing_pipe),
                                    ("est",
                                     linear_model.ElasticNet(random_state=1))])

# Parameter grid to tune hyper parameters
param_grid = {
    "est__alpha": 0.0 + np.random.random(10) * 0.02,
    "est__l1_ratio": np.linspace(0.0001, 1, 20),
}

# Grid Search estimator
gsearch = model_selection.GridSearchCV(estimator_pipe,
Example #27
    lencoder.fit(X_train[i])
    X_train[i] = lencoder.transform(X_train[i])

cat_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer(strategy="most_frequent")),
    # LabelEncoder operates on the target, not on feature columns, so it
    # cannot be used as a pipeline step here.
    #('label', preprocessing.LabelEncoder())
])
#transformed_data=cat_feature_pipeline.fit_transform(X_train[['ENRL_CERT_NBR']])
num_feature_pipeline = pipeline.Pipeline([
    ('imputation', impute.SimpleImputer()),
    ('standardscalar', preprocessing.StandardScaler())
])

#transformed_data=num_feature_pipeline.fit_transform(X_train[['TOT_BLNG_AMT']])
feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)

features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector',
      feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)

classifier = tree.DecisionTreeClassifier()
#build complete pipeline with feature selection and ml algorithms
complete_pipeline = PMMLPipeline([('preprocess', feature_preprocessing),
                                  ('zv_filter',
                                   feature_selection.VarianceThreshold()),
                                  ('features', features_pipeline),
                                  ('tree', classifier)])
Example #28
def factorization_machine_column_transformer(columns):
    return compose.ColumnTransformer([
        (col, preprocessing.OneHotEncoder(), [col]) for col in columns
    ])
Example #29
    def preprocess(self):
        """
        Preprocesses the data according to specified demands and for the classifiers
        :return: None
        """
        # Display current operation
        # print(" Reading csv, dropping excluded columns, movie duplicates and rows with na values...")

        # import csv
        data = pd.read_csv(self.file, delimiter=',')

        # save all Attributes excluding content_Rating, movie_imdb_link, plot_keywords
        data.drop(
            columns=['content_rating', 'movie_imdb_link', 'plot_keywords'],
            inplace=True)

        # discard entries with any NaN value
        data.dropna(inplace=True)

        # Handle duplicate movie_title values
        data.drop_duplicates(subset='movie_title', keep='first', inplace=True)

        # As movie title is now unique we can discard it
        data.drop(columns=['movie_title'], inplace=True)

        # Preserve the original row index as a feature
        data['index1'] = data.index

        # Save imdb_score as the labels and drop it from the data
        self.y = data.pop('imdb_score')

        # Display current operation
        # print(" Turning genres column and the 3 actors to dummy variables...")

        # Turn into dummy variables and discard original column from data
        genres = data.pop('genres').str.get_dummies()

        # Merge the 3 actors into one column & delete original columns from data & Turn into dummy variables
        actors = (data.pop('actor_1_name') + "|" + data.pop('actor_2_name') +
                  "|" + data.pop('actor_3_name')).str.get_dummies()

        # Create column lists for transformer
        numerical_cols = data.select_dtypes(include='number').columns
        category_cols = data.select_dtypes(exclude='number').columns

        # Convert numerical columns int64 to float64
        data[numerical_cols] = data[numerical_cols].astype('float64')

        # After creating the column lists - joins back the dummy-variable actors and genres
        data = data.join(actors)
        data = data.join(genres)

        # Display current operation
        # print(" Applying Standard Scaler to numerical columns and OneHotEncoder for remaining categorical columns...")

        preprocessor = compose.ColumnTransformer(transformers=[
            ('num', preprocessing.StandardScaler(), numerical_cols),
            ('cat', preprocessing.OneHotEncoder(), category_cols)
        ],
                                                 remainder="passthrough")

        self.X = preprocessor.fit_transform(data)

        # Display current operation
        # print(" Binarizing Labels...")

        # all labels lower than 7 become 0, 7 and higher become 1
        self.y = preprocessing.Binarizer(GOODMOVIETHRESHOLD).fit_transform(
            self.y.to_numpy().reshape(-1, 1))
        self.y = np.ravel(self.y)
Example #30
print('pipeline start')

train_file_a = os.path.join(str(get_project_root()), "experiments",
                            "user_interviews", "adult_simple_train_a.csv")
raw_data_a = pd.read_csv(train_file_a, na_values='?', index_col=0)

train_file_b = os.path.join(str(get_project_root()), "experiments",
                            "user_interviews", "adult_simple_train_b.csv")
raw_data_b = pd.read_csv(train_file_b, na_values='?', index_col=0)

merged_raw_data = raw_data_a.merge(raw_data_b, on="id")

data = merged_raw_data.dropna()

labels = preprocessing.label_binarize(data['income-per-year'],
                                      classes=['>50K', '<=50K'])

column_transformer = compose.ColumnTransformer(
    transformers=[('categorical',
                   preprocessing.OneHotEncoder(handle_unknown='ignore'),
                   ['education', 'workclass']),
                  ('numeric', preprocessing.StandardScaler(),
                   ['age', 'hours-per-week'])])
adult_income_pipeline = pipeline.Pipeline([('features', column_transformer),
                                           ('classifier',
                                            tree.DecisionTreeClassifier())])

adult_income_pipeline.fit(data, labels)
print('pipeline finished')