Example #1
import pandas as pd  # used for pd.unique below; createPipeline and getMetrics are project-level helpers assumed to be in scope

def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type):
    pred_y, true_y = [], []
    for train_index, test_index in cv_method.split(data_x):
        train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
        train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]
        clf = createPipeline("DT", oversampler_type)
        clf.fit(train_x, train_y.values.ravel())
        pred_y = pred_y + clf.predict(test_x).ravel().tolist()
        pred_y_prob = pred_y  # no separate class probabilities are collected; the predicted labels double as the score placeholder
        true_y = true_y + test_y.values.ravel().tolist()
    return getMetrics(pred_y, pred_y_prob, true_y)
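# getMetrics is a project-level helper not shown in this example. A minimal sketch of what
# such a helper might compute from the collected predictions, assuming a binary target and
# scikit-learn's metrics module (the name, arguments and metric choice are assumptions,
# not the project's actual implementation):
def getMetricsSketch(pred_y, pred_y_prob, true_y):
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    metrics = {
        "accuracy": accuracy_score(true_y, pred_y),
        "precision": precision_score(true_y, pred_y, zero_division=0),
        "recall": recall_score(true_y, pred_y, zero_division=0),
        "f1": f1_score(true_y, pred_y, zero_division=0),
    }
    try:
        # ROC AUC needs scores or probabilities and fails if true_y contains a single class
        metrics["auc"] = roc_auc_score(true_y, pred_y_prob)
    except ValueError:
        metrics["auc"] = None
    return metrics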
def baselineMetricsOfDTWithNonsensorFeatures(cv_method, data_x, data_y):
    pred_y, true_y = [], []
    for train_index, test_index in cv_method.split(data_x):
        train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
        train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]
        unique_y = pd.unique(train_y["target"])
        if len(unique_y) == 1:
            pred_y = pred_y + unique_y.tolist() * test_y.shape[0]  # repeat the single observed label once per test row; multiplying the ndarray itself would scale the label value instead
        else:
            if min(train_y["target"].value_counts()) >= 6:
                oversampler_type = "SVMSMOTE"
            else:
                oversampler_type = "RandomOverSampler"
            clf = createPipeline("DT", oversampler_type)
            clf.fit(train_x, train_y.values.ravel())
            pred_y = pred_y + clf.predict(test_x).ravel().tolist()
        pred_y_proba = pred_y  # as above, predicted labels stand in for class probabilities
        true_y = true_y + test_y.values.ravel().tolist()
    return getMetrics(pred_y, pred_y_proba, true_y)
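Both helpers above delegate model construction to a project-specific createPipeline factory that is not part of this example. A minimal sketch of what such a factory could return, assuming imbalanced-learn's Pipeline so that over-sampling happens only on the training portion of each fold (the function name, arguments and defaults are assumptions, not the project's actual code):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE, RandomOverSampler
from sklearn.tree import DecisionTreeClassifier

def createPipelineSketch(model, oversampler_type, feature_selector=None):
    # choose the over-sampler requested by the caller
    if oversampler_type == "SVMSMOTE":
        sampler = SVMSMOTE(random_state=0)
    else:
        sampler = RandomOverSampler(random_state=0)
    # only the decision tree ("DT") case used in this example is sketched
    if model != "DT":
        raise ValueError("Unsupported model: " + model)
    steps = [("sampler", sampler)]
    if feature_selector is not None:
        steps.append(("selector", feature_selector))
    steps.append(("clf", DecisionTreeClassifier(random_state=0)))
    # imblearn's Pipeline resamples during fit only, so test folds are never resampled
    return Pipeline(steps)

Keeping the over-sampler inside the pipeline is what makes the cross-validation valid: each call to clf.fit resamples only the training split, and the metrics are computed on untouched test rows.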
Example #3
# for numerical features: calculate variance across all days
# for categorical features: calculate mode (most frequent value) across all days
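# A hypothetical sketch of what such a summariseFeatures helper could do, assuming the daily
# feature matrix has a participant id column ("pid", an assumed name) and one row per day; for
# simplicity it takes explicit column lists, whereas the helper called below receives operator
# lists and a variance threshold (everything here is an assumption, not the project's code):
def summariseFeaturesSketch(daily_features, numerical_cols, categorical_cols, group_col="pid"):
    import pandas as pd
    grouped = daily_features.groupby(group_col)
    # numerical features: variance across all of a participant's days
    numerical_summary = grouped[numerical_cols].var()
    # categorical features: mode (most frequent value) across all days
    categorical_summary = grouped[categorical_cols].agg(
        lambda col: col.mode().iloc[0] if not col.mode().empty else None)
    return pd.concat([numerical_summary, categorical_summary], axis=1)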
if summarised == "summarised":
    features = summariseFeatures(features, numerical_operators,
                                 categorical_operators, cols_var_threshold)

categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(
    categorical_operators, features)

data = pd.concat([features, demographic_features, targets],
                 axis=1,
                 join="inner")
data_x, data_y = data.drop("target", axis=1), data[["target"]]

# Step 3. Create pipeline
pipeline = createPipeline(model)
cv_class = globals()[cv_method]
inner_cv = cv_class()
outer_cv = cv_class()
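# Note on the globals() lookup above: it resolves the cross-validation splitter class from its
# name (e.g. "LeaveOneOut" or "KFold", presumably supplied by the workflow configuration), which
# only works if that class has already been imported at module level, for example via
# "from sklearn.model_selection import LeaveOneOut, KFold". globals()[cv_method] returns the
# class object, and cv_class() instantiates it once for the inner and once for the outer loop.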

# Step 4. Nested cross validation
fold_id, pid, best_params, true_y, pred_y, pred_y_prob = [], [], [], [], [], []
feature_importances_all_folds = pd.DataFrame()
fold_count = 1

# Outer cross validation
for train_index, test_index in outer_cv.split(data_x):

    # Split train and test, numerical and categorical features
    train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
    train_numerical_features, train_categorical_features = splitNumericalCategoricalFeatures(
        train_x, categorical_feature_colnames)  # arguments assumed; the original call is cut off in this excerpt
    train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]  # assumed from the parallel examples; train_y/test_y are needed below
    targets_value_counts = train_y["target"].value_counts()
    if len(targets_value_counts) < 2 or max(targets_value_counts) < 5:
        notes = open(snakemake.log[0], mode="w")
        notes.write(targets_value_counts.to_string())
        notes.close()
        break

    # Inner cross validation
    # Feature selection: mutual information
    from sklearn.feature_selection import SelectKBest, mutual_info_classif
    feature_selector = SelectKBest(mutual_info_classif, k=75)

    if min(targets_value_counts) >= 6:
        # SMOTE requires n_neighbors <= n_samples, the default value of n_neighbors is 6
        #clf = GridSearchCV(estimator=createPipeline(model, "SVMSMOTE", feature_selector=feature_selector), param_grid=model_hyperparams, cv=inner_cv, scoring="roc_auc", refit=True)
        clf = RandomizedSearchCV(estimator=createPipeline(
            model, "SVMSMOTE", feature_selector=feature_selector),
                                 param_distributions=model_hyperparams,
                                 cv=inner_cv,
                                 scoring="roc_auc",
                                 refit=True,
                                 random_state=0,
                                 n_iter=3)
    else:
        # RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement.
        #clf = GridSearchCV(estimator=createPipeline(model, "RandomOverSampler", feature_selector=feature_selector), param_grid=model_hyperparams, cv=inner_cv, scoring="roc_auc", refit=True)
        clf = RandomizedSearchCV(estimator=createPipeline(
            model, "RandomOverSampler", feature_selector=feature_selector),
                                 param_distributions=model_hyperparams,
                                 cv=inner_cv,
                                 scoring="roc_auc",
                                 refit=True,
                                 random_state=0,
                                 n_iter=3)  # remaining arguments assumed to mirror the SVMSMOTE branch above; the excerpt breaks off here
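The structure above is a nested cross-validation: the outer loop only evaluates, while RandomizedSearchCV runs its own inner cross-validation on each outer training fold to pick hyper-parameters, so the reported performance is not biased by the tuning. The pattern can be reproduced in isolation on synthetic data; a minimal sketch assuming scikit-learn and imbalanced-learn (the pipeline, parameter grid and data are illustrative, not the project's model_hyperparams or features):

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(n_samples=200, n_features=20, weights=[0.8, 0.2], random_state=0)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=0)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=0)

pipe = Pipeline([("sampler", RandomOverSampler(random_state=0)),
                 ("selector", SelectKBest(mutual_info_classif, k=10)),
                 ("clf", DecisionTreeClassifier(random_state=0))])
param_distributions = {"clf__max_depth": [3, 5, 10, None],
                       "clf__min_samples_split": [2, 5, 10]}

pred_y, true_y = [], []
for train_index, test_index in outer_cv.split(X):
    # inner loop: tune hyper-parameters by ROC AUC and refit on the full outer training fold
    search = RandomizedSearchCV(pipe, param_distributions=param_distributions, cv=inner_cv,
                                scoring="roc_auc", refit=True, random_state=0, n_iter=3)
    search.fit(X[train_index], y[train_index])
    # outer loop: evaluate the tuned model on rows it never saw during tuning
    pred_y += search.predict(X[test_index]).tolist()
    true_y += y[test_index].tolist()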
Example #5
    # values do not change between folds
    if fold_count == 1:
        num_of_rows = train_x.shape[0] + test_x.shape[0]
        num_of_features = train_x.shape[1]

    targets_value_counts = train_y["target"].value_counts()
    if len(targets_value_counts) < 2 or max(targets_value_counts) < 5:
        notes = open(snakemake.log[0], mode="w")
        notes.write(targets_value_counts.to_string())
        notes.close()
        break

    # Inner cross validation
    if min(targets_value_counts) >= 6:
        # SMOTE requires n_neighbors <= n_samples, the default value of n_neighbors is 6
        clf = GridSearchCV(estimator=createPipeline(model, "SMOTE"),
                           param_grid=model_hyperparams,
                           cv=inner_cv,
                           scoring="f1_macro")
    else:
        # RandomOverSampler: over-sample the minority class(es) by picking samples at random with replacement.
        clf = GridSearchCV(estimator=createPipeline(model,
                                                    "RandomOverSampler"),
                           param_grid=model_hyperparams,
                           cv=inner_cv,
                           scoring="f1_macro")
    clf.fit(train_x, train_y.values.ravel())

    # Collect results and parameters
    best_params = best_params + [clf.best_params_]
    cur_fold_pred = clf.predict(test_x).tolist()
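The branch on min(targets_value_counts) exists because SMOTE synthesizes new minority samples by interpolating between nearest neighbours, so it needs more minority samples than its k_neighbors setting, while RandomOverSampler merely repeats existing rows and works for any class size. A standalone sketch of that difference, assuming imbalanced-learn and toy data (not the project's features):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, RandomOverSampler

# toy data with a 5-sample minority class, too small for SMOTE's default k_neighbors=5
X, y = make_classification(n_samples=100, weights=[0.95, 0.05], flip_y=0, random_state=0)
print(Counter(y))  # 95 majority vs 5 minority samples

# RandomOverSampler repeats existing minority rows at random, so it always applies
X_ros, y_ros = RandomOverSampler(random_state=0).fit_resample(X, y)
print(Counter(y_ros))  # both classes now have 95 samples

# SMOTE interpolates between a minority sample and its k nearest minority neighbours;
# with only 5 minority rows it needs k_neighbors reduced below the default of 5
X_sm, y_sm = SMOTE(k_neighbors=3, random_state=0).fit_resample(X, y)
print(Counter(y_sm))  # both classes now have 95 samples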