Example 1
def baselineMetricsOfDTWithDemographicFeatures(cv_method, data_x, data_y, oversampler_type):
    pred_y, true_y = [], []
    for train_index, test_index in cv_method.split(data_x):
        train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
        train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]
        clf = createPipeline("DT", oversampler_type)
        clf.fit(train_x, train_y.values.ravel())
        pred_y = pred_y + clf.predict(test_x).ravel().tolist()
        pred_y_prob = pred_y  # this baseline has no calibrated probabilities; reuse the hard predictions for AUC
        true_y = true_y + test_y.values.ravel().tolist()
    return getMetrics(pred_y, pred_y_prob, true_y)
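The "DT" and oversampler strings are resolved by the project's own createPipeline helper, which is not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming imbalanced-learn's Pipeline wrapping an oversampler and a decision tree; the function name and the mapping dictionaries are illustrative assumptions, not the project's actual implementation.

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SVMSMOTE
from sklearn.tree import DecisionTreeClassifier

def create_pipeline_sketch(model, oversampler_type, random_state=0):
    # Hypothetical stand-in for createPipeline: oversample the minority class
    # first, then fit the classifier. imblearn's Pipeline applies the sampler
    # only during fit, never during predict.
    samplers = {"RandomOverSampler": RandomOverSampler(random_state=random_state),
                "SVMSMOTE": SVMSMOTE(random_state=random_state)}
    models = {"DT": DecisionTreeClassifier(random_state=random_state)}
    return Pipeline([("sampler", samplers[oversampler_type]),
                     ("classifier", models[model])])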
Example 2
def baselineMetricsOfRandomWeightedClassifier(targets, majority_ratio, majority_class, iter_times):
    metrics_all_iters = {"accuracy": [],
                         "precision0": [], "recall0": [], "f10": [],
                         "precision1": [], "recall1": [], "f11": [],
                         "f1_macro": [], "auc": []}
    probabilities = [0, 0]
    probabilities[majority_class], probabilities[1 - majority_class] = majority_ratio, 1 - majority_ratio
    for i in range(iter_times):
        pred_y = np.random.RandomState(i).multinomial(1, probabilities, targets.shape[0])[:,1].tolist()
        pred_y_proba = pred_y
        metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist())
        for key in metrics_all_iters.keys():
            metrics_all_iters[key].append(metrics[key].item())
    # Calculate average metrics across all iterations
    avg_metrics = {}
    for key in metrics_all_iters.keys():
        avg_metrics[key] = mean(metrics_all_iters[key])
    return avg_metrics
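For context, the majority_class and majority_ratio arguments are typically derived from the label distribution itself, and each multinomial draw yields a one-hot vector whose second column serves as the predicted label. A small self-contained illustration follows; the toy targets frame is an assumption, with the "target" column name matching the code above.

import numpy as np
import pandas as pd

# Toy targets: class 0 is the majority with ratio 0.7.
targets = pd.DataFrame({"target": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]})
majority_class = targets["target"].value_counts().idxmax()              # 0
majority_ratio = targets["target"].value_counts(normalize=True).max()   # 0.7
probabilities = [0, 0]
probabilities[majority_class] = majority_ratio
probabilities[1 - majority_class] = 1 - majority_ratio
# One draw per row; column 1 of the one-hot sample is the predicted label,
# so class 1 is predicted with probability 0.3 here.
pred_y = np.random.RandomState(0).multinomial(1, probabilities, targets.shape[0])[:, 1]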
Example 3
def baselineMetricsOfDTWithNonsensorFeatures(cv_method, data_x, data_y):
    pred_y, true_y = [], []
    for train_index, test_index in cv_method.split(data_x):
        train_x, test_x = data_x.iloc[train_index], data_x.iloc[test_index]
        train_y, test_y = data_y.iloc[train_index], data_y.iloc[test_index]
        unique_y = pd.unique(train_y["target"])
        if len(unique_y) == 1:
            # Only one class in this training fold: predict that class for every
            # test row (repeat the label rather than multiplying the array by it).
            pred_y = pred_y + unique_y.tolist() * test_y.shape[0]
        else:
            # SMOTE-style oversampling needs enough minority samples to find
            # neighbours; otherwise fall back to random duplication (see the
            # oversampler sketch after this function).
            if min(train_y["target"].value_counts()) >= 6:
                oversampler_type = "SVMSMOTE"
            else:
                oversampler_type = "RandomOverSampler"
            clf = createPipeline("DT", oversampler_type)
            clf.fit(train_x, train_y.values.ravel())
            pred_y = pred_y + clf.predict(test_x).ravel().tolist()
        pred_y_proba = pred_y
        true_y = true_y + test_y.values.ravel().tolist()
    return getMetrics(pred_y, pred_y_proba, true_y)
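The >= 6 threshold above exists because SMOTE-style oversamplers synthesize points between a minority sample and its nearest minority-class neighbours; with the default k_neighbors=5 they need at least six minority rows, so plain duplication is the only safe option below that. A sketch of that fallback rule in isolation, with a hypothetical helper name:

from imblearn.over_sampling import RandomOverSampler, SVMSMOTE

def pick_oversampler_sketch(minority_count, random_state=0):
    # SVMSMOTE interpolates between a minority sample and its k nearest
    # minority neighbours (k_neighbors=5 by default), so it needs at least
    # 6 minority rows; below that, fall back to simple duplication.
    if minority_count >= 6:
        return SVMSMOTE(random_state=random_state)
    return RandomOverSampler(random_state=random_state)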
Example 4
    pred_y = pred_y + clf.predict(test_x).tolist()
    pred_y_prob = pred_y_prob + clf.predict_proba(test_x)[:, 1].tolist()
    true_y = true_y + test_y.values.ravel().tolist()
    # each test partition (fold) in the outer cv is a participant (LeaveOneOut cv)
    pid = pid + test_y.index.tolist()
    feature_importances_current_fold = getFeatureImportances(
        model, clf.best_estimator_.steps[1][1], train_x.columns)
    feature_importances_all_folds = pd.concat(
        [feature_importances_all_folds, feature_importances_current_fold],
        sort=False,
        axis=0)
    fold_id.append(fold_count)
    fold_count = fold_count + 1

# Step 5. Model evaluation
acc, pre1, recall1, f11, auc, kappa = getMetrics(pred_y, pred_y_prob, true_y)

# Step 6. Save results, parameters, and metrics to CSV files
fold_predictions = pd.DataFrame({
    "fold_id": fold_id,
    "pid": pid,
    "hyperparameters": best_params,
    "true_y": true_y,
    "pred_y": pred_y,
    "pred_y_prob": pred_y_prob
})
fold_metrics = pd.DataFrame({
    "fold_id": [],
    "accuracy": [],
    "precision1": [],
    "recall1": [],
Example 5
def baselineAccuracyOfMajorityClassClassifier(targets):
    majority_class = targets["target"].value_counts().idxmax()
    pred_y = [majority_class] * targets.shape[0]
    pred_y_proba = pred_y
    metrics = getMetrics(pred_y, pred_y_proba, targets["target"].values.ravel().tolist())
    return metrics, majority_class
Example 6
    proba_of_two_categories = clf.predict_proba(test_x).tolist()
    cur_fold_pred_proba = [
        probabilities[clf.classes_.tolist().index(1)]
        for probabilities in proba_of_two_categories
    ]
    pred_y_proba = pred_y_proba + cur_fold_pred_proba

    true_y = true_y + test_y.values.ravel().tolist()
    pid = pid + test_y.index.get_level_values("pid").tolist()
    local_date = local_date + test_y.index.get_level_values(
        "local_date").tolist()
    fold_id.extend([fold_count] * test_x.shape[0])
    fold_count = fold_count + 1

# Step 3. Model evaluation
metrics = getMetrics(pred_y, pred_y_proba, true_y)

# Step 4. Save results, parameters, and metrics to CSV files
fold_predictions = pd.DataFrame({
    "fold_id": fold_id,
    "pid": pid,
    "local_date": local_date,
    "hyperparameters": best_params,
    "true_y": true_y,
    "pred_y": pred_y,
    "pred_y_proba": pred_y_proba
})
overall_results = pd.DataFrame({
    "accuracy": [metrics["accuracy"]],
    "precision0": [metrics["precision0"]],
    "recall0": [metrics["recall0"]],
Example 7
    true_y = true_y + test_y.values.ravel().tolist()
    # each test partition (fold) in the outer cv is a participant (LeaveOneOut cv)
    pid = pid + test_y.index.tolist()
    feature_importances_current_fold = getFeatureImportances(
        model, clf.best_estimator_.steps[1][1], train_x.columns)
    feature_importances_all_folds = pd.concat(
        [feature_importances_all_folds, feature_importances_current_fold],
        sort=False,
        axis=0)
    fold_id.append(fold_count)
    fold_count = fold_count + 1

# Step 3. Model evaluation
if len(pred_y) > 1:
    metrics = getMetrics(pred_y, pred_y_prob, true_y)
else:
    metrics = {
        "accuracy": None,
        "precision0": None,
        "recall0": None,
        "f10": None,
        "precision1": None,
        "recall1": None,
        "f11": None,
        "auc": None,
        "kappa": None
    }

# Step 4. Save results, parameters, and metrics to CSV files
fold_predictions = pd.DataFrame({