Python prepare_dataset Examples, data_preparation_functions.prepare_dataset Python Examples

Example #1

0

Show file

File: HFCR_feature_selection.py Project: isabelSoares/CD-2021

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Root output directory (relative to the working directory) used by the rest
# of this script for its result files and per-dataset sub-folders.
graphsDir = './Results/FeatureSelection/'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the race between a separate os.path.exists() check and
# the os.makedirs() call.
os.makedirs(graphsDir, exist_ok=True)

# Output file opened for writing inside graphsDir; whatever is written to it
# happens beyond the end of this excerpt.
# NOTE(review): the handle is not closed anywhere in the visible code --
# confirm it is closed later, or wrap its use in a `with` block.
features_file = open(graphsDir + 'HFCR Feature Selection - Features', 'w')

data: pd.DataFrame = pd.read_csv('../Dataset/heart_failure_clinical_records_dataset.csv')
# prepare_dataset is a project helper; its result is consumed below as a
# mapping of variant name -> DataFrame-like object.
# NOTE(review): the meaning of the two boolean flags is not visible here --
# check prepfunctions.prepare_dataset.
datas = prepfunctions.prepare_dataset(data, 'DEATH_EVENT', False, False)
for key, value in datas.items():
    print("Key: ", key)
    # Pristine copy of this variant; each selection step below restarts
    # from it.
    dataframe_rec = value.copy()
    # One output sub-directory per variant. exist_ok=True replaces the
    # race-prone "check, then create" pair.
    subDir = graphsDir + key + '/'
    os.makedirs(subDir, exist_ok=True)

    # Baseline: split off the target column and record how many feature
    # columns remain before any selection.
    data = dataframe_rec.copy()
    y = data.pop('DEATH_EVENT')
    print('Original')
    labels = ['Original']
    values = [data.shape[1]]

    # Start of the variance-threshold step (its body is cut off in this
    # excerpt); it begins again from an untouched copy.
    print('VarianceThreshold')
    data = dataframe_rec.copy()

Example #2

0

Show file

File: QOT_gradientBoosting.py Project: isabelSoares/CD-2021

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from datetime import datetime

# Root output directory (relative to the working directory) for this script.
graphsDir = './Results/GradientBoosting/'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the race between a separate os.path.exists() check and
# the os.makedirs() call.
os.makedirs(graphsDir, exist_ok=True)

# Read the QSAR oral-toxicity CSV: ';'-separated with no header row, so the
# columns carry integer labels.
data: pd.DataFrame = pd.read_csv(
    '../Dataset/qsar_oral_toxicity.csv', sep=';', header=None)

# 70% of the rows go to `train`, the remainder to `test`, stratified on the
# values of column 1024 (the same column handed to the helpers below).
# NOTE(review): no random_state is passed, so the split is not reproducible
# between runs.
train, test = train_test_split(
    data, train_size=0.7, stratify=data[1024].values)

# Only the training part goes through prepare_dataset; every resulting
# variant key is paired with its own copy of the untouched test split.
testDatas = {}
datas = prepfunctions.prepare_dataset(train, 1024, False, False)
for key in datas:
    testDatas[key] = test.copy()

# The same feature file drives the selection for both train and test sets.
qot_features_path = './Results/FeatureSelection/QOT Feature Selection - Features'
featured_datas = prepfunctions.mask_feature_selection(
    datas, 1024, True, qot_features_path)
featured_test_datas = prepfunctions.mask_feature_selection(
    testDatas, 1024, True, qot_features_path)

best_accuracies = {
    "Original": [0.9992055926278995, 0.9458858413639734],
    "Original with FS": [0.9817286304416905, 0.9414381022979985],
    "UnderSample": [1.0, 0.9347664936990363],
    "UnderSample with FS": [1.0, 0.933283914010378],

Example #3

0

Show file

File: HFCR_new_clustering.py Project: isabelSoares/CD-2021

# Console banner for the HFCR clustering run, emitted with a single print.
print('\n'.join([
    '---------------------------',
    '-                         -',
    '-     HFCR Clustering     -',
    '-                         -',
    '---------------------------',
]))

# Load the heart-failure clinical records CSV (path is relative to the
# working directory).
data: pd.DataFrame = pd.read_csv(
    '../Dataset/heart_failure_clinical_records_dataset.csv')

# Original
# Copy of the raw frame with the DEATH_EVENT column removed; the popped
# column itself is discarded. The copy is taken before `data` is handed to
# prepare_dataset below.
original_data = data.copy()
original_data.pop('DEATH_EVENT')

# Scaled
# Take the 'Original' entry of prepare_dataset's result and drop DEATH_EVENT
# from it as well.
# NOTE(review): the two True flags presumably switch on scaling and outlier
# handling -- confirm against prepfunctions.prepare_dataset.
data_scaled = prepfunctions.prepare_dataset(data, 'DEATH_EVENT', True,
                                            True)['Original']
data_scaled.pop('DEATH_EVENT')

# All four on/off combinations of two boolean switches; by the name,
# presumably (scaling, PCA) -- the consuming loop is outside this excerpt.
scaling_pca = [(False, False), (True, False), (False, True), (True, True)]

# Candidate parameter grids. Presumably cluster counts and epsilon
# (neighbourhood radius) values for the clustering algorithms -- TODO confirm
# where they are used.
N_CLUSTERS = [2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
EPS = [2.5, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# 2 x 3 grid of axes on a 9 x 8 inch figure; squeeze=False keeps `ax`
# two-dimensional regardless of the grid shape.
fig, ax = plt.subplots(2, 3, figsize=(3 * 3, 4 * 2), squeeze=False)
# Empty accumulators, one per series to plot; they are filled in code beyond
# this excerpt.
fig_values_1 = {}
fig_values_2 = {}
fig_values_3 = {}
fig_values_4 = {}

fig_values_5 = {}
fig_values_6 = {}

Example #4

0

Show file

File: HFCR_random_forests.py Project: isabelSoares/CD-2021

import ds_functions as ds
import os

# Root output directory (relative to the working directory) for this script.
graphsDir = './Results/Random Forests/'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the race between a separate os.path.exists() check and
# the os.makedirs() call.
os.makedirs(graphsDir, exist_ok=True)

# Console banner for the HFCR random-forests run, emitted with one print.
print('\n'.join([
    '-------------------------------',
    '-                             -',
    '-     HFCR Random Forests     -',
    '-                             -',
    '-------------------------------',
]))

# Raw heart-failure records, then the prepared variants keyed by name.
# NOTE(review): the two True flags presumably enable scaling and outlier
# handling -- confirm against prepfunctions.prepare_dataset.
data: pd.DataFrame = pd.read_csv('../Dataset/heart_failure_clinical_records_dataset.csv')
datas = prepfunctions.prepare_dataset(data, 'DEATH_EVENT', True, True)

# Feature-selected counterpart of every prepared variant, driven by the
# feature file at the path below.
hfcr_features_path = './Results/FeatureSelection/HFCR Feature Selection - Features'
featured_datas = prepfunctions.mask_feature_selection(
    datas, 'DEATH_EVENT', False, hfcr_features_path)

# Starts empty; populated by code beyond this excerpt.
best_accuracies = {}

# For every prepared variant, run once on the plain data and once on the
# feature-selected data, each with its own output directory.
for key in datas:
    for do_feature_eng in [False, True]:
        if do_feature_eng:
            data = featured_datas[key]
            subDir = graphsDir + 'FeatureEng/' + key + '/'
            # exist_ok=True replaces the race-prone "check, then create"
            # pair and is a no-op when the directory is already there.
            os.makedirs(subDir, exist_ok=True)
        else:
            data = datas[key]
            # NOTE(review): this branch does not create subDir within the
            # visible excerpt -- confirm it is created before files are
            # written into it.
            subDir = graphsDir + key + '/'

Example #5

0

Show file

# Root output directory (relative to the working directory) for this script.
graphsDir = './Results/Random Forests/'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the race between a separate os.path.exists() check and
# the os.makedirs() call.
os.makedirs(graphsDir, exist_ok=True)


# Console banner for the HFCR random-forests run, emitted with one print.
print('\n'.join([
    '-------------------------------',
    '-                             -',
    '-     HFCR Random Forests     -',
    '-                             -',
    '-------------------------------',
]))

data: pd.DataFrame = pd.read_csv('../../Dataset/heart_failure_clinical_records_dataset.csv')

# Feature file shared by every mask_feature_selection call below.
hfcr_features_path = './Results/FeatureSelection/HFCR Feature Selection - Features'

# Each variant is built from its own copy of the raw frame. Going by the
# variable names, the third positional flag switches scaling on and the
# fourth outlier removal -- confirm against prepfunctions.prepare_dataset.
datas = prepfunctions.prepare_dataset(
    data.copy(), 'DEATH_EVENT', False, False)

datas_outliers = prepfunctions.prepare_dataset(
    data.copy(), 'DEATH_EVENT', False, True)
datas_outliers_scaling = prepfunctions.prepare_dataset(
    data.copy(), 'DEATH_EVENT', True, True)
# NOTE(review): if prepare_dataset returns a dict, `.copy()` on it is a
# shallow copy -- the contained frames are shared with the source variant.
datas_outliers_featureselection = prepfunctions.mask_feature_selection(
    datas_outliers.copy(), 'DEATH_EVENT', False, hfcr_features_path)
datas_outliers_scaling_featureselection = prepfunctions.mask_feature_selection(
    datas_outliers_scaling.copy(), 'DEATH_EVENT', False, hfcr_features_path)

datas_scaling = prepfunctions.prepare_dataset(
    data.copy(), 'DEATH_EVENT', True, False)
datas_scaling_featureselection = prepfunctions.mask_feature_selection(
    datas_scaling.copy(), 'DEATH_EVENT', False, hfcr_features_path)

datas_featureselection = prepfunctions.mask_feature_selection(
    datas.copy(), 'DEATH_EVENT', False, hfcr_features_path)

# Parallel lists: all_datas_names[i] is the label suffix for all_datas[i].
# NOTE(review): datas_scaling_featureselection is built above but is not in
# all_datas; its label is parked separately in provisorio_data_scaling.
all_datas = [
    datas,
    datas_outliers,
    datas_scaling,
    datas_featureselection,
    datas_outliers_scaling,
    datas_outliers_featureselection,
    datas_outliers_scaling_featureselection,
]
all_datas_names = [
    '',
    ' - No Outliers',
    ' - Scaling',
    ' - Feature Selection',
    ' - No Outliers & Scaling',
    ' - No Outliers & Feature Selection',
    ' - No Outliers, Scaling & Feature Selection',
]
provisorio_data_scaling = ' - Scaling & Feature Selection'

Example #6

0

Show file

# Root output directory (relative to the working directory) for this script.
graphsDir = './Results/Log Regression/'
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, avoiding the race between a separate os.path.exists() check and
# the os.makedirs() call.
os.makedirs(graphsDir, exist_ok=True)

# Console banner for the QOT logistic-regression run, emitted with one print.
print('\n'.join([
    '--------------------------------------',
    '-                                    -',
    '-    QOT Log Regression - Treated    -',
    '-                                    -',
    '--------------------------------------',
]))

# Seed constant; it is not used within the visible excerpt.
RANDOM_STATE = 42

# Read the QSAR oral-toxicity CSV: ';'-separated with no header row, so the
# columns carry integer labels; column 1024 is the one passed to the helpers
# as the target.
data: pd.DataFrame = pd.read_csv(
    '../Dataset/qsar_oral_toxicity.csv', sep=';', header=None)
datas = prepfunctions.prepare_dataset(data, 1024, False, False)

# Feature-selected counterpart of every prepared variant, driven by the
# feature file at the path below.
qot_features_path = './Results/FeatureSelection/QOT Feature Selection - Features'
featured_datas = prepfunctions.mask_feature_selection(
    datas, 1024, True, qot_features_path)

# Starts empty; populated by code beyond this excerpt.
best_accuracies = {}

# For every prepared variant, run once on the plain data and once on the
# feature-selected data, each with its own output directory.
for key in datas:
    for do_feature_eng in [False, True]:
        if do_feature_eng:
            data = featured_datas[key]
            subDir = graphsDir + 'FeatureEng/' + key + '/'
            # exist_ok=True replaces the race-prone "check, then create"
            # pair and is a no-op when the directory is already there.
            os.makedirs(subDir, exist_ok=True)
        else:
            data = datas[key]
            # NOTE(review): this branch does not create subDir within the
            # visible excerpt -- confirm it is created before files are
            # written into it.
            subDir = graphsDir + key + '/'