Example #1
0
"""Decision-tree experiment setup for the Parkinson speech dataset.

Loads the data, balances the classes with SMOTE, and prepares a 5-fold
stratified cross-validation together with the per-fold score accumulators
filled in by the loop below.
"""
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

from classification.decision_trees.decision_trees_functions import *
from utils import load_pd

data: pd.DataFrame
y: np.ndarray
X: np.ndarray

# merge_observations presumably collapses repeated recordings per subject
# into one row — verify against load_pd.
data, X, y = load_pd('../../../datasets/pd_speech_features.csv',
                     merge_observations=True)

# SMOTE balancing: oversample the minority class up to parity.
# FIX: `ratio=` and `fit_sample` were deprecated in imbalanced-learn 0.4
# and removed in 0.6; the supported API is `sampling_strategy=` and
# `fit_resample`, which behave identically for this use.
RANDOM_STATE = 42
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
X, y = smote.fit_resample(X, y)

skf = StratifiedKFold(5)

# Metric names and per-fold accumulators populated inside the CV loop.
score_names = ['accuracy', 'recall', 'precision', 'roc-auc']
score = {
    'tree_depths': [],
    'tree_leafs': [],
    'accuracy': [],
    'recall': [],
    'precision': [],
    'roc-auc': []
}
for train_index, test_index in skf.split(X, y):
Example #2
0
"""Single-variable exploration of the Parkinson speech dataset.

Prints basic dataset diagnostics, checks class balance, reduces the
feature set, and splits the reduced frame by class for per-class plots.
"""
from numpy.ma import arange

from data_exploration.singular_variable_analysis.singular_variable_analysis_functions import *
from utils import load_pd
from vis_functions import variables_boxplot

# pop_class=False keeps the 'class' column inside the returned frame.
data: pd.DataFrame = load_pd("../../../datasets/pd_speech_features.csv",
                             pop_class=False)

# Basic diagnostics printed to stdout.
print_shape(data)
print_variable_types(data)
print_missing_values(data)

# Values taken by the binary 'class' target.
class_values = [0, 1]

class_balance(data, 'class', class_values)

# NOTE(review): presumably drops highly-correlated variables and keeps the
# k best features, returning the reduced frame plus X/y — verify helper.
data, X, y = remove_corr_and_select_k_best(data)

variables_boxplot(data)
plt.show()  # plt is expected to come from the star import above

# Re-attach the target, then split the observations by class and drop the
# target column again so each per-class frame holds features only.
data['class'] = y
data_0 = data.loc[data['class'] == 0]
data_1 = data.loc[data['class'] == 1]
data_0.pop('class')
data_1.pop('class')

# Shared column order and the two per-class frames, consumed further down.
columns = data.columns
datas = [data_0, data_1]
Example #3
0
"""
K-Means results Parkinson Decease Data set.

Tested pre-processing: normalization (StandardScaler),
outlier removing with DBSCAN, PCA for data reduction/transformation
"""

from sklearn.preprocessing import StandardScaler

from clustering.clustering_functions import *
from utils import dbscan_outliers_analysis_plot, pca_cumulative_variance_plot, load_pd

# load data
data, X, y = load_pd("../../datasets/pd_speech_features.csv")

# pre-process data
normalized_data = StandardScaler().fit_transform(X)

dbscan_outliers_analysis_plot(normalized_data, eps_list=[15, 20, 25, 30, 35], min_samples=3)
non_outliers_indexes = DBSCAN(eps=35, min_samples=3).fit(normalized_data).labels_ != -1
data_without_out = normalized_data[non_outliers_indexes, :]
new_target = y[non_outliers_indexes]

pca_obj = pca_cumulative_variance_plot(data_without_out)
first_components = pca_obj.components_[:115]  # aprox 90% variance ratio
reduced_data = np.dot(data_without_out, first_components.T)

# parameter tuning
k_analysis(reduced_data, list(range(2, 20, 2)))

# fixed kmeans evaluation
Example #4
0
"""
Association rules for Parkinson Decease Data Set.
"""

from sklearn.feature_selection import f_classif

from pattern_mining.pattern_mining_functions import *
from utils import print_return_variable, load_pd

# load data
data, X, y = load_pd("../../datasets/pd_speech_features.csv",
                     remove_corr=True,
                     corr_threshold=.8)

# pattern mining parameters
print("\n### Pattern mining parameters")
k_features = print_return_variable("Number of features to select: ", 25)
selection_measure = print_return_variable("Feature selection function: ",
                                          f_classif)
discretize_function = print_return_variable("Discretize function: ", pd.cut)
bins = print_return_variable("Number of bins: ", 7)
disc_needless_cols = print_return_variable(
    "Variables that doesnt need to be discretize: ", ['gender', 'class'])
binary_cols = print_return_variable("Binary cols: ", [])
min_supp = print_return_variable("Min support: ", 0.6)
fp_mining_args = [min_supp]
min_conf = print_return_variable("Min confidence: ", 0.9)
min_ant_items = print_return_variable("Min of items in antecedents itemset: ",
                                      2)

# extract association rules