import glob
# eda, prep and all_steps are this project's helper modules, assumed imported
# at module level (e.g. from tools import eda, preprocess as prep).


def read_train_test_files():
    # Collect the paired k-fold train/test ARFF files for the sick dataset
    train_arff_files = glob.glob(
        '../datasets/datasetsCBR/sick/*.train.arff')  # for sick dataset
    # train_arff_files = glob.glob('../datasets/datasetsCBR/bal/*.train.arff')  # for bal dataset
    # test_arff_files = glob.glob('../datasets/datasetsCBR/bal/*.test.arff')  # for bal dataset
    test_arff_files = glob.glob(
        '../datasets/datasetsCBR/sick/*.test.arff')  # for sick dataset

    train_test_split = []
    for train_file, test_file in zip(train_arff_files, test_arff_files):
        # Train
        df_train = eda.read_arff(path_data=train_file, url_data=None)
        X_num_train, X_cat_train, y_train, encoder_train = all_steps.clean_sick(
            df_train)
        X_train = prep.join_features(X_num_train, X_cat_train)

        # Test: clean the *test* fold, reusing the encoder fitted on the train fold
        df_test = eda.read_arff(path_data=test_file, url_data=None)
        X_num_test, X_cat_test, y_test, encoder_test = all_steps.clean_sick(
            df_test, encoder_train)
        X_test = prep.join_features(X_num_test, X_cat_test)

        train_test_split.append(
            (X_train.values, y_train.values.reshape(-1),
             X_test.values, y_test.values.reshape(-1)))

    return train_test_split
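# Hedged usage sketch (not part of the original code): iterate over the folds
# returned above and score a classifier per fold. KNeighborsClassifier is only
# an illustrative choice, not necessarily the model used in this project.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def evaluate_sick_folds():
    scores = []
    for X_train, y_train, X_test, y_test in read_train_test_files():
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)
        scores.append(accuracy_score(y_test, knn.predict(X_test)))
    return scores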
def read_train_test_files(fold_number):
    import glob
    train_arff_files = glob.glob('../datasets/datasetsCBR/pen-based/*.train.arff')
    test_arff_files = glob.glob('../datasets/datasetsCBR/pen-based/*.test.arff')
    # test_arff_files = glob.glob('datasetsCBR/pen-based/*.test.arff')

    TrainTotal = []
    Y_TrainTotal = []
    TestTotal = []
    Y_TestTotal = []

    for file in train_arff_files:
        df_train = eda.read_arff(path_data=file, url_data=None)
        splits, metadata = eda.split(df_train, cat_features=None, response='a17')
        X_num = splits['X_num']
        X_cat = splits['X_cat']  # No categorical features
        y_train = splits['y']['a17'].values
        # Min-max normalization: each file is scaled with its own min/max
        X_norm_train = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        TrainTotal.append(X_norm_train)
        Y_TrainTotal.append(y_train)

    for file in test_arff_files:
        df_test = eda.read_arff(path_data=file, url_data=None)
        splits, metadata = eda.split(df_test, cat_features=None, response='a17')
        X_num = splits['X_num']
        X_cat = splits['X_cat']  # No categorical features
        y_test = splits['y']['a17'].values
        X_norm_test = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        TestTotal.append(X_norm_test)
        Y_TestTotal.append(y_test)

    # Folds are 1-indexed by the caller
    return (TrainTotal[fold_number - 1], Y_TrainTotal[fold_number - 1],
            TestTotal[fold_number - 1], Y_TestTotal[fold_number - 1])
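# Hedged usage sketch (illustrative, not part of the original code): fetch one
# fold (1-indexed) and check that min-max normalization kept features in [0, 1].
X_train, y_train, X_test, y_test = read_train_test_files(fold_number=1)
print('Train:', X_train.shape, 'Test:', X_test.shape)
assert float(X_train.min().min()) >= 0.0
assert float(X_train.max().max()) <= 1.0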
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.io import arff
from scipy import stats

import tools.eda as eda
import tools.preprocess as prep

from sklearn.metrics import (adjusted_mutual_info_score, mutual_info_score,
                             silhouette_score, adjusted_rand_score, f1_score,
                             davies_bouldin_score, confusion_matrix,
                             accuracy_score)

path = 'datasets/splice.arff'

# Parse into pandas DataFrame
df = eda.read_arff(path)
df_original = df.copy()
df.head()

# Dataset shape

# In[3]:

target = 'Class'
y = df_original[target]
print('Num rows:', len(df))
print('Num cols:', len(df.columns))
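# A small EDA sketch (illustrative addition, not from the original notebook):
# check the class balance of the response before any preprocessing;
# normalize=True reports proportions instead of raw counts.
print(y.value_counts(normalize=True))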
from io import StringIO
from sklearn.preprocessing import LabelEncoder

from tools import eda
from tools import preprocess as prep

# ### Read an example of the Pen-Based data set

# In[2]:

path = '../datasets/datasetsCBR/pen-based/pen-based.fold.000000.test.arff'

# Read the data set
df_test = eda.read_arff(path_data=path, url_data=None)
df_test.head()

# In[3]:

splits, metadata = eda.split(df_test, cat_features=None, response='a17')
X_num = splits['X_num']
X_cat = splits['X_cat']  # No categorical features

# In[4]:
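# Hedged sketch (assuming the min-max scheme used elsewhere in this repo):
# scale the numerical features of this fold to [0, 1] and summarize the result.
X_norm = (X_num - X_num.min()) / (X_num.max() - X_num.min())
X_norm.describe()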
# ## Exploratory data analysis (EDA) and Preprocessing
#
# First, explore and describe the features without modification: check
# distributions, correlations, issues, patterns, etc.
#
# Once the issues in the numerical and categorical features have been
# analyzed, the next step is to modify their values appropriately.

# ### Read dataset

# In[94]:

import tools.eda as eda
import tools.preprocess as prep

# url = 'https://raw.githubusercontent.com/gusseppe/master_artificial_intelligence/master/Introduction_to_Machine_Learning/deliverables/work1/iml/datasets/cmc.arff'
path = 'datasets/cmc.arff'

df = eda.read_arff(path_data=path)  # local
df.head()

# In[95]:

cat_features = [
    'weducation', 'heducation', 'wreligion', 'wworking',
    'hoccupation', 'living_index', 'media_exposure'
]

splits, metadata = eda.split(df, cat_features=cat_features, response='class')
X_num = splits['X_num']
X_cat = splits['X_cat']
X_num.head()
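# Hedged preprocessing sketch (illustrative, not the project's own prep
# helper): one-hot encode the categorical block before joining it back to
# X_num. pd.get_dummies is standard pandas; prep may implement this
# differently.
import pandas as pd

X_cat_encoded = pd.get_dummies(X_cat, columns=list(X_cat.columns))
X_cat_encoded.head()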