def read_train_test_files():
    """Load all cross-validation folds of the sick dataset.

    Returns:
        list of tuples ``(X_train, y_train, X_test, y_test)`` — numpy
        arrays, one tuple per fold, with labels flattened to 1-D.
    """
    # Sort both globs so the i-th train file pairs with the i-th test
    # file: glob.glob returns entries in arbitrary, OS-dependent order.
    train_arff_files = sorted(glob.glob(
        '../datasets/datasetsCBR/sick/*.train.arff'))  # for sick dataset
    test_arff_files = sorted(glob.glob(
        '../datasets/datasetsCBR/sick/*.test.arff'))  # for sick dataset

    train_test_split = []
    for train_file, test_file in zip(train_arff_files, test_arff_files):

        # Train: fit the categorical encoder on the training fold.
        df_train = eda.read_arff(path_data=train_file, url_data=None)
        X_num_train, X_cat_train, y_train, encoder_train = all_steps.clean_sick(
            df_train)
        X_train = prep.join_features(X_num_train, X_cat_train)

        # Test: reuse the encoder fitted on the training fold.
        # BUG FIX: the original passed df_train here instead of df_test,
        # so the "test" split was actually the training data again.
        df_test = eda.read_arff(path_data=test_file, url_data=None)
        X_num_test, X_cat_test, y_test, encoder_test = all_steps.clean_sick(
            df_test, encoder_train)
        X_test = prep.join_features(X_num_test, X_cat_test)

        train_test_split.append(
            (X_train.values, y_train.values.reshape(-1, ), X_test.values,
             y_test.values.reshape(-1, )))

    return train_test_split
def read_train_test_files(fold_number):
    """Load one cross-validation fold of the pen-based dataset.

    Args:
        fold_number: 1-based index of the fold to load.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` — min-max-normalized
        numerical feature DataFrames and 1-D label arrays for the fold.
    """
    import glob

    # Sort so fold_number maps to a deterministic file. BUG FIX: the
    # original indexed raw glob results, whose order is arbitrary and
    # OS-dependent, so the same fold_number could yield different folds.
    train_arff_files = sorted(
        glob.glob('../datasets/datasetsCBR/pen-based/*.train.arff'))
    test_arff_files = sorted(
        glob.glob('../datasets/datasetsCBR/pen-based/*.test.arff'))

    def _load_fold_file(path):
        # Read one ARFF file, split off the response column 'a17' and
        # min-max normalize the numerical features into [0, 1].
        df = eda.read_arff(path_data=path, url_data=None)
        splits, metadata = eda.split(df, cat_features=None, response='a17')
        X_num = splits['X_num']  # this dataset has no categorical features
        y = splits['y']['a17'].values
        X_norm = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        return X_norm, y

    # Only parse the requested fold; the original read every file on
    # disk and then discarded all but one.
    X_train, y_train = _load_fold_file(train_arff_files[fold_number - 1])
    X_test, y_test = _load_fold_file(test_arff_files[fold_number - 1])

    return X_train, y_train, X_test, y_test
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy.io import arff
from scipy import stats

import tools.eda as eda
import tools.preprocess as prep

from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score,                             silhouette_score, adjusted_rand_score,                             f1_score, davies_bouldin_score, confusion_matrix,                             accuracy_score

# Path to the splice ARFF dataset, relative to the working directory.
path = 'datasets/splice.arff'

# Parse into pandas DataFrame
df = eda.read_arff(path)
df_original = df.copy()  # keep a pristine copy before any preprocessing
df.head()


# Dataset shape
# 

# In[3]:


# Response column, taken from the untouched copy of the data.
target = 'Class'
y = df_original[target]

print('Num rows:', len(df))
print('Num cols:', len(df.columns))
from io import StringIO
from sklearn.preprocessing import LabelEncoder

from tools import eda 
from tools import preprocess as prep


# ### Read an example of the Pen-Based data set

# In[2]:


# One pre-computed cross-validation file: fold 000000, test half.
path = '../datasets/datasetsCBR/pen-based/pen-based.fold.000000.test.arff'

# Read the data set
df_test = eda.read_arff(path_data=path, url_data=None)

df_test.head()


# In[3]:


# Separate features from the response column 'a17'.
splits, metadata = eda.split(df_test, cat_features=None,response='a17')
X_num = splits['X_num']
X_cat = splits['X_cat'] # No categorical features


# In[4]:

# Code example #5
# ## Exploratory data analysis (EDA) and Preprocessing
#
# First, explore and make a description of features without modification, check for distributions, correlations, issues, patterns, etc.
#
# Once analyzed the issues on numerical and categorical features, the next step is to modify their values appropiately.

# ### Read dataset

# In[94]:

import tools.eda as eda
import tools.preprocess as prep

# url = 'https://raw.githubusercontent.com/gusseppe/master_artificial_intelligence/master/Introduction_to_Machine_Learning/deliverables/work1/iml/datasets/cmc.arff'
# Path to the contraceptive-method-choice (cmc) ARFF dataset.
path = 'datasets/cmc.arff'
df = eda.read_arff(path_data=path)  # local
# df = eda(path_data='datasets/cmc.arff') # local
df.head()

# In[95]:

# Columns to treat as categorical when splitting the dataframe.
cat_features = [
    'weducation', 'heducation', 'wreligion', 'wworking', 'hoccupation',
    'living_index', 'media_exposure'
]

# Split into numerical features, categorical features and the response.
splits, metadata = eda.split(df, cat_features=cat_features, response='class')
X_num = splits['X_num']
X_cat = splits['X_cat']

X_num.head()