Example #1
def clean_sick(df, encoder=None):
    """
        Clean mixed data: cmc.
    """

    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
        'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
        'TBG_measured', 'referral_source'
    ]  # for sick dataset
    response = 'Class'  # for sick dataset

    splits, metadata = eda.split(df,
                                 cat_features=cat_features,
                                 response=response)
    X_num = splits['X_num']
    # Drop the TBG column, which is mostly NaN
    X_num.drop(['TBG'], axis=1, inplace=True)

    X_cat = splits['X_cat']

    y = splits['y'][response].values

    # Impute missing values with the column median
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy='median')  # use strategy='mean' for mean imputation instead
    imputer.fit(X_num)
    X_num_no_nan = imputer.transform(X_num)

    # Convert the imputed array back to a DataFrame, keeping the column names
    X_num_ok = pd.DataFrame(X_num_no_nan, columns=X_num.columns)

    # Min-max scaling to the [0, 1] range
    X_num_scaled = (X_num_ok - X_num_ok.min()) / (X_num_ok.max() - X_num_ok.min())

    # Silence pandas' SettingWithCopy warning raised by the encoding step
    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded, encoder = prep.encode(X_cat, encoder)

    return pd.DataFrame(X_num_scaled), pd.DataFrame(
        X_cat_encoded), pd.DataFrame(y), encoder
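A minimal usage sketch (hypothetical variable names; assumes df_train and df_test are raw sick-dataset frames loaded with eda.read_arff):

# Fit the categorical encoder on the training split, then reuse it on the test split
X_num_tr, X_cat_tr, y_tr, encoder = clean_sick(df_train)
X_num_te, X_cat_te, y_te, _ = clean_sick(df_test, encoder=encoder)

Passing the fitted encoder back in keeps the categorical level-to-code mapping consistent between the two splits.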
Example #2
def read_train_test_files(fold_number):
    import glob
    # Sort the fold files so that fold_number indexes them deterministically
    train_arff_files = sorted(glob.glob('../datasets/datasetsCBR/pen-based/*.train.arff'))
    test_arff_files = sorted(glob.glob('../datasets/datasetsCBR/pen-based/*.test.arff'))
    
    TrainTotal = []
    Y_TrainTotal = []
    
    TestTotal = []
    Y_TestTotal = []
    
    for file in train_arff_files:
        df_train = eda.read_arff(path_data=file, url_data=None)
        splits, metadata = eda.split(df_train, cat_features=None, response='a17')
        X_num = splits['X_num']
        X_cat = splits['X_cat'] # No categorical features
        y_train = splits['y']['a17'].values
        X_norm_train = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        TrainTotal.append(X_norm_train)
        Y_TrainTotal.append(y_train)
        
    
    for file in test_arff_files:
        df_test = eda.read_arff(path_data=file, url_data=None)
        splits, metadata = eda.split(df_test, cat_features=None, response='a17')
        X_num = splits['X_num']
        X_cat = splits['X_cat'] # No categorical features
        y_test = splits['y']['a17'].values
        X_norm_test = (X_num - X_num.min()) / (X_num.max() - X_num.min())
        TestTotal.append(X_norm_test)
        Y_TestTotal.append(y_test)
        
        
    return (TrainTotal[fold_number - 1], Y_TrainTotal[fold_number - 1],
            TestTotal[fold_number - 1], Y_TestTotal[fold_number - 1])
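A usage sketch (assumes the fold files exist under ../datasets/datasetsCBR/pen-based/):

# Load the first of the pre-defined cross-validation folds
X_train, y_train, X_test, y_test = read_train_test_files(fold_number=1)

Note that the function re-reads every fold on each call and returns only the requested one; caching TrainTotal and TestTotal between calls would avoid the repeated I/O.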
Example #3
def clean_sick2(df):
    """
        Clean mixed data: cmc.
    """

    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
        'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
        'TBG_measured', 'referral_source'
    ]  # for sick dataset
    response = 'Class'  # for sick dataset
    splits, metadata = eda.split(df,
                                 cat_features=cat_features,
                                 response=response)
    X_num = splits['X_num']
    X_cat = splits['X_cat']

    # Drop the TBG column (mostly NaN) and impute remaining NaNs with the column mean
    X_num.drop(['TBG'], axis=1, inplace=True)
    X_num = X_num.fillna(X_num.mean())
    # Remove outliers: keep rows whose numerical features all lie within 3 std. devs.
    rows_to_keep = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[rows_to_keep].copy()
    y = splits['y'][rows_to_keep][response].values

    # Scaling
    X_num_scaled = prep.scale(X_num)

    # Keep the categorical rows aligned with the filtered numerical rows
    X_cat = X_cat[rows_to_keep].copy()
    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
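The z-score filter used above can be demonstrated in isolation; here is a self-contained sketch on made-up numbers:

import numpy as np
import pandas as pd
from scipy import stats

# Toy frame: 30 ordinary values plus one extreme outlier
toy = pd.DataFrame({'a': list(range(30)) + [500]})

# Keep rows whose z-score magnitude is below 3 in every column
rows_to_keep = (np.abs(stats.zscore(toy)) < 3).all(axis=1)
print(len(toy), '->', rows_to_keep.sum())  # 31 -> 30: the row with 500 is dropped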
Example #4
def clean_cmc(df):
    """
        Clean mixed data: cmc.
    """

    cat_features = [
        'weducation', 'heducation', 'wreligion', 'wworking', 'hoccupation',
        'living_index', 'media_exposure'
    ]

    splits, metadata = eda.split(df,
                                 cat_features=cat_features,
                                 response='class')
    X_num = splits['X_num']
    X_cat = splits['X_cat']

    # Remove outliers: keep rows whose numerical features all lie within 3 std. devs.
    print(f'# Samples before removing outliers: {len(X_num)}')
    rows_to_keep = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[rows_to_keep].copy()
    print(f'# Samples after removing outliers: {len(X_num)}')
    y = splits['y'][rows_to_keep]['class'].values

    # Scaling
    X_num_scaled = prep.scale(X_num)

    # Keep the categorical rows aligned with the filtered numerical rows
    X_cat = X_cat[rows_to_keep].copy()
    pd.options.mode.chained_assignment = None

    # Merge rare categorical levels into neighbouring ones
    X_cat.loc[X_cat['heducation'] == 1, 'heducation'] = 2
    X_cat.loc[X_cat['hoccupation'] == 4, 'hoccupation'] = 3

    # Encoding
    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
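A hypothetical call, assuming df holds the raw cmc frame (loaded with eda.read_arff as in the notebook cells below):

X_num_scaled, X_cat_encoded, y = clean_cmc(df)
assert len(X_num_scaled) == len(X_cat_encoded) == len(y)  # rows stay aligned after outlier removal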
# In[2]:


path = '../datasets/datasetsCBR/pen-based/pen-based.fold.000000.test.arff'

# Read the data set
df_test = eda.read_arff(path_data=path, url_data=None)

df_test.head()


# In[3]:


splits, metadata = eda.split(df_test, cat_features=None, response='a17')
X_num = splits['X_num']
X_cat = splits['X_cat'] # No categorical features


# In[4]:


# True labels of all datapoints
y = splits['y']['a17'].values
print(y)
print(len(y))


# In[5]:


import tools.preprocess as prep

# url = 'https://raw.githubusercontent.com/gusseppe/master_artificial_intelligence/master/Introduction_to_Machine_Learning/deliverables/work1/iml/datasets/cmc.arff'
path = 'datasets/cmc.arff'
df = eda.read_arff(path_data=path)  # local
df.head()

# In[95]:

cat_features = [
    'weducation', 'heducation', 'wreligion', 'wworking', 'hoccupation',
    'living_index', 'media_exposure'
]

splits, metadata = eda.split(df, cat_features=cat_features, response='class')
X_num = splits['X_num']
X_cat = splits['X_cat']

X_num.head()

# ### Metadata
#

# In[96]:

metadata

# ### Analyze and preprocess
#
# In[71]:

path = 'datasets/breast-w.arff'

# Read the data set
df = eda.read_arff(path_data=path, url_data=None)

df.head()

# ### Split data into numerical features and true label values (class)

# In[72]:

# Split the data into numerical and categorical features
splits, metadata = eda.split(df, cat_features=None, response='Class')
X_num = splits['X_num']
X_cat = splits['X_cat']  # No categorical features

# In[73]:

# True labels of all datapoints
y = splits['y']['Class'].values
X_num.head()

# In[74]:

print(f'# instances: {len(X_num)} | # num_features: {len(X_num.columns)}')
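# The notebook is truncated below; a plausible next step, mirroring the
# min-max normalization used in the earlier examples (an assumption, not
# the original cell):
X_norm = (X_num - X_num.min()) / (X_num.max() - X_num.min())
X_norm.head()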

# In[75]: