Beispiel #1
0
def clean_sick(df, encoder=None):
    """
        Clean mixed data: sick dataset.

        The numeric features are median-imputed and min-max scaled to
        [0, 1]; the categorical features are encoded with ``prep.encode``.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw data. It must hold the categorical columns listed in
            ``cat_features`` below, the response column ``'Class'`` and a
            numeric ``'TBG'`` column (which is discarded).
        encoder : optional
            Forwarded unchanged to ``prep.encode``. Presumably an already
            fitted encoder to reuse, with ``None`` meaning "fit a new
            one" -- confirm against ``prep.encode``.

        Returns
        -------
        tuple
            ``(X_num_scaled, X_cat_encoded, y, encoder)``; the first three
            items are wrapped in ``pandas.DataFrame`` and ``encoder`` is
            whatever ``prep.encode`` returned.
    """
    # Kept as a function-local import, as in the original.
    from sklearn.impute import SimpleImputer

    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
        'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
        'TBG_measured', 'referral_source'
    ]  # for sick dataset
    response = 'Class'  # for sick dataset

    splits, _ = eda.split(df,
                          cat_features=cat_features,
                          response=response)

    # Drop columns with many nan. A non-mutating drop is used so the frame
    # held in ``splits`` is not modified behind the helper's back.
    X_num = splits['X_num'].drop(['TBG'], axis=1)
    X_cat = splits['X_cat']
    y = splits['y'][response].values

    # Replace missing values by the median of the column
    imputer = SimpleImputer(strategy='median')
    X_num_no_nan = imputer.fit_transform(X_num)

    # The imputer returns a plain array: convert it to a data frame again
    X_num_ok = pd.DataFrame(X_num_no_nan, columns=X_num.columns)

    # Min-max scaling. A constant column has a zero range, which would give
    # 0/0 = NaN; dividing by 1 instead maps such a column to 0.0.
    col_min = X_num_ok.min()
    col_range = (X_num_ok.max() - col_min).replace(0, 1)
    X_num_scaled = (X_num_ok - col_min) / col_range

    # NOTE: process-wide pandas setting; it stays disabled after the call.
    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded, encoder = prep.encode(X_cat, encoder)

    return pd.DataFrame(X_num_scaled), pd.DataFrame(
        X_cat_encoded), pd.DataFrame(y), encoder
Beispiel #2
0
def clean_sick2(df):
    """
        Clean mixed data: sick dataset, with outlier removal.

        Missing numeric values are replaced by the column mean, rows with a
        numeric outlier (|z-score| >= 3 in any numeric column) are
        discarded from the numeric features, the categorical features and
        the response alike, the numeric part is passed to ``prep.scale``
        and the categorical part to ``prep.encode``.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw data. It must hold the categorical columns listed in
            ``cat_features`` below, the response column ``'Class'`` and a
            numeric ``'TBG'`` column (which is discarded).

        Returns
        -------
        tuple
            ``(X_num_scaled, X_cat_encoded, y)``, all restricted to the
            same non-outlier rows.
    """

    cat_features = [
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
        'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
        'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
        'TBG_measured', 'referral_source'
    ]  # for sick dataset
    response = 'Class'  # for sick dataset
    splits, _ = eda.split(df,
                          cat_features=cat_features,
                          response=response)

    # Drop columns with many nan (non-mutating, so the frame held in
    # ``splits`` is left untouched).
    X_num = splits['X_num'].drop(['TBG'], axis=1)
    X_cat = splits['X_cat']

    # Mean imputation of the remaining missing numeric values
    X_num = X_num.fillna(X_num.mean())

    # Outliers: the mask is True for the rows to KEEP, i.e. those whose
    # z-score is below 3 in absolute value for every numeric column.
    rows_to_keep = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[rows_to_keep].copy()
    y = splits['y'][rows_to_keep][response].values
    # The categorical features must be filtered with the same mask,
    # otherwise the three returned objects no longer describe the same rows.
    X_cat = X_cat[rows_to_keep].copy()

    # Scaling
    X_num_scaled = prep.scale(X_num)

    # NOTE: process-wide pandas setting; it stays disabled after the call.
    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
Beispiel #3
0
def clean_cmc(df):
    """
        Clean mixed data: cmc.

        Splits ``df`` into numeric features, categorical features and the
        ``'class'`` response, discards every row that has a numeric outlier
        (|z-score| >= 3 in any numeric column), scales the numeric part
        with ``prep.scale``, merges two categorical levels and encodes the
        categorical part with ``prep.encode``.

        The sample count before and after the outlier removal is printed.

        Returns the tuple ``(X_num_scaled, X_cat_encoded, y)``.
    """
    response = 'class'
    categorical_columns = [
        'weducation', 'heducation', 'wreligion', 'wworking', 'hoccupation',
        'living_index', 'media_exposure'
    ]

    parts, _ = eda.split(df,
                         cat_features=categorical_columns,
                         response=response)
    X_num = parts['X_num']
    categorical = parts['X_cat']

    # Outliers: the mask is True for the rows that are kept.
    print(f'# Samples before removing outliers: {len(X_num)}')
    abs_z_scores = np.abs(stats.zscore(X_num))
    inlier_mask = (abs_z_scores < 3).all(axis=1)
    X_num = X_num[inlier_mask].copy()
    print(f'# Samples after removing outliers: {len(X_num)}')
    targets = parts['y'][inlier_mask][response].values

    # Scaling
    numeric_scaled = prep.scale(X_num)

    # Same row selection for the categorical features
    categorical = categorical[inlier_mask].copy()
    pd.options.mode.chained_assignment = None

    # Removing categ. levels: (column, level to replace, replacement level)
    level_merges = (
        ('heducation', 1, 2),
        ('hoccupation', 4, 3),
    )
    for column, old_level, new_level in level_merges:
        categorical.loc[categorical[column] == old_level, column] = new_level

    # Encoding
    categorical_encoded = prep.encode(categorical)

    return numeric_scaled, categorical_encoded, targets
# Name of the response column; every other column of ``df`` is a feature.
target = 'Class'
features = [col for col in df.columns if col != target]

# ## Encoding

# In[5]:

# NOTE(review): OneHotEncoder is imported but not used in the cells shown here.
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Turn the class labels into integer codes, kept in a one-column frame.
label_encoder = LabelEncoder()
y_encoded = pd.DataFrame(label_encoder.fit_transform(df[target]),
                         columns=['Class'])

# Encode the feature columns with the project helper (presumably one-hot,
# judging by the variable name -- confirm against prep.encode).
df_OHE = prep.encode(df[features])
df_OHE.head()  # notebook preview; has no effect when run as a script

# ## PCA (own implementation)
#

# In[6]:

# Work on the transposed value matrix: X has one row per column of df_OHE
# and one column per row of df_OHE.
X = df_OHE.values.T
N = X.shape[1]  # number of columns of X (rows of df_OHE)
d = X.shape[0]  # number of rows of X (columns of df_OHE)

# In[7]:

# Mean of every row of X, reshaped into a (d, 1) column vector.
means = np.array(np.mean(X, axis=1)).reshape((d, 1))
#

# First, find the binary categorical features: they already have only 2 unique values, so they don't need to be encoded.
#
#
# Then, encode the remaining categorical features, i.e. those with more than 2 levels (categories).
#
# The chosen method is one-hot encoding. Each categorical feature has at most 4 categories, so the encoded matrix stays far less sparse than it would with many levels. For features with many levels, binary encoding could be used instead.
#
# References:
#
# https://www.datacamp.com/community/tutorials/categorical-data

# In[111]:

# Encode the categorical features with the project helper.
X_cat_encoded = prep.encode(X_cat)
X_cat_encoded.head()  # notebook preview; has no effect when run as a script

# Join both numerical and categorical variables
#

# In[112]:

# Combine the scaled numeric features and the encoded categorical features
# into a single object via the project helper.
X_scaled_encoded = prep.join_features(X_num_scaled, X_cat_encoded)
X_scaled_encoded.head()  # notebook preview

# ### Visualize the clusters
#

# In order to visualize the clusters, we need to reduce the dimensionality of **X** to 2 or 3 dimensions, since we cannot visualize more than three.
#
# Because the data is numerical and K-modes is designed to cluster categorical data, we first need to convert it to a categorical (or string) data type.

# In[99]:

# Data binning:
# NOTE: X_cat is rebound here to a copy of the scaled *numeric* features,
# which are then binned with the project helper (n_classes=2).
X_cat = X_num_scaled.copy()
X_cat_binned = prep.binning(X_cat, n_classes=2)

# In[100]:

X_cat_binned.head()  # notebook preview; has no effect when run as a script

# In[101]:

# Encode the binned data:
X_cat_encoded = prep.encode(X_cat_binned)
X_cat_encoded.head()  # notebook preview

# In[102]:

# New data type:
X_cat_encoded.dtypes  # notebook preview of the column dtypes

# In[103]:

# PCA with categorical data
# Number of components passed to the project plotting helper.
n_comp = 2
prep.graph_components(X_cat_encoded,
                      n_components=n_comp)  # we can't visualize right now
# In[104]: