def clean_sick(df, encoder=None):
    """
    Clean mixed data: sick (thyroid) dataset.

    The frame is split with ``eda.split`` into numerical features,
    categorical features and the ``Class`` response.  The numerical part
    has its ``TBG`` column dropped, missing values replaced by the column
    median and every column min-max scaled.  The categorical part is
    passed to ``prep.encode``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.  Must contain the categorical columns listed in
        ``cat_features``, a numerical ``TBG`` column and the ``Class``
        response column.
    encoder : optional
        Forwarded unchanged to ``prep.encode``.
        NOTE(review): presumably a previously fitted encoder to be reused
        on new data -- confirm against ``prep.encode``.

    Returns
    -------
    tuple
        ``(X_num_scaled, X_cat_encoded, y, encoder)``.  The first three
        items are wrapped in ``pandas.DataFrame``; ``encoder`` is the
        second value returned by ``prep.encode``.
    """
    from sklearn.impute import SimpleImputer

    cat_features = [
        'sex',
        'on_thyroxine',
        'query_on_thyroxine',
        'on_antithyroid_medication',
        'sick',
        'pregnant',
        'thyroid_surgery',
        'I131_treatment',
        'query_hypothyroid',
        'query_hyperthyroid',
        'lithium',
        'goitre',
        'tumor',
        'hypopituitary',
        'psych',
        'TSH_measured',
        'T3_measured',
        'TT4_measured',
        'T4U_measured',
        'FTI_measured',
        'TBG_measured',
        'referral_source',
    ]  # for sick dataset
    response = 'Class'  # for sick dataset

    splits, _ = eda.split(df, cat_features=cat_features, response=response)

    # Drop columns with many nan.  A new frame is built instead of dropping
    # in place, so the object handed back by eda.split is left untouched.
    X_num = splits['X_num'].drop(['TBG'], axis=1)
    X_cat = splits['X_cat']
    y = splits['y'][response].values

    # Replace missing values by the median of the column.
    imputer = SimpleImputer(strategy='median')
    X_num_no_nan = imputer.fit_transform(X_num)

    # The imputer returns a plain array; convert it to a data frame again.
    X_num_ok = pd.DataFrame(X_num_no_nan, columns=X_num.columns)

    # Min-max scaling.  A constant column has a zero range, which would
    # turn the whole column into NaN (0 / 0); dividing by 1 instead maps
    # such a column to 0.
    col_min = X_num_ok.min()
    col_range = (X_num_ok.max() - col_min).replace(0, 1)
    X_num_scaled = (X_num_ok - col_min) / col_range

    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded, encoder = prep.encode(X_cat, encoder)

    return (
        pd.DataFrame(X_num_scaled),
        pd.DataFrame(X_cat_encoded),
        pd.DataFrame(y),
        encoder,
    )
def clean_sick2(df):
    """
    Clean mixed data: sick (thyroid) dataset, with outlier removal.

    The frame is split with ``eda.split`` into numerical features,
    categorical features and the ``Class`` response.  The numerical part
    has its ``TBG`` column dropped and missing values replaced by the
    column mean.  Rows in which any numerical feature has an absolute
    z-score of 3 or more are then discarded from the numerical features,
    the categorical features and the response alike, so the three returned
    objects describe the same samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.  Must contain the categorical columns listed in
        ``cat_features``, a numerical ``TBG`` column and the ``Class``
        response column.

    Returns
    -------
    tuple
        ``(X_num_scaled, X_cat_encoded, y)``: the output of ``prep.scale``
        on the numerical features, the output of ``prep.encode`` on the
        categorical features, and the response values as an array.
    """
    cat_features = [
        'sex',
        'on_thyroxine',
        'query_on_thyroxine',
        'on_antithyroid_medication',
        'sick',
        'pregnant',
        'thyroid_surgery',
        'I131_treatment',
        'query_hypothyroid',
        'query_hyperthyroid',
        'lithium',
        'goitre',
        'tumor',
        'hypopituitary',
        'psych',
        'TSH_measured',
        'T3_measured',
        'TT4_measured',
        'T4U_measured',
        'FTI_measured',
        'TBG_measured',
        'referral_source',
    ]  # for sick dataset
    response = 'Class'  # for sick dataset

    splits, _ = eda.split(df, cat_features=cat_features, response=response)
    X_cat = splits['X_cat']

    # Drop columns with many nan.  A new frame is built instead of dropping
    # in place, so the object handed back by eda.split is left untouched.
    X_num = splits['X_num'].drop(['TBG'], axis=1)
    X_num = X_num.fillna(X_num.mean())

    # Outliers: keep only the rows where every numerical feature lies
    # within 3 standard deviations of its column mean.
    rows_to_keep = (np.abs(stats.zscore(X_num)) < 3).all(axis=1)
    X_num = X_num[rows_to_keep].copy()
    y = splits['y'][rows_to_keep][response].values

    # Scaling
    X_num_scaled = prep.scale(X_num)

    # The categorical features must be filtered with the same mask,
    # otherwise their rows no longer line up with X_num_scaled and y.
    X_cat = X_cat[rows_to_keep].copy()
    pd.options.mode.chained_assignment = None

    # Encoding
    X_cat_encoded = prep.encode(X_cat)

    return X_num_scaled, X_cat_encoded, y
def clean_cmc(df):
    """
    Clean mixed data: cmc.

    Splits ``df`` with ``eda.split`` (response column ``class``), keeps
    only the rows whose numerical features all have an absolute z-score
    below 3, scales the numerical features with ``prep.scale``, merges two
    categorical levels and encodes the categorical features with
    ``prep.encode``.

    Returns ``(X_num_scaled, X_cat_encoded, y)``.
    """
    categorical_cols = [
        'weducation',
        'heducation',
        'wreligion',
        'wworking',
        'hoccupation',
        'living_index',
        'media_exposure',
    ]
    parts, _ = eda.split(df, cat_features=categorical_cols, response='class')
    numeric = parts['X_num']
    categorical = parts['X_cat']

    # Outliers: the mask is True for the rows that are kept.
    print(f'# Samples before removing outliers: {len(numeric)}')
    abs_z = np.abs(stats.zscore(numeric))
    inlier_mask = (abs_z < 3).all(axis=1)
    numeric = numeric[inlier_mask].copy()
    print(f'# Samples after removing outliers: {len(numeric)}')

    y = parts['y'][inlier_mask]['class'].values

    # Scaling
    X_num_scaled = prep.scale(numeric)

    # Removing categ. levels: heducation 1 -> 2 and hoccupation 4 -> 3.
    categorical = categorical[inlier_mask].copy()
    pd.options.mode.chained_assignment = None
    level_merges = (
        ('heducation', 1, 2),
        ('hoccupation', 4, 3),
    )
    for column, old_level, new_level in level_merges:
        categorical.loc[categorical[column] == old_level, column] = new_level

    # Encoding
    X_cat_encoded = prep.encode(categorical)

    return X_num_scaled, X_cat_encoded, y
# Every column other than the response is used as a feature.
target = 'Class'
features = [name for name in df.columns if name != target]

# ## Encoding

# In[5]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# The response is label-encoded into a single-column frame; the features
# go through the project's prep.encode helper.
label_encoder = LabelEncoder()
y_encoded = pd.DataFrame(
    label_encoder.fit_transform(df[target]),
    columns=['Class'],
)

df_OHE = prep.encode(df[features])
df_OHE.head()

# ## PCA (own implementation)
#

# In[6]:

# X holds one feature per row and one sample per column:
# d is the number of rows of X, N the number of columns.
X = np.transpose(df_OHE.values)
d, N = X.shape

# In[7]:

# Per-row mean of X, kept as a (d, 1) column vector.
means = np.mean(X, axis=1, keepdims=True)
# # First, find binary categorical features as they don't need to be encoded (they already have 2 unique values). # # # Then, encoding the remaining categorical features with more than 2 levels (categories). # # The chosen method is One Hot Encoder. That's because, the number of categories of each categorical feature is at most 4. That's why sparseness (matrix), as a result of the encoding, will be less intense than having many levels. Binary Encoding approach might be used for the latter case. # # References: # # https://www.datacamp.com/community/tutorials/categorical-data # In[111]: X_cat_encoded = prep.encode(X_cat) X_cat_encoded.head() # Join both numerical and categorical variables # # In[112]: X_scaled_encoded = prep.join_features(X_num_scaled, X_cat_encoded) X_scaled_encoded.head() # ### Visualize the clusters # # In order to visualize the clusters, we need to reduce the dimensionality of **X**. We can reduce the dimensions to 3 or 2 as it is not humanly possible to see dimensions greater than those. #
# As we're dealing with numerical data type we need to transform it to category data type or string as K-modes is used in order to classify categorical data. # In[99]: # Data binning: X_cat = X_num_scaled.copy() X_cat_binned = prep.binning(X_cat, n_classes=2) # In[100]: X_cat_binned.head() # In[101]: # Encode the binned data: X_cat_encoded = prep.encode(X_cat_binned) X_cat_encoded.head() # In[102]: # New data type: X_cat_encoded.dtypes # In[103]: # PCA with categorical data n_comp = 2 prep.graph_components(X_cat_encoded, n_components=n_comp) # we can't visualize right now # In[104]: