# Tail of the preceding categorical/binary encoding loop (loop header lies
# outside this chunk — TODO confirm): keep a copy of the fitted encoder so the
# transform can be inverted later.
le_dict[colname] = deepcopy(le)

# Encode the ordinal variables; modalities have been sorted (at best).
# 'capital.gain' and 'capital.loss' are also treated as ordinal here.
for ord_col in ['education.num', 'capital.gain', 'capital.loss']:
    ord_le = LabelEncoder()
    train[ord_col] = ord_le.fit_transform(train[ord_col])
    le_dict[ord_col] = deepcopy(ord_le)

# Per-feature modality counts, split by distribution family
nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')

# Feature category (cf): cast each column to the dtype associated with its
# distribution family.
dtype = {train.columns[j]: dtypes_dict[var_distrib[j]] for j in range(p)}
train = train.astype(dtype, copy=True)
numobs = len(train)

# Defining distances over the features
dm = gower_matrix(train, cat_features=cat_features)
#*****************************************************************
# Label-encode the binary (bernoulli) and ordinal columns of full_contra.
# The fitted encoders are deliberately not stored here (the le_dict lines were
# already commented out in the original): only the transformed values matter.
for col_idx, colname in enumerate(full_contra.columns):
    if var_distrib[col_idx] in ('bernoulli', 'ordinal'):
        col_le = LabelEncoder()
        full_contra[colname] = col_le.fit_transform(full_contra[colname])

# Per-feature modality counts, split by distribution family
nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')
p_new = full_contra.shape[1]

# Cast each column to the dtype associated with its distribution family
dtype = {full_contra.columns[j]: dtypes_dict[var_distrib[j]]
         for j in range(p_new)}
full_contra = full_contra.astype(dtype, copy=True)

# Feature category (cf)
cat_features = var_distrib == 'categorical'

# Defining distance matrix
dm3 = gower_matrix(full_contra, cat_features=cat_features)
#===========================================#
#===========================================#
# Distribution family of each of the 8 features: the first is ordinal
# (already encoded), the remaining 7 are continuous.
var_distrib = np.array(['ordinal', 'continuous', 'continuous', 'continuous',
                        'continuous', 'continuous', 'continuous', 'continuous'])

# Ordinal data already encoded
y_categ_non_enc = deepcopy(y)
vd_categ_non_enc = deepcopy(var_distrib)

# No categ data
# No binary data

# One-hot encode the labels, dropping the first level.
# NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — update this call if the environment's sklearn is recent.
enc = OneHotEncoder(sparse=False, drop='first')
labels_oh = enc.fit_transform(np.array(labels).reshape(-1, 1)).flatten()

# Per-feature modality counts, split by distribution family
nj, nj_bin, nj_ord, n_categ = compute_nj(y, var_distrib)
y_np = y.values
nb_cont = np.sum(var_distrib == 'continuous')
p_new = y.shape[1]

# Feature category (cf): mask of the non-encoded categorical/binary features
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset.
# BUGFIX: `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` is the exact documented replacement.
y_nenc_typed = y_categ_non_enc.astype(object)
y_np_nenc = y_nenc_typed.values

# Defining distances over the non encoded features
dm = gower_matrix(y_nenc_typed, cat_features=cf_non_enc)