Esempio n. 1
0
                    le_dict[colname] = deepcopy(le)

            # Label-encode the ordinal column 'education.num' in place and keep
            # a copy of the fitted encoder in le_dict under the column name.
            # Original note: modalities have been sorted (at best).

            ord_le = LabelEncoder()
            train['education.num'] = ord_le.fit_transform(
                train['education.num'])
            le_dict['education.num'] = deepcopy(ord_le)

            # Treat capital.gain and capital.loss as ordinal variables as well:
            # each column is replaced by its integer codes and the fitted
            # encoder is stored in le_dict.
            for col in ['capital.gain', 'capital.loss']:
                le = LabelEncoder()
                train[col] = le.fit_transform(train[col])
                le_dict[col] = deepcopy(le)

            # compute_nj is defined outside this excerpt; presumably it returns
            # the number of modalities of the variables, overall and per
            # binary / ordinal / categorical family -- TODO confirm.
            nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
            # Number of entries of var_distrib tagged 'continuous'.
            nb_cont = np.sum(var_distrib == 'continuous')

            # Target dtype of each of the first p columns, looked up in
            # dtypes_dict from the distribution declared in var_distrib.
            dtype = {
                train.columns[j]: dtypes_dict[var_distrib[j]]
                for j in range(p)
            }

            # Cast the columns accordingly (astype returns a new object).
            train = train.astype(dtype, copy=True)
            # Number of rows of the training set.
            numobs = len(train)

            # Distance matrix over the features; cat_features is defined
            # outside this excerpt (presumably a mask of the categorical
            # columns -- TODO confirm).
            dm = gower_matrix(train, cat_features=cat_features)

            #*****************************************************************
Esempio n. 2
0
# Label-encode the binary ('bernoulli') columns first, then the ordinal ones.
# Every selected column is overwritten in place with its integer codes; the
# fitted encoders are not stored anywhere.
for _target_kind in ('bernoulli', 'ordinal'):
    for col_idx, colname in enumerate(full_contra.columns):
        if var_distrib[col_idx] != _target_kind:
            continue
        le = LabelEncoder()
        full_contra[colname] = le.fit_transform(full_contra[colname])

# compute_nj is defined elsewhere; presumably it returns the number of
# modalities of the variables, overall and per binary / ordinal / categorical
# family -- TODO confirm.
nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)

# Number of entries of var_distrib tagged 'continuous'
nb_cont = np.sum(var_distrib == 'continuous')

# Number of columns of the dataset
p_new = full_contra.shape[1]

# Cast each column to the dtype registered in dtypes_dict for its distribution
dtype = {
    column_label: dtypes_dict[var_distrib[j]]
    for j, column_label in enumerate(full_contra.columns)
}
full_contra = full_contra.astype(dtype, copy=True)

# Feature category (cf): boolean mask flagging the 'categorical' variables
cat_features = var_distrib == 'categorical'

# Defining distance matrix
dm3 = gower_matrix(full_contra, cat_features=cat_features)

#===========================================#
Esempio n. 3
0
#===========================================#
# Distribution declared for each column of y: one ordinal variable followed by
# seven continuous ones.
var_distrib = np.array([
    'ordinal', 'continuous', 'continuous', 'continuous',
    'continuous', 'continuous', 'continuous', 'continuous',
])

# Ordinal data already encoded

# Independent copies of the data and of the distribution vector, used further
# down as the "non encoded" version of the dataset.
y_categ_non_enc = deepcopy(y)
vd_categ_non_enc = deepcopy(var_distrib)

# No categ data
# No binary data

# One-hot encode the labels as a dense array, dropping the first category.
# The dense-output keyword is `sparse_output` from scikit-learn 1.2 onwards;
# the older `sparse` spelling was removed in 1.4, so try the new name first
# and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    enc = OneHotEncoder(sparse=False, drop='first')
# NOTE(review): flatten() yields one value per observation only if a single
# column remains after drop='first' (i.e. two label classes) -- confirm.
labels_oh = enc.fit_transform(np.array(labels).reshape(-1, 1)).flatten()

# compute_nj is defined elsewhere; presumably it returns the number of
# modalities of the variables, overall and per family -- TODO confirm.
# NOTE(review): the last result is bound to `n_categ`; the name is kept as-is
# because code outside this excerpt may rely on it.
nj, nj_bin, nj_ord, n_categ = compute_nj(y, var_distrib)
y_np = y.values
# Number of entries of var_distrib tagged 'continuous'
nb_cont = np.sum(var_distrib == 'continuous')

# Number of columns of the dataset
p_new = y.shape[1]

# Feature category (cf): mask of the variables declared categorical or binary
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (the alias was the builtin itself).
y_nenc_typed = y_categ_non_enc.astype(object)
y_np_nenc = y_nenc_typed.values

# Defining distances over the non encoded features
dm = gower_matrix(y_nenc_typed, cat_features=cf_non_enc)