def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset.
    Categorical features are ordinal-encoded and noncategorical features are standardized before clustering.
    Centroids are converted back to the original feature values before being saved.
    :param k: Desired number of clusters. If None, cfg['K-PROTOTYPES']['K'] is used
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs. This function attaches
             the extra attributes `samples`, `labels` and `dist` to it.
    '''
    # Context manager ensures the config file handle is closed once parsed
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df, save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)    # Features we don't want to see in clustering
    client_feats_df = client_df.copy()      # Unencoded copy (still includes CONTRACT_ACCOUNT) for the clusters sheet
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X[:, numcl_feat_idxs] = std_scaler.transform(X_noncat)

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    # Mixed distance between two samples: numerical dissimilarity plus gamma-weighted categorical dissimilarity
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    # Enforce that cluster labels are integer range of [1, K]. The increment is in place, so
    # k_prototypes.labels (the same object) is shifted as well.
    client_clusters += 1
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    # CONTRACT_ACCOUNT is intentionally kept as a regular column: the sheet is written with index=False,
    # so moving it into the index would drop the client IDs from the saved file.

    # Get centroids of clusters
    num_centroids = k_prototypes.cluster_centroids_[0]      # Numerical features
    cat_centroids = k_prototypes.cluster_centroids_[1]      # Categorical features
    cluster_centroids = np.empty((num_centroids.shape[0], num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = num_centroids
    cluster_centroids[:, cat_feat_idxs] = cat_centroids

    # Scale noncategorical features of the centroids back to original range
    cluster_centroids[:, numcl_feat_idxs] = std_scaler.inverse_transform(cluster_centroids[:, numcl_feat_idxs])

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    # Translate ordinal codes of categorical features back to their original category values
    for feat, categories in zip(cat_feats, ordinal_encoder.categories_):
        centroids_df[feat] = centroids_df[feat].map(dict(enumerate(categories)))
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    n_clusters = cluster_centroids.shape[0]
    centroids_df.insert(0, 'Cluster', pd.Series(np.arange(1, n_clusters + 1)))

    # Get fraction of clients in each cluster. minlength keeps one entry per cluster even if the
    # highest-numbered clusters received no clients; entry 0 (unused label 0) is skipped below.
    cluster_freqs = np.bincount(client_clusters, minlength=n_clusters + 1) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet.
    # A single timestamp is shared so both files from one run carry the same suffix.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + timestamp + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + timestamp + '.csv',
                           index_label=False, index=False)
    return k_prototypes
def cluster_clients(k=None, save_centroids=True, save_clusters=True, explain_centroids=True):
    '''
    Runs k-prototype clustering algorithm on preprocessed dataset, then adds the model's prediction
    (and optionally a LIME explanation) for each cluster centroid.
    :param k: Desired number of clusters. If None, cfg['K-PROTOTYPES']['K'] is used
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :param explain_centroids: Boolean indicating whether to compute LIME explanations for cluster centroids
    :return: A KPrototypes object that describes the best clustering of all the runs, with the extra
             attributes `samples`, `labels` and `dist` attached. Returns None if the processed data,
             the data info file, or a saved model/explainer object cannot be found.
    '''
    # Context manager ensures the config file handle is closed once parsed
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)

    # Load preprocessed client data
    try:
        df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['PROCESSED_DATA'] +
              ". Run preprocessing script before running this script.")
        return
    client_ids = df.pop('ClientID').tolist()
    has_dates = cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp'    # Only this model definition has a Date column
    if has_dates:
        dates = df.pop('Date').tolist()
    df.drop('GroundTruth', axis=1, inplace=True)
    X = np.array(df)

    # Load feature info
    try:
        with open(cfg['PATHS']['DATA_INFO'], 'r') as data_info_file:
            data_info = yaml.full_load(data_info_file)
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['DATA_INFO'] +
              ". Run preprocessing script before running this script.")
        return

    # Get list of categorical feature indices
    noncat_feat_idxs = [df.columns.get_loc(c) for c in data_info['NON_CAT_FEATURES'] if c in df]
    cat_feat_idxs = [i for i in range(len(df.columns)) if i not in noncat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, noncat_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X[:, noncat_feat_idxs] = std_scaler.transform(X_noncat)

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    # Mixed distance between two samples: numerical dissimilarity plus gamma-weighted categorical dissimilarity
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[noncat_feat_idxs], axis=0),
                                np.expand_dims(x1[noncat_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    # Enforce that cluster labels are integer range of [1, K]. The increment is in place, so
    # k_prototypes.labels (the same object) is shifted as well.
    client_clusters += 1

    # Identifier columns are intentionally kept as regular columns: the sheet is written with index=False,
    # so moving them into the index would drop them from the saved file.
    if has_dates:
        clusters_df = pd.DataFrame({'ClientID': client_ids, 'Date': dates, 'Cluster Membership': client_clusters})
    else:
        clusters_df = pd.DataFrame({'ClientID': client_ids, 'Cluster Membership': client_clusters})

    # Get centroids of clusters
    num_centroids = k_prototypes.cluster_centroids_[0]      # Numerical features
    cat_centroids = k_prototypes.cluster_centroids_[1]      # Categorical features
    cluster_centroids = np.zeros((num_centroids.shape[0], num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, noncat_feat_idxs] = num_centroids
    cluster_centroids[:, cat_feat_idxs] = cat_centroids

    # Scale noncategorical features of the centroids back to original range
    cluster_centroids[:, noncat_feat_idxs] = std_scaler.inverse_transform(cluster_centroids[:, noncat_feat_idxs])

    # Create a DataFrame of cluster centroids
    cluster_centroids = np.rint(cluster_centroids)      # Round centroids to nearest int
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(df.columns))
    # Replace ordinal codes of single-valued categorical features with their original values
    for i, idx in enumerate(data_info['SV_CAT_FEATURE_IDXS']):
        ordinal_encoded_vals = cluster_centroids[:, idx].astype(int)
        original_vals = [data_info['SV_CAT_VALUES'][idx][v] for v in ordinal_encoded_vals]
        centroids_df[data_info['SV_CAT_FEATURES'][i]] = original_vals
    n_clusters = cluster_centroids.shape[0]
    centroids_df.insert(0, 'Cluster', pd.Series(np.arange(1, n_clusters + 1)))

    # Get fraction of clients in each cluster. minlength keeps one entry per cluster even if the
    # highest-numbered clusters received no clients; entry 0 (unused label 0) is skipped below.
    cluster_freqs = np.bincount(client_clusters, minlength=n_clusters + 1) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Load objects necessary for prediction and explanations
    try:
        scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
        ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])
        # NOTE: unpickling can execute arbitrary code; only load explainer files produced by a trusted run
        with open(cfg['PATHS']['LIME_EXPLAINER'], 'rb') as explainer_file:
            explainer = dill.load(explainer_file)
        model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)
    except FileNotFoundError as not_found_err:
        # str() guards against exceptions raised without a filename attached
        print('File "' + str(not_found_err.filename) +
              '" was not found. Ensure you have trained a model and run LIME before running this script.')
        return

    # Add model's prediction of centroids (classes and prediction probabilities) to the DataFrame
    predicted_classes = []
    prediction_probs = []
    print("Obtaining model's predictions for cluster centroids.")
    for i in tqdm(range(len(cluster_centroids))):
        x = np.expand_dims(cluster_centroids[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T, axis=1)     # Predict centroid
        prediction = 1 if y[1] >= cfg['PREDICTION']['THRESHOLD'] else 0    # Model's classification
        predicted_classes.append(cfg['PREDICTION']['CLASS_NAMES'][prediction])
        prediction_probs.append(y[1] * 100)     # Include as a percentage
    centroids_df.insert(centroids_df.shape[1], 'At risk of chronic homelessness', pd.Series(predicted_classes))
    centroids_df.insert(centroids_df.shape[1], 'Probability of chronic homelessness [%]',
                        pd.Series(prediction_probs))

    # Predict and explain the cluster centroids
    if explain_centroids:
        model_def = cfg['TRAIN']['MODEL_DEF'].upper()
        num_samples = cfg['LIME'][model_def]['NUM_SAMPLES']
        num_features = cfg['LIME'][model_def]['NUM_FEATURES']
        exp_rows = []
        explanations = []
        print('Creating explanations for cluster centroids.')
        for i in tqdm(range(cluster_centroids.shape[0])):
            exp = predict_and_explain(cluster_centroids[i], model, explainer, ohe_ct_sv, scaler_ct,
                                      num_features, num_samples)
            explanations.append(exp)
            exp_tuples = exp.as_list()
            row = []
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < num_features:
                # Fill with empty space if explanation too small
                row.extend([''] * (2 * (num_features - len(exp_tuples))))
            exp_rows.append(row)
        exp_col_names = []
        for i in range(num_features):
            exp_col_names.extend(['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])
        exp_df = pd.DataFrame(exp_rows, columns=exp_col_names)
        centroids_df = pd.concat([centroids_df, exp_df], axis=1, sort=False)    # Concatenate client features and explanations

        # Visualize clusters' LIME explanations
        predictions = centroids_df[['At risk of chronic homelessness',
                                    'Probability of chronic homelessness [%]']].to_numpy()
        # NOTE(review): cluster_freqs still carries the leading entry for unused label 0 -
        # confirm visualize_cluster_explanations expects that layout
        visualize_cluster_explanations(explanations, predictions, cluster_freqs,
                                       'Explanations for k-prototypes clusters',
                                       cfg['PATHS']['IMAGES'] + 'centroid_explanations_')

    # Save centroid features and explanations to spreadsheet.
    # A single timestamp is shared so both files from one run carry the same suffix.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + timestamp + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + timestamp + '.csv',
                           index_label=False, index=False)
    return k_prototypes