コード例 #1
0
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset.

    Reads "config.yml" from the current working directory, loads the client data CSV (re-running
    preprocessing from raw data if the CSV is missing), ordinal-encodes the categorical features,
    standardizes the remaining features and fits a KPrototypes model. A table of client cluster
    assignments and a table of cluster centroids (mapped back to original scale / category values)
    are built and optionally written to timestamped CSV files.
    :param k: Desired number of clusters. If None, cfg['K-PROTOTYPES']['K'] is used
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs. The attributes
             `samples`, `labels` and `dist` are attached to it before it is returned.
    '''
    # Use a context manager so the config file handle is closed deterministically
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df, save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)   # Features we don't want to see in clustering
    client_feats_df = client_df.copy()   # Unencoded features, merged into the cluster assignments later
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    cat_feat_idx_set = set(cat_feat_idxs)   # Set gives O(1) membership tests below
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idx_set]

    # Normalize noncategorical features
    std_scaler = StandardScaler().fit(X[:, numcl_feat_idxs])
    X[:, numcl_feat_idxs] = std_scaler.transform(X[:, numcl_feat_idxs])

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)

    # Attach the data, labels and a mixed-type distance function to the returned object.
    # Distance = numerical dissimilarity + gamma * categorical dissimilarity.
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: (
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0))
        + k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                       np.expand_dims(x1[cat_feat_idxs], axis=0))
    )

    # NOTE(review): this is an in-place add, so (assuming fit_predict returns a NumPy array) the
    # `k_prototypes.labels` attribute set above refers to the same array and becomes 1-based too.
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    # CONTRACT_ACCOUNT deliberately stays a regular column: the CSV below is written with index=False
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')

    # Get centroids of clusters
    num_centroids, cat_centroids = k_prototypes.cluster_centroids_[0], k_prototypes.cluster_centroids_[1]
    cluster_centroids = np.empty((num_centroids.shape[0], num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = num_centroids  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = cat_centroids  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    cluster_centroids[:, numcl_feat_idxs] = std_scaler.inverse_transform(cluster_centroids[:, numcl_feat_idxs])

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    for feat, categories in zip(cat_feats, ordinal_encoder.categories_):
        # Map ordinal codes back to the original category values
        centroids_df[feat] = centroids_df[feat].map(dict(enumerate(categories)))
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    n_centroids = cluster_centroids.shape[0]
    centroids_df.insert(0, 'Cluster', pd.Series(np.arange(1, n_centroids + 1)))

    # Get fraction of clients in each cluster. minlength guarantees one entry per cluster even if the
    # highest-numbered clusters received no clients; index 0 is unused because labels start at 1.
    cluster_freqs = np.bincount(client_clusters, minlength=n_centroids + 1) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet.
    # A single timestamp is used so both files from one run share the same suffix.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + timestamp + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + timestamp + '.csv',
                           index_label=False, index=False)
    return k_prototypes
コード例 #2
0
def cluster_clients(k=None,
                    save_centroids=True,
                    save_clusters=True,
                    explain_centroids=True):
    '''
    Runs k-prototype clustering algorithm on preprocessed dataset.

    Reads "config.yml" from the current working directory, loads the preprocessed data and feature
    info, standardizes noncategorical features and fits a KPrototypes model. The cluster centroids
    are then passed through the trained model for a prediction and (optionally) a LIME explanation.
    Centroids and client cluster assignments are optionally written to timestamped CSV files.
    :param k: Desired number of clusters. If None, cfg['K-PROTOTYPES']['K'] is used
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :param explain_centroids: Boolean indicating whether to compute LIME explanations for cluster centroids
    :return: A KPrototypes object that describes the best clustering of all the runs, with the
             attributes `samples`, `labels` and `dist` attached. None is returned (after printing a
             message) if the preprocessed data, the feature info, or a prediction/explanation
             artifact cannot be found.
    '''
    # Use a context manager so the config file handle is closed deterministically
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)
    is_rnn_model = cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp'

    # Load preprocessed client data
    try:
        df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['PROCESSED_DATA'] +
              ". Run preprocessing script before running this script.")
        return
    client_ids = df.pop('ClientID').tolist()
    if is_rnn_model:
        dates = df.pop('Date').tolist()
    df.drop('GroundTruth', axis=1, inplace=True)
    X = np.array(df)

    # Load feature info
    try:
        with open(cfg['PATHS']['DATA_INFO'], 'r') as data_info_file:
            data_info = yaml.full_load(data_info_file)
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['DATA_INFO'] +
              ". Run preprocessing script before running this script.")
        return

    # Get list of categorical feature indices: every column not listed as noncategorical
    noncat_feat_idxs = [
        df.columns.get_loc(c) for c in data_info['NON_CAT_FEATURES'] if c in df
    ]
    noncat_feat_idx_set = set(noncat_feat_idxs)  # Set gives O(1) membership tests below
    cat_feat_idxs = [
        i for i in range(len(df.columns)) if i not in noncat_feat_idx_set
    ]

    # Normalize noncategorical features
    std_scaler = StandardScaler().fit(X[:, noncat_feat_idxs])
    X[:, noncat_feat_idxs] = std_scaler.transform(X[:, noncat_feat_idxs])

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k,
                               verbose=1,
                               n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'],
                               init='Cao',
                               num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)

    # Attach the data, labels and a mixed-type distance function to the returned object.
    # Distance = numerical dissimilarity + gamma * categorical dissimilarity.
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: (
        k_prototypes.num_dissim(np.expand_dims(x0[noncat_feat_idxs], axis=0),
                                np.expand_dims(x1[noncat_feat_idxs], axis=0))
        + k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                       np.expand_dims(x1[cat_feat_idxs], axis=0))
    )

    # NOTE(review): this is an in-place add, so (assuming fit_predict returns a NumPy array) the
    # `k_prototypes.labels` attribute set above refers to the same array and becomes 1-based too.
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]

    # ID columns deliberately stay regular columns: the CSV below is written with index=False
    if is_rnn_model:
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Date': dates,
            'Cluster Membership': client_clusters
        })
    else:
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Cluster Membership': client_clusters
        })

    # Get centroids of clusters, placing each feature back at its original column position
    num_centroids, cat_centroids = k_prototypes.cluster_centroids_[0], k_prototypes.cluster_centroids_[1]
    cluster_centroids = np.zeros((num_centroids.shape[0],
                                  num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, noncat_feat_idxs] = num_centroids  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = cat_centroids  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    cluster_centroids[:, noncat_feat_idxs] = std_scaler.inverse_transform(
        cluster_centroids[:, noncat_feat_idxs])

    # Create a DataFrame of cluster centroids
    cluster_centroids = np.rint(cluster_centroids)  # Round centroids to nearest int
    n_centroids = cluster_centroids.shape[0]
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(df.columns))
    for i, idx in enumerate(data_info['SV_CAT_FEATURE_IDXS']):
        # Replace ordinal codes of single-valued categorical features with their original values
        ordinal_encoded_vals = cluster_centroids[:, idx].astype(int)
        original_vals = [
            data_info['SV_CAT_VALUES'][idx][v] for v in ordinal_encoded_vals
        ]
        centroids_df[data_info['SV_CAT_FEATURES'][i]] = original_vals
    centroids_df.insert(0, 'Cluster', pd.Series(np.arange(1, n_centroids + 1)))

    # Get fraction of clients in each cluster. minlength guarantees one entry per cluster even if the
    # highest-numbered clusters received no clients; index 0 is unused because labels start at 1.
    cluster_freqs = np.bincount(client_clusters, minlength=n_centroids + 1) / float(
        client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Load objects necessary for prediction and explanations
    try:
        scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
        ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])
        with open(cfg['PATHS']['LIME_EXPLAINER'], 'rb') as explainer_file:
            explainer = dill.load(explainer_file)
        model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)
    except FileNotFoundError as not_found_err:
        # str() guards against a None filename, which would otherwise raise TypeError here
        print(
            'File "' + str(not_found_err.filename) +
            '" was not found. Ensure you have trained a model and run LIME before running this script.'
        )
        return

    # Add model's prediction of centroids (classes and prediction probabilities) to the DataFrame
    predicted_classes = []
    prediction_probs = []
    print("Obtaining model's predictions for cluster centroids.")
    for i in tqdm(range(n_centroids)):
        x = np.expand_dims(cluster_centroids[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T,
                       axis=1)  # Predict centroid
        # Model's classification: positive class when y[1] reaches the configured threshold
        prediction = 1 if y[1] >= cfg['PREDICTION']['THRESHOLD'] else 0
        predicted_classes.append(cfg['PREDICTION']['CLASS_NAMES'][prediction])
        prediction_probs.append(y[1] * 100)  # Include as a percentage
    centroids_df.insert(centroids_df.shape[1],
                        'At risk of chronic homelessness',
                        pd.Series(predicted_classes))
    centroids_df.insert(centroids_df.shape[1],
                        'Probability of chronic homelessness [%]',
                        pd.Series(prediction_probs))

    # Predict and explain the cluster centroids
    if explain_centroids:
        model_def = cfg['TRAIN']['MODEL_DEF'].upper()
        num_samples = cfg['LIME'][model_def]['NUM_SAMPLES']
        num_features = cfg['LIME'][model_def]['NUM_FEATURES']
        exp_rows = []
        explanations = []
        print('Creating explanations for cluster centroids.')
        for i in tqdm(range(n_centroids)):
            exp = predict_and_explain(cluster_centroids[i], model, explainer,
                                      ohe_ct_sv, scaler_ct, num_features,
                                      num_samples)
            explanations.append(exp)
            exp_tuples = exp.as_list()
            row = []
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < num_features:
                # Fill with empty space if explanation too small
                row.extend([''] * (2 * (num_features - len(exp_tuples))))
            exp_rows.append(row)
        exp_col_names = []
        for i in range(num_features):
            exp_col_names.extend(
                ['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])
        exp_df = pd.DataFrame(exp_rows, columns=exp_col_names)
        centroids_df = pd.concat(
            [centroids_df, exp_df], axis=1,
            sort=False)  # Concatenate client features and explanations

        # Visualize clusters' LIME explanations
        predictions = centroids_df[[
            'At risk of chronic homelessness',
            'Probability of chronic homelessness [%]'
        ]].to_numpy()
        visualize_cluster_explanations(
            explanations, predictions, cluster_freqs,
            'Explanations for k-prototypes clusters',
            cfg['PATHS']['IMAGES'] + 'centroid_explanations_')

    # Save centroid features and explanations to spreadsheet.
    # A single timestamp is used so both files from one run share the same suffix.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] +
                            timestamp + '.csv',
                            index_label=False,
                            index=False)

    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] +
                           timestamp + '.csv',
                           index_label=False,
                           index=False)
    return k_prototypes