Beispiel #1
0
def get_knee_results(data, cluster_lims, cores, categorical):
    """Scan a range of cluster counts with K-Prototypes and pick the knee.

    One ``KPrototypes`` model (``init="cao"``, ``random_state=0``) is fitted
    on ``data[cols]`` for every cluster count in ``range(*cluster_lims)`` and
    its ``cost_`` is collected. ``KneeLocator`` is then run on the resulting
    convex, decreasing cost curve.

    Side effects: the located knee is written to ``OUT_DIR / "n_clusters.txt"``
    and the cost per cluster count to ``OUT_DIR / "knee_results.csv"``.

    NOTE(review): ``cols`` and ``OUT_DIR`` are not defined in this function;
    they are presumably module-level names -- confirm.

    Args:
        data: Indexable by ``cols``; the selection is passed to
            ``KPrototypes.fit``.
        cluster_lims: Arguments unpacked into ``range`` to build the
            candidate cluster counts.
        cores: Forwarded as ``n_jobs`` to ``KPrototypes``.
        categorical: Forwarded as ``categorical`` to ``KPrototypes.fit``.

    Returns:
        The ``knee`` attribute of the ``KneeLocator``.
    """

    def _cost_for(k):
        # Fit a fresh model for this cluster count and report its cost.
        model = KPrototypes(k, init="cao", random_state=0, n_jobs=cores)
        model.fit(data[cols], categorical=categorical)
        return model.cost_

    candidate_ks = range(*cluster_lims)
    costs = [_cost_for(k) for k in tqdm(candidate_ks)]

    locator = KneeLocator(
        candidate_ks,
        costs,
        curve_nature="convex",
        curve_direction="decreasing",
    )
    best_k = locator.knee

    # Persist the chosen cluster count first, then the full cost curve.
    with open(OUT_DIR / "n_clusters.txt", "w") as out_file:
        out_file.write(str(best_k))

    cost_curve = pd.Series(data=costs, index=candidate_ks)
    cost_curve.to_csv(OUT_DIR / "knee_results.csv", header=False)

    return best_k
Beispiel #2
0
 def choose_from_metric(metric, k_values):
     """Select a k from ``k_values`` based on the ``metric`` curve.

     ``chosen_k`` is a free variable here (presumably bound in an enclosing
     scope -- confirm); when it is not ``None`` it is returned as-is.
     Otherwise the knee reported by ``KneeLocator(k_values, metric)`` is
     returned, falling back to the k with the largest metric value when no
     knee is found.
     """
     if chosen_k is not None:
         return chosen_k

     detected_knee = KneeLocator(k_values, metric).knee
     if detected_knee is not None:
         return detected_knee

     # No knee on the curve: take the k whose metric value is the maximum.
     return k_values[numpy.argmax(metric)]
Beispiel #3
0
 def choose_from_metric(metric, k_values):
     """Select a k from ``k_values`` for a convex, decreasing ``metric`` curve.

     ``chosen_k`` is a free variable here (presumably bound in an enclosing
     scope -- confirm); when it is not ``None`` it is returned as-is.
     Otherwise the knee reported by ``KneeLocator`` is returned, falling back
     to the k with the smallest metric value when no knee is found.
     """
     if chosen_k is not None:
         return chosen_k

     locator_options = {
         'curve_nature': 'convex',
         'curve_direction': 'decreasing',
     }
     detected_knee = KneeLocator(k_values, metric, **locator_options).knee
     if detected_knee is not None:
         return detected_knee

     # No knee on the curve: take the k whose metric value is the minimum.
     return k_values[numpy.argmin(metric)]
Beispiel #4
0
def run_decision_tree_dimensionality_reduction(name,
                                               features,
                                               classes,
                                               min_k=2,
                                               max_k=None,
                                               min_depth=2,
                                               max_depth=None,
                                               chosen_k=None,
                                               random_state=6126540):
    """Fit a ``DecisionTreeDimReducer`` at the depth matching the best k.

    ``get_k_depth_values`` yields ``(k, depth, score)`` triples for depths
    between ``min_depth`` and ``max_depth``. The best k is ``chosen_k`` when
    given; otherwise it is the knee of the score curve found by
    ``KneeLocator``, or the k with the highest score when no knee is
    detected. The scores are plotted via ``plot_metric`` and a reducer is
    fitted at the depth recorded for the best k.

    Args:
        name: Passed to ``plot_metric`` as the plot name.
        features: Feature matrix; ``features.shape[1]`` is the default for
            ``max_depth``.
        classes: Target labels passed to ``get_k_depth_values`` and to the
            reducer's ``fit``.
        min_k: Unused by this function; kept for interface compatibility.
        max_k: Unused by this function; kept for interface compatibility.
        min_depth: Smallest depth passed to ``get_k_depth_values``.
        max_depth: Largest depth passed to ``get_k_depth_values``; defaults
            to ``features.shape[1]``.
        chosen_k: When not ``None``, overrides the automatic selection.
        random_state: Passed to ``get_k_depth_values`` and to
            ``DecisionTreeDimReducer``.

    Returns:
        The fitted ``DecisionTreeDimReducer``.

    Raises:
        KeyError: If the selected k (for example a ``chosen_k``) was not
            among the k values yielded by ``get_k_depth_values``.
    """
    if max_depth is None:
        max_depth = features.shape[1]

    scores = []
    k_values = []
    # If several depths yield the same k, the last depth seen wins.
    k_to_depth = {}
    for k, depth, score in get_k_depth_values(features, classes, min_depth,
                                              max_depth, random_state):
        k_values.append(k)
        scores.append(score)
        k_to_depth[k] = depth
    print('.')

    if chosen_k is not None:
        best_k = chosen_k
    else:
        best_k = KneeLocator(k_values, scores).knee
        if best_k is None:
            # No knee on the curve: fall back to the best-scoring k.
            best_k = k_values[numpy.argmax(scores)]

    plot_metric(name, 'f1 score', scores, k_values, best_k)

    best_depth = k_to_depth[best_k]
    transformer = DecisionTreeDimReducer(best_depth, random_state)
    transformer.fit(features, classes)
    return transformer
Beispiel #5
0
def plot_elbow(thresholds, num_features_left, auto_pick_elbow=True):
    """Plot the number of remaining features against the score threshold.

    When ``auto_pick_elbow`` is true, yellowbrick's ``KneeLocator`` is used
    to find the elbow of the (convex, decreasing) curve and a dashed red
    vertical line is drawn at that threshold. Elbow detection and marking
    are best-effort: any failure is reported on stdout and the plain curve
    is still plotted and saved.

    The figure is saved as ``num_feat_vs_score_threshold_{time_produced}.jpg``.

    NOTE(review): ``plt``, ``non_zero_features`` and ``time_produced`` are
    not defined in this function; they are presumably module-level names --
    confirm.

    Args:
        thresholds: X values of the curve (score thresholds).
        num_features_left: Y values; indexable by position, aligned with
            ``thresholds``.
        auto_pick_elbow: Whether to locate and mark the elbow.
    """
    # (threshold, index, feature count) of the elbow once it has been located.
    elbow = None
    if auto_pick_elbow:
        try:
            from yellowbrick.utils import KneeLocator
            elbow_locator = KneeLocator(x=thresholds,
                                        y=num_features_left,
                                        curve_nature="convex",
                                        curve_direction="decreasing")
            best_threshold = elbow_locator.knee
            # ``knee`` is None when no elbow exists; skip marking in that case.
            if best_threshold is not None:
                best_index_at = list(thresholds).index(best_threshold)
                elbow = (best_threshold, best_index_at,
                         num_features_left[best_index_at])
        except Exception as exc:
            # Best-effort: a missing yellowbrick or a locator failure must not
            # prevent the plot itself from being produced.
            print(f"Elbow detection skipped: {exc!r}")

    from matplotlib import rcParams
    plt.figure(figsize=(16, 6))
    rcParams.update({'font.size': 16})
    plt.plot(thresholds, num_features_left, '-o')

    if elbow is not None:
        best_threshold, best_index_at, best_num_feat = elbow
        elbow_label = f"elbow at the={best_threshold} index_at: [{best_index_at}], best_num_feat={best_num_feat}"
        print(elbow_label)
        try:
            plt.vlines(best_threshold,
                       color='r',
                       linestyle="--",
                       label=elbow_label,
                       ymin=0,
                       ymax=len(non_zero_features))
            plt.legend(loc="best")
        except Exception as exc:
            # Best-effort: keep the plain curve if the marker cannot be drawn.
            print(f"Elbow marker skipped: {exc!r}")

    plt.title('Feature Importance')
    plt.ylabel('Num of Features')
    plt.xlabel('Scores threshold')
    pic_name = f'num_feat_vs_score_threshold_{time_produced}.jpg'
    plt.savefig(pic_name)
    print(f"{pic_name} was produced.")
Beispiel #6
0
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator once for every k in ``self.k_values_``, storing
        the fit durations in ``self.k_timers_`` and the values returned by
        ``self.scoring_metric`` in ``self.k_scores_``.

        When ``self.locate_elbow`` is set, a ``KneeLocator`` is run over the
        scores; the "elbow" and the score corresponding to it are stored in
        ``self.elbow_value_`` and ``self.elbow_score_`` respectively. If no
        elbow is detected, ``self.elbow_value_`` is ``None``,
        ``self.elbow_score_`` is ``0`` and a ``YellowbrickWarning`` is issued.

        ``y`` and ``kwargs`` are accepted but not used by this method.
        This method finishes up by calling draw to create the plot and
        returns ``self``.
        """

        self.k_scores_ = []
        self.k_timers_ = []
        self.kneedle = None
        self.knee_value = None

        if self.locate_elbow:
            self.elbow_value_ = None
            self.elbow_score_ = None

        for n_clusters in self.k_values_:
            started_at = time.time()

            # Reconfigure the estimator for this k and fit it
            self.estimator.set_params(n_clusters=n_clusters)
            self.estimator.fit(X)

            # Only the fit is timed; scoring happens after the timer stops
            elapsed = time.time() - started_at
            self.k_timers_.append(elapsed)
            score = self.scoring_metric(X, self.estimator.labels_)
            self.k_scores_.append(score)

        if self.locate_elbow:
            # Curve shape to look for, per metric; unknown metrics fall back
            # to the locator's defaults
            curve_shapes = {
                "distortion": {
                    "curve_nature": "convex",
                    "curve_direction": "decreasing",
                },
                "silhouette": {
                    "curve_nature": "concave",
                    "curve_direction": "increasing",
                },
                "calinski_harabasz": {
                    "curve_nature": "concave",
                    "curve_direction": "increasing",
                },
            }
            locator_kwargs = curve_shapes.get(self.metric, {})
            knee = KneeLocator(self.k_values_, self.k_scores_,
                               **locator_kwargs).knee

            if knee is not None:
                self.elbow_value_ = knee
                self.elbow_score_ = self.k_scores_[self.k_values_.index(knee)]
            else:
                self.elbow_value_ = None
                self.elbow_score_ = 0
                warning_message = (
                    "No 'knee' or 'elbow' point detected, "
                    "pass `locate_elbow=False` to remove the warning")
                warnings.warn(warning_message, YellowbrickWarning)

        self.draw()

        return self