def roc_decision_rule(decision_rate, thr, decision_threshold):
    """Return the first threshold whose decision rate exceeds a target.

    cf :
    - https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/
      4308261-evaluez-un-algorithme-de-classification-qui-retourne-des-scores

    Parameters
    ----------
    decision_rate : array-like
        Rate values, one per candidate threshold (presumably a TPR or FPR
        curve from a ROC computation -- confirm against the caller).
    thr : sequence
        Candidate thresholds, indexable by the positions of
        ``decision_rate``.
    decision_threshold : float
        Value that ``decision_rate`` must strictly exceed.

    Returns
    -------
    tuple
        ``(threshold, idx)`` where ``idx`` is the smallest index at which
        ``decision_rate > decision_threshold`` and ``threshold`` is
        ``thr[idx]``.

    Raises
    ------
    ValueError
        If no value of ``decision_rate`` exceeds ``decision_threshold``.
    """
    # np.asarray lets plain lists/tuples be compared element-wise.
    exceeds = np.asarray(decision_rate) > decision_threshold
    positions = np.where(exceeds)

    # Without this guard np.min fails on an empty array with an opaque
    # "zero-size array to reduction operation" message.
    if positions[0].size == 0:
        raise ValueError(
            "no decision_rate value exceeds decision_threshold={!r}".format(
                decision_threshold))

    idx = np.min(positions)
    threshold = thr[idx]
    return threshold, idx
def scree_plot(self, threshold=None, save_as_img=False):
    """Draw a scree plot: per-axis bars plus the cumulative curve.

    Bars show ``self.evr * 100`` (% explained variance per the original
    inline comment) for each axis rank; the red curve is the running sum
    of those values.

    Parameters
    ----------
    threshold : float, optional
        Cumulative share (as a fraction) to highlight. When given, a dashed
        vertical line marks the first rank whose normalized cumulative
        share exceeds it, and a legend reports that rank.
    save_as_img : bool
        When true, the figure is written to ``scree.jpg`` before display.
    """
    explained_pct = self.evr * 100
    ranks = np.arange(len(explained_pct)) + 1
    plt.bar(ranks, explained_pct)

    # Keyword arguments shared by the cumulative curve in both modes.
    curve_style = {"c": "red", "marker": 'o'}
    has_threshold = threshold is not None

    if has_threshold:
        # Shares are normalized so the cumulative curve ends at 1.
        cumulative_share = np.cumsum(explained_pct / explained_pct.sum())
        # Number of features needed for threshold cumulative importance
        n_features = np.min(np.where(cumulative_share > threshold)) + 1
        threshold_percentage = 100 * threshold
        curve_style["label"] = (
            '{} features required for {:.0f}% of inertia.'.format(
                n_features, threshold_percentage))
        # Threshold vertical line plot
        plt.vlines(n_features,
                   ymin=0,
                   ymax=threshold_percentage,
                   linestyles='--',
                   colors='red')

    plt.plot(ranks, explained_pct.cumsum(), **curve_style)
    if has_threshold:
        plt.legend(loc='lower right', fontsize=12)

    plt.xlabel("Inertia axis rank", labelpad=20)
    plt.ylabel("Inertia (%)", labelpad=20)

    title_text = (
        "Scree plot"
        + "\n(Kaiser criterion = {} : Elbow criterion = {})".format(
            self.kaiser_criterion(),
            elbow_criterion(total_inertia=self.evr)))
    plt.title(title_text, pad=20)

    if save_as_img:
        plt.tight_layout()
        plt.savefig('scree.jpg')
    plt.show(block=False)
def elbow_criterion(total_inertia, threshold=0.25):
    """Find the number of components/clusters using the Elbow criterion.

    (cf : https://en.wikipedia.org/wiki/Elbow_method_(clustering))

    The cumulative sum of ``total_inertia`` is computed, then the absolute
    ``percentage_change`` between each pair of consecutive cumulative values
    (``percentage_change`` is defined elsewhere in the project).

    Parameters
    ----------
    total_inertia : array-like with a ``cumsum`` method
        Inertia (or explained variance) per component/cluster.
    threshold : {'min'} or number, default 0.25
        ``'min'`` selects the position of the smallest variation. A number
        is used as the quantile ``q`` of the variations; the first position
        whose variation is at or below that quantile is selected.

    Returns
    -------
    int
        One-based count of selected components/clusters.

    Raises
    ------
    ValueError
        If fewer than two inertia values are given, or if ``threshold`` is
        neither ``'min'`` nor a number.
    """
    features_nb = len(total_inertia)
    if features_nb < 2:
        # No consecutive pair exists, so no variation can be computed.
        raise ValueError(
            "elbow_criterion needs at least two inertia values, got {}".format(
                features_nb))

    var_cumsum = total_inertia.cumsum()

    # Compute variations ratio from cumulated explained variance values
    variations = [
        abs(percentage_change(var_cumsum[i + 1], x))
        for i, x in enumerate(var_cumsum)
        if i + 1 < features_nb
    ]

    # Get total components selected
    # Strings are compared by value: `is 'min'` depended on interning.
    if isinstance(threshold, str):
        if threshold != 'min':
            raise ValueError(
                "threshold must be 'min' or a number, got {!r}".format(
                    threshold))
        return variations.index(min(variations)) + 1

    # Ints and NumPy scalars count as numeric too, not only builtin float.
    if isinstance(threshold, (int, float, np.integer, np.floating)):
        variations = np.array(variations)
        cutoff = np.quantile(variations, q=threshold)
        return np.min(np.where(variations <= cutoff)) + 1

    raise ValueError(
        "threshold must be 'min' or a number, got {!r}".format(threshold))