def find_best_treshold(y_true, y_pred, metrics=None, classes=None, scale=100, verbose=False):
    """Grid-search, per column, the decision threshold that maximises a metric.

    For every column ``j`` the candidate thresholds ``t / scale`` for
    ``t in range(scale)`` are tried; predictions strictly greater than the
    threshold are binarised to 1 (``uint8``) and scored with the metric.

    Args:
        y_true: Ground-truth values; must expose ``.shape``. 1-D input is
            treated as a single column.
        y_pred: Predicted scores, same shape as ``y_true``.
        metrics: A callable ``metric(y_true, y_pred_binary) -> score`` or a
            list of such callables. Required despite the ``None`` default.
        classes: Optional names, one per column. Defaults to
            ``0 .. n_columns - 1``.
        scale: Number of candidate thresholds (grid resolution).
        verbose: If true, print the best score/threshold of every search.

    Returns:
        dict mapping each class name to
        ``{"best_score": ..., "best_threshold": ...}``.

        NOTE: when several metrics are given, each one overwrites the entry
        of the previous one, so only the result of the last metric is kept.

    Raises:
        TypeError: if no metric is supplied.
        AssertionError: if the shapes of ``y_true``/``y_pred`` differ or the
            number of classes does not match the number of columns.
    """
    if metrics is None:
        # Previously failed later with "'NoneType' object is not callable".
        raise TypeError("metrics must be a callable or a list of callables")
    if not isinstance(metrics, list):
        metrics = [metrics]

    # Explicit `is not None` checks: after conversion `classes` is an array
    # (its `.shape` is read below), and the truth value of an array is
    # ambiguous (or False for the single class 0).
    if classes is not None:
        classes = to_array(classes)

    if len(y_true.shape) == 1:
        y_true = to_array(y_true)
        y_true = y_true.reshape(-1, 1)
    else:
        y_true = to_ndarray(y_true)

    if len(y_pred.shape) == 1:
        y_pred = to_array(y_pred)
        y_pred = y_pred.reshape(-1, 1)
    else:
        y_pred = to_ndarray(y_pred)

    assert y_true.shape == y_pred.shape, f"y_true and y_pred must have the same dimension, currently: mismatch {y_true.shape} {y_pred.shape}"

    if classes is not None:
        assert classes.shape[0] == y_pred.shape[
            1], f"classes and y_true/y_pred columns must have same dimensionality: mismatch classes {classes.shape[0]}, y_pred {y_pred.shape[1]}"
    else:
        classes = np.arange(y_pred.shape[1])

    def search(y_true, y_pred, scale, metric_function):
        """Return ``(best_threshold, best_score)`` for one column (vectors)."""
        # Starts at 0/0: a threshold is only retained if its score is > 0.
        best_score = 0
        best_threshold = 0
        for t in range(scale):
            threshold = t / scale
            score = metric_function(
                y_true, np.array(y_pred > threshold, dtype=np.uint8))
            if score > best_score:
                best_threshold = threshold
                best_score = score
        return best_threshold, best_score

    threhsolds_dict = {}
    for j in range(y_true.shape[1]):
        for metric in metrics:
            best_threshold, best_score = search(y_true[:, j],
                                                y_pred[:, j],
                                                scale=scale,
                                                metric_function=metric)
            if verbose:
                print(
                    f"Class {classes[j]} : best score = {best_score} with threshold = {best_threshold}"
                )
            threhsolds_dict[classes[j]] = {
                "best_score": best_score,
                "best_threshold": best_threshold
            }

    return threhsolds_dict
def plot_grouped_bar(data, labels, series_names, width=0.35, y_label=None, title=None, annotate=False):
    """Draw a grouped bar chart: one group per label, one bar per series.

    Args:
        data: 2-D values, rows are series and columns are labels. Lists are
            converted with ``np.array`` and DataFrames via ``.values``.
        labels: One tick label per column of ``data``.
        series_names: One legend entry per row of ``data``.
        width: Total width shared by all bars of one group.
        y_label: Optional y-axis label.
        title: Optional plot title.
        annotate: If true, write each bar's height above it.

    Raises:
        AssertionError: if ``data`` is not 2-D or its dimensions do not match
            ``labels`` / ``series_names``.
    """
    if isinstance(data, list):
        data = np.array(data)
    if isinstance(data, pd.DataFrame):
        data = data.values

    labels = to_array(labels)
    series_names = to_array(series_names)

    assert len(data.shape) == 2, "Input data must be a 2-dimensional array"
    assert data.shape[1] == labels.shape[
        0], "Labels and data columns must have same dimensionality"
    assert data.shape[0] == series_names.shape[
        0], "Series_names and data rows must have same dimensionality"

    n_series = data.shape[0]
    x = np.arange(labels.shape[0])  # the label locations

    # Split the group width between the *series* (rows). Dividing by the
    # number of labels, as before, made groups spill over neighbouring ticks
    # whenever there were more series than labels.
    bar_width = width / n_series

    fig, ax = plt.subplots()
    bar_containers = [
        ax.bar(x - i * bar_width,
               data[i, :],
               bar_width,
               label=series_names[i]) for i in range(n_series)
    ]

    if y_label:
        ax.set_ylabel(y_label)
    if title:
        ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    if annotate:
        for container in bar_containers:
            for rect in container:
                height = rect.get_height()
                ax.annotate(
                    '{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center',
                    va='bottom')

    fig.tight_layout()
    plt.show()
def to_classes(x, n_classes=2, min_value=None, max_value=None, agg_function=None, verbose=False):
    """Bin values into equal-width intervals and replace each by its bin aggregate.

    The range ``[min_value, max_value]`` is cut into ``n_classes`` intervals
    of equal width. Every element of ``x`` is replaced by
    ``agg_function`` applied to all elements falling in the same interval.
    Intervals are half-open ``[lower, upper)`` except the last one, which
    also includes ``max_value``.

    Args:
        x: Values to discretise (converted with ``to_array``; indexed as a
            1-D array here).
        n_classes: Number of intervals.
        min_value: Lower bound of the range; defaults to ``np.min(x)``.
        max_value: Upper bound of the range; defaults to ``np.max(x)``.
        agg_function: Callable reducing the members of a bin to one value;
            defaults to ``np.mean``.
        verbose: If true, print the range and the bounds of every interval.

    Returns:
        Float array of length ``x.shape[0]``. Elements outside
        ``[min_value, max_value]`` (possible only with explicit bounds) are
        left at 0.
    """
    x = to_array(x)
    n_classes = int(n_classes)

    # `is None` rather than truthiness: 0 is a legitimate explicit bound.
    if min_value is None:
        min_value = np.min(x)
    if max_value is None:
        max_value = np.max(x)
    if verbose:
        print(f"Value min: {min_value}")
        print(f"Value max: {max_value}")

    if agg_function is None:
        agg_function = np.mean

    step = (max_value - min_value) / n_classes
    classes_array = np.zeros((x.shape[0], ))

    for j in range(n_classes):
        if verbose:
            print(f"Interval lower bound: {min_value + j * step}")
            print(f"Interval upper bound: {min_value + (j + 1) * step}")

        offset = x - min_value - j * step
        if j < n_classes - 1:
            idx = np.where((0 <= offset) & (offset < step))[0]
        else:
            # Last interval: compare against max_value directly (inclusive),
            # because n_classes * step may differ from the range by float
            # rounding.
            idx = np.where((0 <= offset) & (x <= max_value))[0]

        if idx.size == 0:
            # Nothing to fill; also avoids calling the aggregate on an empty
            # selection (NaN + warning for np.mean, an error for np.min/max).
            continue
        classes_array[idx] = agg_function(x[idx])

    return classes_array
def plot_histo(x, bins=100, title=None):
    """Show a histogram of ``x``.

    Args:
        x: Values to plot (converted with ``to_array``).
        bins: Bin specification forwarded to ``Axes.hist``.
        title: Optional plot title.
    """
    x = to_array(x)

    fig, ax = plt.subplots()
    if title is not None:
        ax.set_title(title)
    # The previous `plot.plt(x)` was a garbled call that never drew a
    # histogram and left `bins` unused.
    ax.hist(x, bins=bins)
    plt.show()
def plot_multiscatter(df: pd.DataFrame, features_x, features_y, within_x=False, within_y=False):
    """Draw one scatter plot for every (x feature, y feature) pair.

    Args:
        df: Table indexed by the feature names.
        features_x: Column name(s) used on the x axis.
        features_y: Column name(s) used on the y axis.
        within_x: If true, additionally plot every pair of x features against
            each other (including each feature against itself).
        within_y: Same as ``within_x`` for the y features.
    """
    x_names = to_array(features_x)
    y_names = to_array(features_y)

    # Cartesian product, x feature varying slowest.
    for x_name in x_names:
        for y_name in y_names:
            plot_scatter(x=df[x_name], y=df[y_name])

    if within_x:
        plot_multiscatter(df, x_names, x_names)
    if within_y:
        plot_multiscatter(df, y_names, y_names)
def plot_densities(data, labels=None):
    """Overlay the density plot of every column of ``data``.

    Args:
        data: 2-D values (converted with ``to_ndarray``); one density is
            drawn per column.
        labels: Optional legend entry per column.

    Raises:
        AssertionError: if the number of labels differs from the number of
            columns.
    """
    data = to_ndarray(data)

    # `is not None`: a converted label array has no unambiguous truth value.
    if labels is not None:
        labels = to_array(labels)
        assert data.shape[1] == labels.shape[
            0], "Data columsn and labels must have same dimensionality"

    for j in range(data.shape[1]):
        # Without labels the old code crashed on `labels[j]` (None[j]).
        label = None if labels is None else labels[j]
        # NOTE(review): `distplot` is deprecated in recent seaborn releases;
        # kept to avoid changing the rendered output.
        sns.distplot(data[:, j], label=label)

    if labels is not None:
        plt.legend()
    plt.show()
def plot_multiline(df: pd.DataFrame, x_column, labels=None, scale=False):
    """Plot every column of ``df`` as a line over a common x axis.

    Args:
        df: DataFrame whose columns are plotted; a Series is accepted and
            treated as a single column named ``'_'``.
        x_column: Either a key looked up in ``df`` to obtain the x values, or
            the x values themselves. NOTE: when it is a column name, that
            column is still part of ``df.columns`` and is plotted as well.
        labels: Optional legend entry per column of ``df``; defaults to the
            column names.
        scale: If true, min-max scale every column to [0, 1] before plotting
            (a constant column yields NaN).

    Raises:
        AssertionError: if the number of labels differs from the number of
            columns.
    """
    if isinstance(x_column, str):
        x_points = df[x_column]
    else:
        x_points = to_array(x_column)

    # Normalise a Series first so that `df.shape[1]` below is always valid.
    if isinstance(df, pd.Series):
        df = pd.DataFrame({'_': df.values})

    # `is not None`: a converted label array has no unambiguous truth value.
    if labels is not None:
        labels = to_array(labels)
        assert df.shape[1] == labels.shape[
            0], "Data columns and labels must have same dimensionality"

    for i, col in enumerate(df.columns):
        if scale:
            y = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
        else:
            y = df[col]
        # Labels were previously validated but never handed to the plot, so
        # the legend had nothing to show.
        label = col if labels is None else labels[i]
        sns.lineplot(x=x_points, y=y, label=label)

    plt.legend()
    plt.show()
def plot_mirrorline(df, x_column, scale=False):
    """Plot two columns mirrored around the x axis.

    The first column of ``df`` is drawn above the axis and the second one is
    negated and drawn below it. Columns containing negative values are first
    shifted up so that their minimum is 0.

    Args:
        df: DataFrame with exactly two columns. It is not modified.
        x_column: Either a key looked up in ``df`` to obtain the x values, or
            the x values themselves.
        scale: If true, min-max scale both columns to [0, 1] first (a
            constant column yields NaN).

    Raises:
        AssertionError: if ``df`` does not have exactly two columns.
    """
    if isinstance(x_column, str):
        x_points = df[x_column]
    else:
        x_points = to_array(x_column)

    assert df.shape[1] == 2, "Dataframe must have two columns"

    upper_name = df.columns[0]
    lower_name = df.columns[1]

    # Work on derived Series instead of assigning back into `df`: the old
    # code overwrote the caller's columns as a side effect of plotting.
    upper = df[upper_name]
    lower = df[lower_name]

    if scale:
        upper = (upper - upper.min()) / (upper.max() - upper.min())
        lower = (lower - lower.min()) / (lower.max() - lower.min())

    # Ensure lines are positive, then mirror the second one.
    upper = upper - min(upper.min(), 0)
    lower = -(lower - min(lower.min(), 0))

    # Label both lines so that the legend below has entries to display.
    sns.lineplot(y=upper, x=x_points, label=upper_name)
    sns.lineplot(y=lower, x=x_points, label=lower_name)

    plt.legend()
    plt.show()