コード例 #1
0
def find_eps(data, k, metric):
    """Pick a distance threshold at the elbow of the sorted k-distance curve.

    Every sample's distance to its k-th returned neighbour is computed, the
    distances are sorted in decreasing order, and the elbow of that curve is
    located with ``KneeLocator``. The normalized knee plot is drawn as a side
    effect.

    Parameters
    ----------
    data : array-like
        Samples handed to ``NearestNeighbors.fit`` / ``kneighbors``.
    k : int
        Number of neighbours requested; column ``k - 1`` of the distance
        matrix is used.
    metric : str or callable
        Passed through to ``NearestNeighbors``.

    Returns
    -------
    The sorted-distance value found at the detected elbow position.
    """
    # Only this slice of the sorted curve is searched for an elbow.
    first_rank, last_rank = 20, 500

    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = sorted(distances[:, k - 1], reverse=True)

    window = distanceDec[first_rank:last_rank]
    # The x-axis must be the position within the sorted curve. The neighbour
    # index column used previously only coincides with that position when
    # every sample is returned as its own first neighbour.
    ranks = list(range(first_rank, first_rank + len(window)))

    knee = KneeLocator(ranks,
                       window,
                       direction="decreasing",
                       curve="convex")
    knee.plot_knee_normalized()
    # NOTE(review): kneed reports a missing elbow as None, in which case this
    # lookup raises TypeError -- confirm whether callers need a fallback.
    return distanceDec[knee.elbow]
コード例 #2
0
ファイル: utils.py プロジェクト: BartekKrzepkowski/SUS
def find_eps(data, k, metric):
    """Pick a distance threshold at the elbow of the sorted k-distance curve.

    Every sample's distance to its k-th returned neighbour is sorted in
    decreasing order; the elbow is searched in the first 90% of that curve.
    The normalized knee plot is drawn as a side effect.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Samples handed to ``NearestNeighbors.fit`` / ``kneighbors``.
    k : int
        Number of neighbours requested; column ``k - 1`` of the distance
        matrix is used.
    metric : str or callable
        Passed through to ``NearestNeighbors``.

    Returns
    -------
    tuple
        ``(eps, knee)`` where ``knee`` is the ``KneeLocator`` instance and
        ``eps`` is the sorted distance at the elbow, or ``None`` when no
        elbow was detected.
    """
    END = round(data.shape[0] * 0.9)
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(data)
    distances, _ = nbrs.kneighbors(data)
    distanceDec = np.sort(distances[:, k - 1])[::-1]

    head = distanceDec[:END]
    # The x-axis must be the position within the sorted curve. The neighbour
    # index column used previously only coincides with that position when
    # every sample is returned as its own first neighbour.
    ranks = np.arange(len(head))

    knee = KneeLocator(ranks,
                       head,
                       curve="convex",
                       direction="decreasing",
                       S=1.0)
    knee.plot_knee_normalized()

    if knee.elbow is None:
        # Indexing an ndarray with None would silently return the whole
        # curve with an extra axis instead of a single distance.
        return None, knee
    return distanceDec[knee.elbow], knee
コード例 #3
0
def estimate_epsilon(X_embedded, sensitivity, plot=False):
    """Scale the neighbour distance found at the knee by ``sensitivity``.

    A 2-neighbour query is run on ``X_embedded`` itself. The distance matrix
    is sorted column-wise, the upper half of column 1 is kept, and the knee
    of that increasing curve is located with ``KneeLocator``.

    Parameters
    ----------
    X_embedded : array-like
        Samples handed to ``NearestNeighbors.fit`` / ``kneighbors``.
    sensitivity : number
        Multiplier applied to the distance at the knee.
    plot : bool
        When true, plot the retained distances and the normalized knee curve.

    Returns
    -------
    ``sensitivity`` multiplied by the retained distance at the knee position.
    """
    model = NearestNeighbors(n_neighbors=2).fit(X_embedded)
    neighbor_dists, _ = model.kneighbors(X_embedded)

    # Each column is sorted on its own (axis=0); only the larger half of the
    # second column is used for knee detection.
    ascending = np.sort(neighbor_dists, axis=0)
    upper_half = ascending[ascending.shape[0] // 2:, 1]
    positions = np.arange(upper_half.shape[0])

    # Locate the knee on position vs. distance.
    kneedle = KneeLocator(
        positions,
        upper_half,
        S=1.0,
        curve='convex',
        direction='increasing',
        interp_method='polynomial',
    )

    if plot:
        plt.figure()
        plt.plot(upper_half)
        kneedle.plot_knee_normalized()

    return sensitivity * upper_half[kneedle.knee]
コード例 #4
0
def singleDistributionTest(path_in='./data',
                           path_out='./outputs',
                           adjusted_pvalue=False,
                           plot_all=False,
                           plot_legend=False,
                           num_fractions=10,
                           min_fraction=0.1):
    """Compare each numeric CSV column with random subsamples of itself.

    For every ``.csv`` file directly inside ``path_in`` the rows containing
    missing values are dropped and only numeric columns are kept. For each
    fraction between 1.0 and ``min_fraction`` a random subsample is drawn and
    a two-sample Kolmogorov-Smirnov test is run per column against the full
    column. The KS statistics and ``1 - p-value`` are plotted (and saved as
    PDF files in ``path_out``), and a knee is located on the mean curves.

    Parameters
    ----------
    path_in : str
        Folder with input data in .csv format, './data' by default.
    path_out : str
        Folder for the output images, './outputs' by default. It must exist.
    adjusted_pvalue : bool
        Draw the size-adjusted threshold curve instead of the fixed 0.05
        line on the p-value plots, False by default.
    plot_all : bool
        Plot one line per column in addition to the mean plots; if False
        only the mean values are plotted.
    plot_legend : bool
        Currently not read by the function; a legend is drawn on the
        per-column plots whenever there are fewer than 10 columns.
    num_fractions : int
        Number of subset sizes to evaluate, 10 by default.
    min_fraction : float
        Smallest subset size as a fraction of the data, 0.1 by default.
    """

    fractions = np.linspace(1.0, min_fraction, num=num_fractions)

    # One tick per evaluated fraction; previously hard-coded to 10 ticks,
    # which failed for any other ``num_fractions``.
    tick_positions = range(num_fractions)
    tick_labels = [np.round(frac, 1) for frac in fractions]

    def ks_scale(n, m):
        # Sample-size factor sqrt((n + m) / (n * m)) shared by all thresholds.
        return np.sqrt((n + m) / (n * m))

    def label_fraction_axis(title):
        # Label the x-axis with the subset fractions and set the plot title.
        plt.xticks(tick_positions, tick_labels)
        plt.title(title)

    def draw_pvalue_reference(adjusted_curve):
        # Draw either the size-adjusted curve or the fixed 0.05 line.
        if adjusted_pvalue:
            # plt.plot takes Line2D keywords (color / linestyle); the plural
            # forms are only valid for hlines.
            plt.plot(adjusted_curve, color='r', linestyle='dotted')
        else:
            plt.hlines(0.05,
                       colors='r',
                       linestyles='dotted',
                       xmin=0.0,
                       xmax=float(num_fractions - 1))

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    datafiles = [
        name for name in listdir(path_in)
        if isfile(join(path_in, name)) and name.endswith('.csv')
    ]

    print(datafiles)

    dfs = [
        pd.read_csv(join(path_in, name)).dropna().select_dtypes(
            include=numerics) for name in datafiles
    ]

    for name, df in zip(datafiles, dfs):

        if df.empty:
            # No rows or no numeric columns left: the size factors below
            # would divide by zero, so there is nothing to test.
            print('Skipping ' + name + ': no numeric data after dropna')
            continue

        n_full = len(df)
        out_prefix = path_out + '/' + name

        p_adjust = 1 / ks_scale(n_full, n_full)

        # NOTE(review): the origin of the 1.037 constant used for large
        # frames is not visible here -- confirm.
        if n_full < 5000:
            alpha = 0.05 / ks_scale(n_full, n_full)
        else:
            alpha = 1.037

        stats_fractions = []
        pvals_fractions = []
        ks_vals = []
        pvals_adjusted_fractions = []

        for frac in fractions:
            df_frac = df.sample(frac=frac)
            scale = ks_scale(len(df_frac), n_full)

            results = [
                ks_2samp(df[c].values, df_frac[c].values, mode='asymp')
                for c in df.columns
            ]

            stats_fractions.append([res[0] for res in results])
            # The complement of the p-value is what gets stored and plotted.
            pvals_fractions.append([1 - res[1] for res in results])
            ks_vals.append(alpha * scale)
            pvals_adjusted_fractions.append(0.05 * scale * p_adjust)

        print(pvals_adjusted_fractions)

        stats_fractions = np.asarray(stats_fractions)
        pvals_fractions = np.asarray(pvals_fractions)

        if plot_all:

            for i in range(len(df.columns)):
                plt.plot(stats_fractions[:, i])
            label_fraction_axis(name)
            if len(df.columns) < 10:
                plt.legend(df.columns)
            plt.plot(ks_vals, color='r', linestyle='dotted')
            plt.savefig(out_prefix + ' KS stats all' + '.pdf',
                        bbox_inches='tight')
            plt.show()

            for i in range(len(df.columns)):
                plt.plot(pvals_fractions[:, i])
            label_fraction_axis(name)
            if len(df.columns) < 10:
                plt.legend(df.columns)
            draw_pvalue_reference(pvals_adjusted_fractions)
            plt.savefig(out_prefix + ' pvals all' + '.pdf',
                        bbox_inches='tight')
            plt.show()

        stats_mean = np.mean(stats_fractions, axis=1)
        pvals_mean = np.mean(pvals_fractions, axis=1)

        plt.plot(stats_mean)
        plt.plot(ks_vals, color='r', linestyle='dotted')
        label_fraction_axis(name + ' KS stats mean')
        plt.savefig(out_prefix + ' KS stats mean' + '.pdf',
                    bbox_inches='tight')
        plt.show()

        # NOTE(review): the mean curve is passed as x and the step index as
        # y; confirm this argument order is intended.
        kneedle = KneeLocator(stats_mean,
                              range(num_fractions),
                              S=1.0,
                              curve='convex',
                              direction='increasing')
        kneedle.plot_knee_normalized()
        plt.show()

        plt.plot(pvals_mean)
        label_fraction_axis(name + ' p-values mean')
        draw_pvalue_reference(pvals_adjusted_fractions)
        # Saved under its own name; it used to overwrite the KS stats PDF.
        plt.savefig(out_prefix + ' p-values mean' + '.pdf',
                    bbox_inches='tight')
        plt.show()

        kneedle = KneeLocator(pvals_mean,
                              range(num_fractions),
                              S=1.0,
                              curve='convex',
                              direction='increasing')
        kneedle.plot_knee_normalized()
        plt.show()
コード例 #5
0
# Load the table; the first CSV column becomes the row index.
data = pd.read_csv("combat_adjusted_minimal.csv", index_col=0)

# Per-row variance, largest first.
variance = data.var(axis=1).sort_values(ascending=False)

# x is the rank in the sorted list, y the variance at that rank.
x = np.arange(len(variance))
y = variance.values

# Elbow finder. An earlier fit with S=50 was overwritten immediately by this
# one, so only the fit that is actually used remains.
kneedle = KneeLocator(x, y, S=2, curve='convex', direction='decreasing')

# Plot
kneedle.plot_knee_normalized()
sns.set_style("white")
kneedle.plot_knee()

# Print results (bare expressions print nothing when run as a script).
print(kneedle.elbow)
print(kneedle.knee_y)

# Keep the rows ranked above the elbow; .iloc makes the slice explicitly
# positional regardless of the index type.
mvg = variance.iloc[:kneedle.elbow].index
mvg = data.loc[mvg, :]

# write out the data
mvg.to_csv("mvg_knee.csv")

##############################################################################
コード例 #6
0
# Report the y-value at the detected knee.
# NOTE(review): `kneedle`, `plt_cfg`, `list_k` and `km_stat` are defined
# outside this fragment -- presumably a fitted KneeLocator, a plot config,
# the tested cluster counts and a per-k statistics table; confirm.
print("The corresponding Within-Cluster-Sum of Squared Errors (WSS):",
      kneedle.knee_y)

# %%
# Plot Number of clusters against Within-Cluster-Sum of Squared Errors
kneedle.plot_knee(figsize=plt_cfg.figsize)
plt.xlabel("Number of clusters")
plt.ylabel("Within-Cluster-Sum of Squared Errors")
# One tick per integer from the smallest to the largest value in list_k.
plt.xticks(np.arange(min(list_k), max(list_k) + 1, 1))
plt.tight_layout()
# The "results" directory must already exist for savefig to succeed.
plt.savefig("results/knee.png")
plt.show()

# %%
# Plot the normalized knee curves
kneedle.plot_knee_normalized(figsize=plt_cfg.figsize)
plt.tight_layout()
plt.savefig("results/knee_normalized.png")
plt.show()

# %% [markdown]
# ### The Silhouette Method
#
# The silhouette value measures how similar a point is to its own cluster (cohesion) compared to other clusters (separation).

# %%
# Line plot of the "mean_sil_coeff" column against "n_clusters" from km_stat.
plt.subplots(figsize=plt_cfg.figsize)
ax = sns.lineplot(x="n_clusters", y="mean_sil_coeff", data=km_stat)
ax.set(xlabel="Number of clusters", ylabel="Mean Silhouette Coefficient")
# Same integer tick spacing as on the knee plot.
plt.xticks(np.arange(min(list_k), max(list_k) + 1, 1))
plt.tight_layout()
コード例 #7
0
from scipy.interpolate import interp1d

# Load the saved mapping (presumably cluster count -> SSE, going by the
# file name).
with open("sse_minibatch.json", "r") as f:
    sse_ = json.load(f)

# The keys are converted to int so that they sort numerically.
sse = {int(key): value for key, value in sse_.items()}
n_clusters = sorted(int(key) for key in sse_)
x = n_clusters
y = [sse[count] for count in x]

# Locate the knee of the decreasing, convex curve.
kneedle = KneeLocator(
    x,
    y,
    S=1.0,
    curve='convex',
    direction='decreasing',
    online=True,
    interp_method="polynomial",
)
print(kneedle.knee)
print(kneedle.knee_y)

# Save the raw knee plot, then the normalized one.
plt.style.use('fivethirtyeight')
kneedle.plot_knee(figsize=(18, 7))
plt.savefig("knee.png")

kneedle.plot_knee_normalized(figsize=(18, 7))
plt.savefig("knee_normal.png")