Beispiel #1
0
 def test_scaling(self):
     """With scaling enabled and no rotation, both columns stay within [0, 1]."""
     upper_bound = 1.00000001
     lower_bound = -0.00000001
     rotor = Rotor()
     rotor.fit_rotate(sample_inc, scale=True, theta=0)
     # Check the maxima of both columns first, then the minima.
     for column in (0, 1):
         self.assertLessEqual(rotor._data[:, column].max(), upper_bound)
     for column in (0, 1):
         self.assertGreaterEqual(rotor._data[:, column].min(), lower_bound)
def optimizeEps(group, rep, fig=None):
    """Estimate a DBSCAN ``eps`` from the elbow of the nearest-neighbour curve.

    The distance from every point to its closest other point is sorted in
    ascending order, paired with its rank, and handed to ``Rotor`` to locate
    the elbow of the resulting curve.

    Special thanks to Cory Maklin
    https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc

    and also stackexchange user georg-un for the kneebow package:
    https://datascience.stackexchange.com/questions/57122/in-elbow-curve-how-to-find-the-point-from-where-the-curve-starts-to-rise

    Parameters
    ----------
    group
        Table-like object supporting ``group[["ae1", "ae2"]].to_numpy()``
        (presumably a pandas DataFrame -- confirm against callers).
    rep
        Unused; kept so existing callers keep working.
    fig
        Unused; kept so existing callers keep working.

    Returns
    -------
    The sorted nearest-neighbour distance found at the elbow index.
    """
    X = group[["ae1", "ae2"]].to_numpy()
    dist, _ = NearestNeighbors(n_neighbors=2).fit(X).kneighbors(X)

    # Column 0 is normally the query point itself (distance 0); column 1 is
    # the distance to the closest *other* point.
    nearest = np.sort(dist[:, 1])

    # Use an explicit rank axis. The neighbour indices are only 0..n-1 when
    # every point is its own first neighbour, which duplicate points break.
    curve = np.column_stack((np.arange(len(nearest)), nearest))

    rotor = Rotor()
    rotor.fit_rotate(curve)
    return nearest[rotor.get_elbow_index()]
Beispiel #3
0
 def test_rotation(self):
     """Rotating three reference points by 45 degrees gives the expected coordinates."""
     points = np.array([[0, 0], [1, 0], [0, 1]])
     rotor = Rotor()
     rotor.fit_rotate(points, scale=False, theta=np.radians(45))
     # Expected (x, y) of each input row after the rotation.
     expected_rows = [(0, 0), (0.71, -0.71), (0.71, 0.71)]
     for row, (want_x, want_y) in enumerate(expected_rows):
         coords = rotor._data[row].tolist()
         self.assertAlmostEqual(coords[0], want_x, delta=0.01)
         self.assertAlmostEqual(coords[1], want_y, delta=0.01)
Beispiel #4
0
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    """Cluster data using DBSCAN.

    Uses the density-based spatial cluster `sklearn.cluster.DBSCAN
    <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    to cluster the data. If not provided by users, the distance cutoff `eps` is determined
    by the 'Knee method' which finds the distance at which a sharp change happens.

    Parameters
    ----------
    data : ndarray, shape=(n_samples, n_dims)
        Samples to cluster.
    eps : None or scalar, default=None
        DBSCAN neighbourhood radius. If None, it is taken from the elbow of the
        sorted nearest-neighbour distance curve.
    min_samples : None or scalar, default=None
        DBSCAN ``min_samples``. If None, the even values 2, 4, ... below
        ``n_samples - 1`` are tried in order and the one with the highest
        silhouette score is used; the scan stops at the first value for which
        every sample is labelled as noise. If no value can be scored, 2 is used.
    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a feature array. If metric
        is a string or callable, it must be one of the options allowed by `sklearn.metrics.pairwise_distances`
        for its metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples)
        Cluster label of every sample.
    core_sample_indices : array_like, shape=(n_core_samples)
        Indices of the core samples. When ``n_samples <= n_dims`` no clustering
        is run: all samples get label 0 and all indices are returned as a
        single row of shape ``(1, n_samples)``.

    """
    n_samples = len(data)
    if n_samples <= len(data[0]):
        # Too few samples for the dimensionality: report one single cluster.
        return np.zeros(n_samples, dtype=int), np.arange(n_samples)[np.newaxis, :]

    if eps is None:
        nearest_neighbors = NearestNeighbors(n_neighbors=3)
        nearest_neighbors.fit(data)
        distances, _ = nearest_neighbors.kneighbors(data)
        # Sorted distance of every sample to its closest other sample.
        distances = np.sort(distances, axis=0)[:, 1]
        curve = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(curve)
        eps = distances[rotor.get_elbow_index()]

    if min_samples is None:
        # Build the candidate grid once so scores[i] always belongs to candidates[i].
        candidates = np.arange(2, n_samples - 1, 2)
        scores = []
        for n_sample in candidates:
            trial = DBSCAN(eps=eps, min_samples=n_sample, metric=metric)
            trial.fit(data)
            if np.all(trial.labels_ == -1):
                # Larger min_samples can only produce more noise; stop scanning.
                break
            scores.append(silhouette_score(data, trial.labels_))
        if scores:
            min_samples = candidates[np.argmax(scores)]  # the highest silhouette_score.
        else:
            # No candidate could be scored (no candidates at all, or the very
            # first one already gave only noise): fall back to the smallest
            # candidate instead of failing on argmax of an empty list.
            min_samples = 2

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    return dbscan.labels_, dbscan.core_sample_indices_
Beispiel #5
0
def get_elbow_index(couplings_dict, plot_elbow=False):
    """Locate where the coupling strengths start to decrease dramatically.

    The values of ``couplings_dict`` are taken in iteration order. When
    ``plot_elbow`` is true they are plotted (in that original order) first.
    The elbow itself is detected on the reversed sequence, and the returned
    value is ``len(values) - elbow_index_in_reversed_sequence``.
    """
    strengths = list(couplings_dict.values())
    positions = list(range(len(strengths)))

    if plot_elbow:
        plt.figure()
        plt.plot(positions, strengths)
        plt.show()

    # Pair each position with the strengths read back to front.
    curve = np.array(
        [[pos, strength] for pos, strength in zip(positions, reversed(strengths))]
    )

    rotor = Rotor()
    rotor.fit_rotate(curve)
    return len(strengths) - rotor.get_elbow_index()
Beispiel #6
0
def get_elbow_no(df, column_name='E-value'):
    """Return the ``'No'`` entry of the row at the elbow of a sorted column.

    ``df`` is sorted ascending by ``column_name``, the sorted values are paired
    with their rank and given to ``Rotor``, and the ``'No'`` value of the row
    at the detected elbow index is returned.
    """
    ranked = df.sort_values(by=[column_name])

    # (rank, value) pairs describing the sorted curve.
    curve = np.array(
        [[rank, value] for rank, value in enumerate(ranked[column_name])]
    )

    rotor = Rotor()
    rotor.fit_rotate(curve)
    elbow_idx = rotor.get_elbow_index()

    return list(ranked['No'])[elbow_idx]
def findeps(data):
    """Estimate a DBSCAN ``eps`` from the elbow of the nearest-neighbour curve.

    The data is standardised, NaNs are replaced by numbers, and the distance
    from every sample to its closest other sample is sorted in ascending
    order. That curve is plotted and its elbow is located with ``Rotor``.

    Parameters
    ----------
    data : array-like accepted by ``StandardScaler.fit_transform``.

    Returns
    -------
    The sorted nearest-neighbour distance at the elbow index. Because the
    distances are computed on the standardised data, the value is expressed
    in standardised feature units.
    """
    scaled = StandardScaler().fit_transform(data)
    # Clean the *scaled* array. The previous code passed the raw input here,
    # which silently threw the standardisation away.
    scaled = np.nan_to_num(scaled)

    neighbors = NearestNeighbors(n_neighbors=2).fit(scaled)
    distances, _ = neighbors.kneighbors(scaled)
    # Column 1 holds the distance to the closest other sample.
    distances = np.sort(distances[:, 1])

    plt.plot(distances)
    plt.show()

    # Explicit rank axis: the neighbour indices are not guaranteed to be
    # 0..n-1 in order (e.g. with duplicate samples).
    curve = np.column_stack((np.arange(len(distances)), distances))
    rotor = Rotor()
    rotor.fit_rotate(curve)
    return distances[rotor.get_elbow_index()]
Beispiel #8
0
def get_eps(X, neigh=2):
    """Plot the k-neighbour distance distribution and return its elbow point.

    The distances from ``calculate_kn_distance(X, neigh=neigh)`` are sorted,
    shown as a 60-bin histogram, and turned into a (rank, distance) curve
    whose elbow is located and plotted with ``Rotor``.

    Returns the curve row at the elbow, i.e. the pair ``[rank, distance]``.
    """
    sorted_dist = np.sort(calculate_kn_distance(X, neigh=neigh))

    plt.hist(sorted_dist, bins=60)
    plt.ylabel('n')
    plt.xlabel('Epsilon distance')
    plt.show()

    # Two columns: position in the sorted sequence and the distance itself.
    ranks = np.arange(sorted_dist.shape[0]).reshape(-1, 1)
    curve_xy = np.concatenate([ranks, sorted_dist.reshape(-1, 1)], 1)

    elbow_finder = Rotor()
    elbow_finder.fit_rotate(curve_xy)
    elbow_finder.plot_elbow()

    return curve_xy[elbow_finder.get_elbow_index()]
Beispiel #9
0
            labels = elbow_agglo_params(Z, elbow_save)
           
            csv_path = join(folder, 'labels.csv')
            save_labels(labels, csv_path)

def dbscan_knee(X, save_path, metric):
    """Estimate a DBSCAN ``eps`` from the knee of the k-distance curve and plot it.

    Parameters
    ----------
    X : samples passed to ``NearestNeighbors.fit`` / ``kneighbors``.
    save_path : directory in which ``knee_<metric>.jpg`` is written.
    metric : distance metric forwarded to ``NearestNeighbors``; it is also
        interpolated into the output file name.

    Returns
    -------
    The sorted nearest-neighbour distance at the elbow index found by ``Rotor``.
    """
    neighbors = NearestNeighbors(n_neighbors=3, metric=metric).fit(X)
    distances, _ = neighbors.kneighbors(X)
    # Sorted distance of every sample to its closest other sample.
    distances = np.sort(distances, axis=0)[:, 1]

    # Size the rank axis from the curve itself. The previous code used the
    # length of a name (`df`) that is not defined inside this function.
    ranks = np.arange(len(distances)).reshape(-1, 1)
    curve = np.hstack((ranks, distances.reshape(-1, 1)))

    rotor = Rotor()
    rotor.fit_rotate(curve)
    eps = distances[rotor.get_elbow_index()]

    plt.plot(distances)
    plt.title(f'eps={eps}')
    plt.savefig(f'{save_path}/knee_{metric}.jpg')
    plt.close()

    return eps


# DBSCAN (min_points = 3, 4, 5)

# One output prefix per sample: the text after the last '/' of the sample
# path (the whole string when it contains no '/'), joined onto save_folder.
save_pref = [join(save_folder, sample.rsplit('/', 1)[-1]) for sample in samples]
Beispiel #10
0
distances, indices = nbrs.kneighbors(reducedDataSet)

# In[4]:

# Sorted distance of every sample to its closest other sample.
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
# Smooth the curve with a Savitzky-Golay filter
# (window length 151, polynomial order 5).
distancebis = savgol_filter(distances, 151, 5)
plt.figure(0)
plt.plot(distances)
plt.figure(1)
plt.plot(distancebis)

# compute first and second derivative of the smoothed curve
smooth_d1 = np.gradient(distancebis)
smooth_d2 = np.gradient(smooth_d1)

# (rank, smoothed distance) pairs for every sample, built in one step.
# The previous loop hard-coded 346 samples and re-allocated the array on
# every iteration.
new = np.column_stack((np.arange(len(distancebis)), distancebis))
rotor = Rotor()
rotor.fit_rotate(new)
elbow_index = rotor.get_elbow_index()
print(new[elbow_index])

plt.figure(2)
plt.plot(smooth_d2)
"""
infls = np.where(np.diff(np.sign(smooth_d2 )))[0]
optiepsiIndex = np.where(smooth_d2 == np.amax(smooth_d2))[0]
optiepsi = distancebis[optiepsiIndex]
# plt.ylabel('WCSS')
# plt.show()

# The silhouette score measures how well each object lies within its cluster.
# The location of the maximum is considered as the appropriate number of clusters.
# NOTE(review): the +3 offset presumably maps the array index back to a scan
# that starts at k=3 (cf. range(3, 20) in the plot below) -- confirm.
k_silhouette = np.argmax(silhouette_avg_scores) + 3

# visualization
# plt.plot(range(3, 20), silhouette_avg_scores)
# plt.title('Average silhouette method')
# plt.xlabel('Number of clusters')
# plt.ylabel('Average silhouette')
# plt.show()

# get elbow of wcss
rotor = Rotor()
rotor.fit_rotate(new_wcss)
k_wcss = rotor.get_elbow_index() + 3

# Use the average of the two estimates for k, rounded down. The sum must be
# parenthesised: `k_wcss + k_silhouette / 2` only halves the second estimate.
k = int(np.floor((k_wcss + k_silhouette) / 2))

# now let's do the same thing but with the libraries
kmeans = KMeans(n_clusters=k)
kmeans.fit(feature_cordinates[['X', 'Y']])
labels = kmeans.predict(feature_cordinates[['X', 'Y']])
centroids = kmeans.cluster_centers_

# let's save to a new json
feature_cordinates['cluster'] = labels
feature_cordinates.to_json(output_file_name)
Beispiel #12
0
 def test_fit_rotate_params(self):
     """Explicit ``scale`` and ``theta`` arguments are stored on the rotor."""
     requested_theta = 0.7
     rotor = Rotor()
     rotor.fit_rotate(sample_inc, scale=False, theta=requested_theta)
     self.assertFalse(rotor._scale)
     self.assertEqual(rotor._theta, requested_theta)
Beispiel #13
0
 def test_fit_rotate(self):
     """Fitting with default arguments populates the rotor's internal data."""
     rotor = Rotor()
     rotor.fit_rotate(sample_inc)
     fitted = rotor._data
     self.assertIsNotNone(fitted)
Beispiel #14
0
 def test_fit_rotate_default_parameter(self):
     """Without explicit arguments, scaling is on and a rotation angle is set."""
     rotor = Rotor()
     rotor.fit_rotate(sample_inc)
     scale_flag, angle = rotor._scale, rotor._theta
     self.assertTrue(scale_flag)
     self.assertIsNotNone(angle)
Beispiel #15
0
 def test_initialization(self):
     """A Rotor can be constructed without arguments."""
     self.assertIsNotNone(Rotor())
Beispiel #16
0
# Min-max normalise z and stack the three normalised coordinates as columns.
zn = (z - z.min()) / (z.max() - z.min())
xyz_nn = np.vstack((xn, yn, zn)).T

# First evaluation.
# Nearest neighbors to find the optimal epsilon (maximum distance) https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
# Accepted algorithm values: ['auto', 'ball_tree', 'kd_tree', 'brute']
nbrs = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nbrs = nbrs.fit(xyz_nn)
# `indices` holds the indices of the nearest neighbors.
distances, indices = nbrs.kneighbors(xyz_nn)
# Keep the sorted last (index 4) neighbour-distance column as the k-distance curve.
distances = np.sort(distances, axis=0)[:, 4]
plt.plot(distances)

# Build the (position, distance) curve for the elbow search.
# NOTE: `x` is rebound here and deleted again once the elbow is known.
y = np.array(distances)
x = np.linspace(0, len(x), len(x))
xy = np.vstack((x, y)).T

rotor = Rotor()
rotor.fit_rotate(xy)
elbow_idx = rotor.get_elbow_index()
rotor.plot_elbow()
# Half of the distance at the elbow is used as the DBSCAN radius.
eps = distances[elbow_idx] / 2
del x, y, xy

# min_samples=5 matches the 5 neighbours used for the k-distance curve above
# (the D+1 rule of thumb for three coordinates would give 4).
clustering = DBSCAN(algorithm='kd_tree', eps=eps, min_samples=5)
clustering = clustering.fit(xyz_nn)
labels = clustering.labels_

# 554 labels to 23 distinguished colors
colors = [int(label % 23) for label in labels]

v = pptk.viewer(data, colors)
v.set(point_size=0.01)

# matplotlib
Beispiel #17
0
 def test_detect_elbow(self):
     """The elbow of the increasing sample curve is found at index 11 (+/- 1)."""
     rotor = Rotor()
     rotor.fit_rotate(sample_inc)
     detected = rotor.get_elbow_index()
     self.assertAlmostEqual(detected, 11, delta=1)
Beispiel #18
0
 def test_detect_knee(self):
     """The knee of the decreasing sample curve is found at index 7 (+/- 1)."""
     rotor = Rotor()
     rotor.fit_rotate(sample_dec)
     detected = rotor.get_knee_index()
     self.assertAlmostEqual(detected, 7, delta=1)
Beispiel #19
0
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using a density-based cluster
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_ provided by scikit.
    DBSCAN finds clusters of core samples of high density. A sample point is a core sample if at least `min_samples`
    points are within distance :math:`\varepsilon` of it. A cluster is defined as a set of sample points that are
    mutually density-connected and density-reachable, i.e. there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each :math:`p_{i+1}` is within distance
    :math:`\varepsilon` of :math:`p_{i}` for any two points in the cluster. The values of `min_samples` and
    :math:`\varepsilon` determine the performance of this cluster.

    If `min_samples` is None, it is derived from the data size (see below). If :math:`\varepsilon` is None, it is set
    as the value at the knee of the k-distance plot.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.

    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is
        not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to
        choose appropriately for your data set and distance function. If None, it is set as the value at the
        knee of the k-distance plot.

    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This
        includes the point itself. If None, it is chosen as follows: ``min(n_dims, 4)`` when
        ``2 * n_dims > n_samples``; otherwise ``min(2 * n_dims, n_samples)`` for fewer than 1000 samples and
        ``min(5 * n_dims, n_samples)`` for 1000 samples or more.

    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a feature array. If metric
        is a string or callable, it must be one of the options allowed by `sklearn.metrics.pairwise_distances`
        for its metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.

    core_sample_indices : array_like, shape=(n_clusters,)
        Indices of core samples, grouped per cluster. When ``n_samples <= n_dims`` no clustering is run: every
        sample gets label 0 and all indices are returned as one group.

    """
    n_samples, n_dims = len(data), len(data[0])
    if n_samples <= n_dims:
        # Too few samples for the dimensionality: report one single cluster.
        return np.array([0 for dummy in data]), np.arange(n_samples)[np.newaxis, :]

    # Only derive a default when the caller did not choose a value. The
    # previous code overwrote any user-supplied min_samples unconditionally.
    if min_samples is None:
        if 2 * n_dims > n_samples:
            min_samples = min(n_dims, 4)
        elif n_samples < 1000:
            min_samples = min(2 * n_dims, n_samples)
        else:
            min_samples = min(5 * n_dims, n_samples)

    if eps is None:
        # The neighbour query needs at least 2 neighbours (column 1 is read
        # below) and cannot ask for more neighbours than there are samples.
        n_neighbors = max(2, min(int(min_samples), n_samples))
        nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
        nearest_neighbors.fit(data)
        distances, _ = nearest_neighbors.kneighbors(data)
        # Sorted distance of every sample to its closest other sample.
        distances = np.sort(distances, axis=0)[:, 1]
        curve = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(curve)
        eps = distances[rotor.get_elbow_index()]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)

    # One bucket per non-noise label; a core sample's label is used directly
    # as the index of its bucket.
    core_sample_indices = [[] for label in np.unique(dbscan.labels_)
                           if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[dbscan.labels_[core_sample_index]].append(
            core_sample_index)
    return dbscan.labels_, core_sample_indices