Beispiel #1
0
def plot_bacteria_as_clusters(data: pd.DataFrame,
                              save_path: Path,
                              save_fig: bool = False,
                              time_point=None):
    """Show the bacteria positions in 3D, first raw and then coloured by OPTICS label.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``'position'`` column. ``data['position'][bac][time_point]``
        is indexed with 0, 1 and 2 to read the x, y and z coordinate.
    save_path : Path
        When ``save_fig`` is true the clustered figure is written to
        ``cluster_plot.png`` in the parent directory of this path.
    save_fig : bool
        Save the clustered figure instead of showing it.
    time_point : int, optional
        Index of the time step to plot; the last one is used when omitted.
    """
    if time_point is None:
        # default to the last time step
        time_point = -1

    # Collect one [x, y, z] row per bacterium. The plots and the clustering
    # must operate on these rows, not on the DataFrame itself.
    positions = []
    for bac in data['position'].index:
        point = data['position'][bac][time_point]
        positions.append([point[0], point[1], point[2]])
    xs = [p[0] for p in positions]
    ys = [p[1] for p in positions]
    zs = [p[2] for p in positions]

    # Raw positions.
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xs, ys, zs, s=30)
    ax.view_init(azim=200)
    plt.show()

    model = OPTICS(min_samples=2, metric='euclidean')
    model.fit(positions)

    # Positions coloured by cluster label.
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(xs, ys, zs, c=model.labels_, s=30)
    ax.view_init(azim=200)
    if save_fig:
        # Save before any show(); showing first would leave nothing to save.
        path = Path(save_path).parent / 'cluster_plot.png'
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()
Beispiel #2
0
    def sort_bacteria_in_cluster(self):
        """
        Group the bacteria of the biofilm by their OPTICS cluster label.

        The transposed ``self.position_matrix`` is clustered and every entry of
        ``self.bacteria`` is appended to the list selected by its label.

        :return: list of lists, one per distinct label, holding the bacteria
        :raises ValueError: if the number of grouped bacteria differs from
            ``len(self.bacteria)``
        """
        samples = self.position_matrix.transpose()

        optics = OPTICS(min_samples=2, metric='euclidean')
        optics.fit_predict(samples)
        labels = optics.labels_

        group_count = len(np.unique(labels))
        clusters = [[] for _ in range(group_count)]
        # The label is used directly as list index, so a label of -1 selects
        # the last list.
        for bacterium, label in zip(self.bacteria, labels):
            clusters[label].append(bacterium)

        # Make sure every bacterium ended up in some group.
        assigned = 0
        for group in clusters:
            assigned = assigned + len(group)
        if assigned != len(self.bacteria):
            raise ValueError(f"{abs(assigned - len(self.bacteria))} bacteria where not sorted in a cluster.")

        return clusters
Beispiel #3
0
def optics_clustering(principal_components, principal_df):
    """Cluster the principal components with OPTICS and plot each cluster.

    The labels are stored in a ``'Segment'`` column of a frame built from
    ``principal_df``; the columns 0, 1, 2 and ``'y'`` are renamed to ``PC1``,
    ``PC2``, ``PC3`` and ``Race``. The frame is printed, handed to
    ``add_race_labels`` and returned; ``calc_silhouette`` is called with the
    components, the labels and the number of distinct labels.
    """
    final_df = pd.concat([principal_df], axis=1)

    optics = OPTICS(eps=5, min_samples=2)
    labels = optics.fit_predict(principal_components)
    cluster_ids = unique(labels)
    final_df['Segment'] = optics.labels_

    # One scatter call per label over the first two components.
    for cluster_id in cluster_ids:
        members = where(labels == cluster_id)
        first_axis = principal_components[members, 0]
        second_axis = principal_components[members, 1]
        plt.scatter(first_axis, second_axis, s=75)

    column_names = {0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'}
    final_df.rename(column_names, axis=1, inplace=True)
    print(final_df)

    plt.title("OPTICS Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=labels,
                    n_clusters=len(cluster_ids))
    return final_df
Beispiel #4
0
def find_cluster_indices(output_seqs, batch_size, datatype="train_y"):
    """Cluster the output sequences with OPTICS and index them by cluster.

    ``output_seqs`` is converted with ``convert_to_array`` and clustered. The
    sequences and their labels are written to
    ``data/generated_files/clustered_output_seqs_data_<datatype>.csv``.

    Parameters
    ----------
    output_seqs : sequence
        Sequences to cluster; indexed by position.
    batch_size :
        Not used by this function.
    datatype : str
        Tag used in the progress message and in the CSV file name.

    Returns
    -------
    tuple
        ``(cluster_labels, cluster_indices_dict, scatter_df)`` where the dict
        maps each label to the list of sample positions carrying it.
    """
    print("Clustering {}".format(datatype))
    features = convert_to_array(output_seqs)
    clustering_type = OPTICS(min_samples=2, min_cluster_size=2)
    cluster_labels = clustering_type.fit_predict(features)
    print("Number of clusters: {}".format(len(set(cluster_labels))))

    cluster_indices_dict = dict()
    for i, label in enumerate(cluster_labels):
        cluster_indices_dict.setdefault(label, list()).append(i)

    sequences = [output_seqs[i] for i in range(len(cluster_labels))]
    scatter_df = pd.DataFrame(list(zip(sequences, cluster_labels)),
                              columns=["output_seqs", "clusters"])
    scatter_df.to_csv(
        "data/generated_files/clustered_output_seqs_data_{}.csv".format(
            datatype))
    return cluster_labels, cluster_indices_dict, scatter_df
Beispiel #5
0
def get_optics(data):
    """Cluster the standardised values of ``data`` with OPTICS.

    The labels are stored in a ``"cluster"`` column of the passed frame, which
    is modified in place and returned.
    """
    features = data.iloc[:, 0:].values
    scaled = StandardScaler().fit_transform(features)
    model = OPTICS(min_samples=50)
    data["cluster"] = model.fit_predict(scaled)
    return data
def visual(c, X, y):
    """Cluster ``X`` with OPTICS, print evaluations and plot truth vs. prediction.

    Parameters
    ----------
    c :
        Not used by this function.
    X : array
        Samples; columns 0 and 1 are plotted.
    y : array
        Reference labels, passed to ``evaluation_labels`` and used for the
        first ("Dataset") figure.
    """
    from sklearn.cluster import OPTICS
    cluster_object = OPTICS(min_cluster_size=100)
    y_pred = cluster_object.fit_predict(X)

    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    # Reference labelling: iterate the labels that occur in ``y``. Iterating
    # the predicted labels here would drop classes OPTICS did not produce.
    for true_label in np.unique(y):
        row_idx = np.where(y == true_label)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(true_label))
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()

    # Predicted labelling.
    plt.figure()
    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
    plt.title('Cluster')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
Beispiel #7
0
def cluster_proteins_by_sim(prot_graph_fname):
    """Cluster a pickled protein graph with OPTICS and save two 2D scatter plots.

    The pickle at ``prot_graph_fname`` must unpack into ``(nodes, adj_mat)``.
    ``adj_mat`` is clustered, the label counts are printed, and the matrix is
    embedded once with ``eGTM`` and once with ``TSNE``. Each embedding is
    written as a PNG next to the input file.
    """
    print('here')
    # NOTE(review): pickle.load runs arbitrary code from the file; only use
    # with trusted inputs.
    with open(prot_graph_fname, 'rb') as handle:
        _nodes, adj_mat = pkl.load(handle)

    optics = OPTICS(min_cluster_size=5, n_jobs=-1)
    clusters = optics.fit_predict(adj_mat)
    print(Counter(clusters))

    out_dir = os.path.dirname(prot_graph_fname)

    def _save_embedding(embedder, image_name):
        # Embed, colour by cluster label and write the figure to disk.
        xs, ys = embedder.fit_transform(adj_mat).T
        palette = plt.get_cmap('jet', np.max(clusters) + 2)
        palette.set_under('gray')

        _figure, axes = plt.subplots()
        axes.scatter(xs, ys, c=clusters, s=10, cmap=palette)
        plt.savefig(os.path.join(out_dir, image_name))
        plt.close()

    _save_embedding(eGTM(), 'protein_egtm_clusters.png')
    _save_embedding(TSNE(n_components=2, n_iter_without_progress=10),
                    'protein_tsne_clusters.png')
def exploratory_analysis(dataset: str, samples=0.1, eps=np.inf) -> None:
    """Print the number of OPTICS clusters found in a standardised CSV dataset.

    Parameters
    ----------
    dataset : str
        Path of a comma-separated numeric file read with ``np.genfromtxt``.
    samples :
        Passed to OPTICS as ``min_samples``.
    eps :
        Passed to OPTICS as ``max_eps``.
    """
    X = np.genfromtxt(dataset, delimiter=',', encoding='utf8')
    scaler = StandardScaler(copy=False)
    X_transformed = scaler.fit_transform(X)
    clust = OPTICS(min_samples=samples, max_eps=eps, n_jobs=2)
    # Cluster the scaled data explicitly instead of relying on copy=False
    # having modified X in place.
    labels = clust.fit_predict(X_transformed)
    # -1 marks noise samples and is not a cluster.
    n_clusters = len(set(labels) - {-1})
    print("# clusters: {0}".format(n_clusters))
Beispiel #9
0
class OPTICS_algo_wrapper:
    """Adapter exposing ``fit`` and ``predict`` on top of an OPTICS estimator."""

    def __init__(self):
        settings = {
            'min_samples': 1,
            'max_eps': 2,
            'metric': 'cosine',
            'cluster_method': 'dbscan',
        }
        self.wrapped = OPTICS(**settings)

    def fit(self, data):
        """Fit the wrapped estimator on ``data`` and return the result of its ``fit``."""
        fitted = self.wrapped.fit(data)
        return fitted

    def predict(self, data):
        """Return the labels of ``data``; this calls ``fit_predict`` and so refits."""
        labels = self.wrapped.fit_predict(data)
        return labels
Beispiel #10
0
class OPTICS_algo_wrapper:
    """Adapter around an OPTICS estimator that remembers the fitted data and labels."""

    def __init__(self):
        settings = {'min_samples': 5, 'xi': .05, 'min_cluster_size': .05}
        self.wrapped = OPTICS(**settings)
        # Filled in by fit().
        self.data = []
        self.indexes = []

    def fit(self, data):
        """Fit on ``data``; store the data in ``self.data`` and the labels in ``self.indexes``."""
        estimator = self.wrapped
        estimator.fit(data)
        self.data = data
        self.indexes = estimator.labels_

    def predict(self, data):
        """Return the labels of ``data``; this calls ``fit_predict`` and so refits."""
        labels = self.wrapped.fit_predict(data)
        return labels
Beispiel #11
0
def cluster_manifold_in_embedding(hl, y, manifold_learner, umap_min_dist, umap_metric, umap_dim, umap_neighbors, n_clusters, cluster):
    """Embed ``hl`` with a manifold learner and cluster the embedding.

    Parameters
    ----------
    hl :
        Input passed to the manifold learner's ``fit_transform``.
    y :
        Not used by this function.
    manifold_learner : str
        One of ``'UMAP'``, ``'LLE'``, ``'tSNE'`` or ``'isomap'``.
    umap_min_dist, umap_metric :
        ``min_dist`` and ``metric`` for UMAP (ignored by the other learners).
    umap_dim :
        Number of embedding components, used by every learner.
    umap_neighbors :
        ``n_neighbors`` for UMAP and LLE (isomap always uses 5).
    n_clusters :
        Number of clusters for ``'GMM'``, ``'KM'`` and ``'SC'``.
    cluster : str
        One of ``'GMM'``, ``'KM'``, ``'SC'``, ``'DBSCAN'`` or ``'OPTICS'``.

    Returns
    -------
    numpy.ndarray
        Predicted label per sample.

    Raises
    ------
    ValueError
        If ``manifold_learner`` or ``cluster`` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound local later.
    supported_learners = ('UMAP', 'LLE', 'tSNE', 'isomap')
    supported_clusterers = ('GMM', 'KM', 'SC', 'DBSCAN', 'OPTICS')
    if manifold_learner not in supported_learners:
        raise ValueError(
            f"Unknown manifold_learner {manifold_learner!r}; expected one of {supported_learners}.")
    if cluster not in supported_clusterers:
        raise ValueError(
            f"Unknown cluster {cluster!r}; expected one of {supported_clusterers}.")

    # find manifold on autoencoded embedding
    if manifold_learner == 'UMAP':
        md = float(umap_min_dist)
        hle = umap.UMAP(random_state=0, metric=umap_metric, n_components=umap_dim, n_neighbors=umap_neighbors, min_dist=md).fit_transform(hl)
    elif manifold_learner == 'LLE':
        hle = LocallyLinearEmbedding(n_components=umap_dim, n_neighbors=umap_neighbors).fit_transform(hl)
    elif manifold_learner == 'tSNE':
        hle = TSNE(n_components=umap_dim, n_jobs=16, random_state=0, verbose=0).fit_transform(hl)
    else:  # 'isomap'
        hle = Isomap(n_components=umap_dim, n_neighbors=5).fit_transform(hl)

    # clustering on new manifold of autoencoded embedding
    if cluster == 'GMM':
        gmm = mixture.GaussianMixture(covariance_type='full', n_components=n_clusters, random_state=0)
        gmm.fit(hle)
        y_pred_prob = gmm.predict_proba(hle)
        y_pred = y_pred_prob.argmax(1)
    elif cluster == 'KM':
        km = KMeans(init='k-means++', n_clusters=n_clusters, random_state=0, n_init=20)
        y_pred = km.fit_predict(hle)
    elif cluster == 'SC':
        sc = SpectralClustering(n_clusters=n_clusters, random_state=0, affinity='nearest_neighbors')
        y_pred = sc.fit_predict(hle)
    elif cluster == 'DBSCAN':
        db = DBSCAN()
        y_pred = db.fit_predict(hle)
    else:  # 'OPTICS'
        op = OPTICS()
        y_pred = op.fit_predict(hle)

    y_pred = np.asarray(y_pred)

    # Separator lines kept exactly as printed before (160 and 80 characters).
    print('=='*80)
    print('=' * 80)

    return y_pred
class OPTICSModel(ClusteringModel):
    """Clustering model backed by an OPTICS estimator."""

    def __init__(self, **params):
        """Create the model; ``params`` are forwarded to the OPTICS constructor."""
        super().__init__('optics')

        self.model = OPTICS(**params)

    def perform_clustering(self, features, **params):
        """Fit OPTICS on ``features`` and return them with a ``cluster`` column.

        ``params`` are forwarded to the estimator's ``fit``. The result is
        ``features`` concatenated column-wise with the labels.
        """
        self.model.fit(features, **params)

        # Reuse the labels of the fit above instead of fitting a second time,
        # and share the index of ``features`` so the concat aligns row by row.
        labels = pd.DataFrame({'cluster': list(self.model.labels_)},
                              index=features.index)
        return pd.concat([features, labels], axis=1)
Beispiel #13
0
 def _get_class(self, im):
     minmax = (im.min(), im.max())
     if minmax[1] - minmax[0] == 0:
         return np.array()
     im = (im - minmax[0]) / (minmax[1] - minmax[0])
     clf = OPTICS(metric='euclidean', min_cluster_size=75)
     a = []
     for x in range(im.shape[0]):
         for y in range(im.shape[1]):
             if im[x][y] > self.sentence_threshold:
                 a.append([x, y])
     b = clf.fit_predict(a)
     c = np.zeros(im.shape)
     for i in range(len(b)):
         c[a[i][0], a[i][1]] = b[i] + 1
     return c
Beispiel #14
0
    def bin(self,
            min_samples=5,
            max_eps=np.inf,
            metric='euclidean',
            p=2,
            metric_params=None,
            cluster_method='xi',
            eps=None,
            xi=0.05,
            predecessor_correction=True,
            min_cluster_size=None,
            algorithm='auto',
            leaf_size=30,
            n_jobs=10,
            **kwargs):
        """Cluster ``self.embedding_df`` with OPTICS and write the labels to a TSV file.

        All named parameters are forwarded to the OPTICS constructor; extra
        keyword arguments are accepted and ignored. The labels are written to
        ``<self.prefix>_optics.tsv`` inside ``self.output_dir``.

        Returns a one-column (``cluster``) DataFrame indexed like
        ``self.embedding_df``.
        """
        optics_options = dict(
            min_samples=min_samples,
            max_eps=max_eps,
            metric=metric,
            p=p,
            metric_params=metric_params,
            cluster_method=cluster_method,
            eps=eps,
            xi=xi,
            predecessor_correction=predecessor_correction,
            min_cluster_size=min_cluster_size,
            algorithm=algorithm,
            leaf_size=leaf_size,
            n_jobs=n_jobs,
        )
        labels = OPTICS(**optics_options).fit_predict(self.embedding_df)

        cluster_df = pd.DataFrame(data=labels.transpose(),
                                  columns=['cluster'],
                                  index=self.embedding_df.index)

        # Persist the labels next to the other outputs.
        tsv_name = self.prefix + "_optics.tsv"
        tsv_path = os.path.join(self.output_dir, tsv_name)
        cluster_df.to_csv(tsv_path, sep="\t", header=True, index=True)

        return cluster_df
Beispiel #15
0
async def run(hub, config, pipe, data, train):
    '''
    Run OPTICS for the given pipe.

    An estimator is created on first use of ``pipe`` and cached in
    ``hub.models.optics.COMPS``. When ``train`` is truthy the estimator is
    fitted on it; when ``data`` is truthy the labels from ``fit_predict(data)``
    are returned as a list, otherwise an empty list is returned.
    '''
    registry = hub.models.optics.COMPS
    if pipe not in registry:
        optics_config = config.get('optics', {})
        estimator = OPTICS(n_jobs=optics_config.get('n_jobs', -1))
        print('Created OPTICS machine learning object:\n', estimator)
        registry[pipe] = {'mlo': estimator}

    estimator = registry[pipe]['mlo']
    if train:
        print(f'Training {len(train)} samples')
        estimator.fit(train)
    if not data:
        return []
    print(f'Predicting {len(data)} samples')
    return list(estimator.fit_predict(data))
Beispiel #16
0
def clusters_partition(df, color, size):
    '''Label the ``x``/``y`` points of ``df`` with OPTICS hubs.

    Adds three columns to ``df`` and returns it: ``hub`` (the OPTICS label),
    ``hub_group`` (label floor-divided by ``size``) and ``hub_color`` (label
    modulo ``size``, mapped through ``color``). Rows with a negative label get
    -1 in both derived columns before the colour mapping.
    '''
    model = OPTICS(cluster_method='xi',
                   metric='cityblock',
                   xi=0.05,
                   min_cluster_size=None,
                   max_eps=np.inf,
                   n_jobs=None)
    coordinates = df[['x', 'y']].values
    df['hub'] = model.fit_predict(X=coordinates)

    unassigned = df['hub'] < 0

    df['hub_group'] = df['hub'] // size
    df.loc[unassigned, 'hub_group'] = -1

    df['hub_color'] = df['hub'] % size
    df.loc[unassigned, 'hub_color'] = -1
    df['hub_color'] = df['hub_color'].map(color)

    return df
def optics(data, name_list, data_name, result_path, vis_path):
    """Cluster ``data`` with OPTICS, then save and visualise the labels.

    ``"optics/"`` is appended to ``result_path`` and ``vis_path``; both
    directories are created if missing. The labels and the largest label are
    passed to ``save_result`` and ``visualization``.
    """
    print("Start OPTICS clustering..")

    model = OPTICS(min_samples=10)
    # Fit once; fit_predict returns the same labels that end up in labels_.
    predict = model.fit_predict(data)
    k = max(predict)

    result_path = result_path + "optics/"
    os.makedirs(result_path, exist_ok=True)
    vis_path = vis_path + "optics/"
    os.makedirs(vis_path, exist_ok=True)

    save_result(name_list, predict, result_path, k)
    visualization(data, predict, data_name, vis_path, k)
    print("Done.\n")
Beispiel #18
0
 def _get_class(self, im, size=128):
     minmax = (im.min(), im.max())
     if minmax[1]-minmax[0] == 0:
         return np.array([])
     im = (im-minmax[0]) / (minmax[1]-minmax[0])
     sc = cv2.resize(im, (size,size), interpolation=cv2.INTER_NEAREST)
     clf = OPTICS(max_eps=5, metric='euclidean', min_cluster_size=75)
     a = []
     for x in range(sc.shape[0]):
         for y in range(sc.shape[1]):
             if sc[x][y] > 0.01:
                 a.append([x,y])
     b = clf.fit_predict(a)
     p = {v:k for k,v in enumerate(set(b))}
     b = [p[j] for j in b]
     c = np.zeros(sc.shape, dtype=np.int32)
     for i in range(len(b)):
         c[a[i][0],a[i][1]] = b[i]+1
     c = cv2.resize(c, (im.shape[1], im.shape[0]), interpolation=cv2.INTER_NEAREST)
     return c
Beispiel #19
0
def optics_mins(ecfp_data):
    """Plot classifier scores and label entropy for OPTICS ``min_samples`` 2 to 9.

    For each ``min_samples`` value the data is clustered with the
    ``tanimoto_dist`` metric and split 80/20 with the labels as targets. The
    results of the four ``*_classification`` helpers and the
    ``shannon_entropy`` of the labels are recorded. The classifier results go
    on the left axis and the entropy as a dashed black line on a twin axis.
    """
    classifier_names = ('NN', 'SVM', 'LDA', 'RF')
    tried_values = []
    scores = {name: [] for name in classifier_names}
    entropies = []

    for min_samples in range(2, 10):
        optics = OPTICS(min_samples=min_samples, metric=tanimoto_dist)
        labels = optics.fit_predict(ecfp_data)
        # split is (X_train, X_test, y_train, y_test)
        split = train_test_split(ecfp_data,
                                 labels,
                                 test_size=0.2,
                                 random_state=0)
        tried_values.append(min_samples)
        scores['NN'].append(nn_classification(*split))
        scores['SVM'].append(svm_classification(*split))
        scores['LDA'].append(lda_classification(*split))
        scores['RF'].append(rf_classification(*split))
        entropies.append(shannon_entropy(labels))

    fig, ax = plt.subplots()
    for name in classifier_names:
        ax.plot(tried_values, scores[name], label=name)
    ax.set_xlabel('Minimal Samples')
    ax.set_ylabel('Accuracy Rate')

    entropy_ax = ax.twinx()
    entropy_ax.plot(tried_values, entropies, '--', color='black')
    entropy_ax.set_ylabel('Shannon Entropy')

    ax.set_title("Hyperparameter Tuning for OPTICS Clustering")
    ax.legend(loc='lower right')
    plt.margins(0.02)
    ax.set_ylim([0, 1])
    entropy_ax.set_ylim([0, 1])
    plt.show()
def compute_optics(scooter_data, facil_bndry, facilities_path=None):
    """Cluster scooter trip coordinates with OPTICS and plot them over two outlines.

    Parameters
    ----------
    scooter_data : iterable
        Trips; ``trip['xy'].x`` and ``trip['xy'].y`` are used as coordinates.
    facil_bndry :
        Geometry whose ``exterior.xy`` is filled and outlined.
    facilities_path : str, optional
        Pickle file holding a second geometry whose ``exterior.xy`` is
        outlined. Defaults to the path that was previously hard-coded.
    """
    if facilities_path is None:
        facilities_path = '/Users/BrandonHall/Documents/GitHub/SUMDScrapeAndAnalysis/DC_Outlines/DCboundCoords.pkl'
    # NOTE(review): pickle.load runs arbitrary code from the file; only use
    # with trusted inputs.
    with open(facilities_path, 'rb') as f:
        facilities = pickle.load(f)

    X = np.array([[trip['xy'].x, trip['xy'].y] for trip in scooter_data])
    print("Shape of X", X.shape)
    clust = OPTICS(min_samples=50, xi=.005, max_eps=.1, min_cluster_size=.005)
    # Run the fit
    labels = clust.fit_predict(X)
    unique_labels = set(labels)
    print('there are ' + str(len(unique_labels) - 1) + ' clusters')

    # graph clusters
    plt.figure(figsize=(7, 7))
    colors = [
        plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
    ]
    plt.fill(*facil_bndry.exterior.xy, c='gold', alpha=0.3)
    plt.plot(*facil_bndry.exterior.xy)
    plt.plot(*facilities.exterior.xy)

    # One colour per real cluster; noise (-1) is drawn separately below.
    cluster_ids = sorted(unique_labels - {-1})
    for klass, color in zip(cluster_ids, colors):
        Xk = X[labels == klass]
        # NOTE(review): drawn as connected lines, as before; confirm whether
        # point markers were intended.
        plt.plot(Xk[:, 0], Xk[:, 1], color=color, alpha=0.9)
    plt.plot(X[labels == -1, 0],
             X[labels == -1, 1],
             'k+',
             alpha=0.5)

    plt.axis('equal')
    plt.show()
Beispiel #21
0
# optics clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import OPTICS
from matplotlib import pyplot
# Build a synthetic two-feature dataset of 1000 samples (fixed random_state,
# so the data is reproducible); the class targets are discarded.
X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
# Define the model.
# NOTE(review): cluster_method is left at its default here; confirm against
# the scikit-learn docs whether `eps` has any effect in that mode.
model = OPTICS(eps=0.8, min_samples=10)
# Fit the model and get one cluster label per sample.
yhat = model.fit_predict(X)
# Distinct labels that were assigned.
clusters = unique(yhat)
# Draw one scatter series per label so each gets its own colour.
for cluster in clusters:
    # row indexes of the samples carrying this label
    row_ix = where(yhat == cluster)
    # plot the two features of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# Show the plot.
pyplot.show()
Beispiel #22
0
  
  vectors = []
  for word in sentence:
    if word in model:
      vectors.append(model[word])

  df_vectors = pd.DataFrame(vectors)
  # Wortweise Durchschnitt bilden, sodass der ganze Satz einen einzigen "Durchschnitts-Wortvektor" erhält
  mean_vector = df_vectors.mean(axis=0).values.tolist()

  entry_vectors.append(mean_vector)

df['vector'] = entry_vectors

# Clustering: one OPTICS label per entry vector.
xi = .07
clust = OPTICS(min_samples=2, xi=xi)
labels = clust.fit_predict(entry_vectors)
df['label'] = labels

# Do not truncate long strings. None means "no limit"; the former value -1
# is deprecated and rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)

# Select the columns to show.
df = df.filter(items=['label', 'feed', 'entry'])
# Drop rows that were not assigned to a cluster.
df = df[df['label'] >= 0]
# Sort by label.
df = df.sort_values(by='label')

print(df.to_string())
# Drop entries whose first value is NaN or whose first or second value is 1.
# Keys and values are filtered together as pairs. Filtering ListVal first and
# then zipping the shortened list with the full ListKey would pair keys with
# the wrong values.
_kept_pairs = [
    (key, val) for key, val in zip(ListKey, ListVal)
    if not math.isnan(val[0]) and (val[0] != 1 and val[1] != 1)
]
ListKey = [key for key, _ in _kept_pairs]
ListVal = [val for _, val in _kept_pairs]

DictF = dict(zip(ListKey, ListVal))

################################################################

opt = OPTICS(min_samples=14)
y_opt = opt.fit_predict(ListVal)
ListKeyN = np.array(ListKey)
print(Counter(y_opt))
#print("Noise: {0}".format(ListKeyN[y_opt==-1]))
# Prints the keys that were assigned label 20.
print("1-th cluster: {0}".format(ListKeyN[y_opt == 20]))
'''
import matplotlib.pyplot as plt
countClust = []
Noise = []
for i in range(8, 109):
    opt = OPTICS(min_samples=i)
    y_opt = opt.fit_predict(ListVal)
    countClust.append(max(Counter(y_opt).keys()))
    Noise.append(Counter(y_opt)[-1])

plt.plot(countClust, range(8, 109), marker='o')
Beispiel #24
0
def cluster(img,
            eps,
            min_samples,
            backend="dbscan",
            nthreads=2,
            fit_kind="circle"):
    """
    Cluster groups of pixels and fit a circle or ellipse to each cluster.

    Parameters:
    ----------
    img : np.ndarray
        Input image. Must be binary.
    eps : float
        Maximum distance allowed to form a cluster (not used by HDBSCAN).
    min_samples : int
        Minimum number of samples to form a cluster.
    backend : str
        Which backend to use for clustering: "dbscan" (default), "optics"
        or "hdbscan". Case-insensitive.
    nthreads : int
        Number of parallel jobs handed to the backend.
    fit_kind : str
        What type of geometry to fit to the clusters: "circle" (default) or
        "ellipse". If the ellipse fit fails, a circle is fitted instead.

    Returns:
    -------
    df : pd.DataFrame
        One row per cluster with the columns "ic", "jc", "pixels", "ir",
        "jr", "theta_ij" and "cluster". An empty dataframe is returned when
        the image has no more than ``min_samples`` non-zero pixels.

    Raises:
    ------
    ValueError
        If ``backend`` or ``fit_kind`` is not supported.
    """

    def _bounding_circle(points):
        # Minimal bounding circle as (xc, yc, r1, r2, theta); r2 and theta
        # only carry meaning for ellipses.
        center, radius_squared = miniball.get_bounding_ball(points)
        xc, yc = center
        radius = np.sqrt(radius_squared)
        return xc, yc, radius, radius, 0

    ipx, jpx = np.where(img)  # indices of the non-zero pixels
    X = np.vstack([ipx, jpx]).T

    if len(X) <= min_samples:
        return pd.DataFrame()

    backend_name = backend.lower()
    if backend_name == "optics":
        db = OPTICS(cluster_method="dbscan",
                    metric="euclidean",
                    eps=eps,
                    max_eps=eps,
                    min_samples=min_samples,
                    min_cluster_size=min_samples,
                    n_jobs=nthreads,
                    algorithm="ball_tree").fit(X)
        labels = db.labels_
    elif backend_name == "hdbscan":
        db = hdbscan.HDBSCAN(min_cluster_size=int(min_samples),
                             metric="euclidean",
                             allow_single_cluster=True,
                             core_dist_n_jobs=nthreads)
        labels = db.fit_predict(X)
    elif backend_name == "dbscan":
        db = DBSCAN(eps=eps,
                    metric="euclidean",
                    min_samples=min_samples,
                    n_jobs=nthreads,
                    algorithm="ball_tree").fit(X)
        labels = db.labels_
    else:
        raise ValueError("Use either DBSCAN, HDBSCAN or OPTICS.")

    # to dataframe; negative labels are dropped
    df = pd.DataFrame(X, columns=["j", "i"])
    df["cluster"] = labels
    df = df[df["cluster"] >= 0]

    # get centers and radii
    cluster = []
    i_center = []
    j_center = []
    n_pixels = []
    R1 = []
    R2 = []
    theta = []
    for cl, gdf in df.groupby("cluster"):
        points = gdf[["i", "j"]].values.astype(float)

        if fit_kind == "circle":
            xc, yc, r1, r2, t = _bounding_circle(points)
        elif fit_kind == "ellipse":
            try:
                # compute the minimum bounding ellipse
                A, c = mvee(points)
                # centroid
                xc, yc = c
                # radius, angle and eccentricity
                r1, r2, t, _ = get_ellipse_parameters(A)
            except Exception:
                # deliberate best effort: fall back to a circle
                xc, yc, r1, r2, t = _bounding_circle(points)
        else:
            raise ValueError("Can only fit data to circles or ellipses.")

        # append to output
        i_center.append(xc)
        j_center.append(yc)
        cluster.append(cl)
        n_pixels.append(len(gdf))
        R1.append(r1)
        R2.append(r2)
        theta.append(t)

    # to dataframe
    x = np.vstack([i_center, j_center, n_pixels, R1, R2, theta, cluster]).T
    columns = ["ic", "jc", "pixels", "ir", "jr", "theta_ij", "cluster"]
    df = pd.DataFrame(x, columns=columns)

    return df
# Store the current predictions (`previsoes`, computed before this excerpt)
# as the class of each row. The chart titles below label them as DBSCAN output.
data['resultados'] = previsoes
# Bar chart of the per-class mean of every remaining column.
data.groupby("resultados").aggregate("mean").plot.bar()
plt.title('Algoritmo: DBSCAN')
plt.legend(['Matemática','Leitura','Escrita'])
plt.xlabel('Classes')
plt.ylabel('Nota média')
plt.show()


# Histogram of how many rows fall into each class.
plt.hist(data['resultados'])
plt.xlabel('Classes')
plt.ylabel('Quantidade')
plt.show()

# Cluster `scores` with OPTICS and count the rows per label.
clust = OPTICS(min_samples=20, min_cluster_size=15)
previsoes = clust.fit_predict(scores)
unicos, quantidade = np.unique(previsoes, return_counts = True)
# The 20 printed here is a literal; keep it in sync with min_samples above.
print("Optics com Min Samples {0}".format(20))
# Three clustering quality scores computed from the fitted labels.
print("Coeficiente de Silhueta média: %0.3f" % sklearn.metrics.silhouette_score(scores, clust.labels_))
print("Coeficiente de Davies Bouldin: %0.3f" % sklearn.metrics.davies_bouldin_score(scores, clust.labels_))
print("Coeficiente de Calinski Harabasz: %0.3f\n" % sklearn.metrics.calinski_harabasz_score(scores, clust.labels_))
# Report the size of every label.
for u, q in zip(unicos, quantidade):
    print("Classe {0}:\t{1} elementos na classe".format(u,q))
# Overwrite the class column with the OPTICS labels and keep a reference.
data['resultados'] = previsoes
resultadoOPTICS = data['resultados']

# Same per-class mean bar chart as above, now for the OPTICS labels.
data.groupby("resultados").aggregate("mean").plot.bar()
plt.title('Algoritmo: OPTICS')
plt.legend(['Matemática','Leitura','Escrita'])
plt.xlabel('Classes')
plt.ylabel('Nota média')
# Time DBSCAN on `clData`: four runs (range(1, 5)), wall-clock seconds each.
from sklearn.cluster import DBSCAN
times = []
for i in range(1, 5):
    start = time.time()
    dbscanClustering = DBSCAN(eps=5, min_samples=6).fit_predict(clData)
    end = time.time()
    times.append(end - start)
# `average` is defined outside this excerpt — presumably the mean; verify.
dbscanTime = average(times)

##############      OPTICS       ##############
# Same timing loop for OPTICS; the timed span covers both construction of the
# estimator and fit_predict.
from sklearn.cluster import OPTICS
times = []
for i in range(1, 5):
    start = time.time()
    opticsClustering = OPTICS(min_samples=50, xi=0.05, max_eps=10)
    opticsLabels = opticsClustering.fit_predict(clData)
    end = time.time()
    times.append(end - start)
opticsTime = average(times)

###########       Hierarchical Clustering         ##############
# Same timing loop for agglomerative clustering with 8 clusters.
from sklearn.cluster import AgglomerativeClustering
times = []
for i in range(1, 5):
    start = time.time()
    HierClustering = AgglomerativeClustering(n_clusters=8).fit_predict(clData)
    end = time.time()
    times.append(end - start)
hierarchicalTime = average(times)

###########       Spectral Clustering        ##############
Beispiel #27
0
# Cluster the normalised customer data with OPTICS.
ms = OPTICS(n_jobs=3)
ms.fit(customers_normalized)

# Attach the labels and summarise each cluster.
customers["Cluster"] = ms.labels_
customers.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']
}).round(2)

# Visualise clusters for the OPTICS algorithm

# Scatter plot of the first two features for clusters 0 and 1.
# Reuse the labels of the fit above; calling fit_predict again would repeat
# the whole clustering only to produce the same labels.
y_kmeans = ms.labels_
plt.figure(figsize=(8, 8))
plt.scatter(
    customers_normalized[y_kmeans == 0, 0],
    customers_normalized[y_kmeans == 0, 1],
    # customers_normalized[y_kmeans == 0, 2],
    s=10,
    c='red',
    label='')
plt.scatter(
    customers_normalized[y_kmeans == 1, 0],
    customers_normalized[y_kmeans == 1, 1],
    # customers_normalized[y_kmeans == 1, 2],
    s=10,
    c='blue',
    label='')
Beispiel #28
0
                palette="Accent").set_title("PCA of kMeans analysis")

# Adjusted Rand score between the k-means labels and the "Gate" column.
print(adjusted_rand_score(kmeansClustering["kMeans"], data["Gate"]))

# Release the intermediate objects of the previous step.
del (dataCopy, principalComponents, principalDf)

# OPTICS Clustering
# (the parameters are explained in the scikit-learn documentation)

from sklearn.cluster import OPTICS

optics = OPTICS(min_samples=10, xi=.05, min_cluster_size=5)

# Work on a copy without the "Gate" column so it is not used as a feature.
dataCopy = data.copy()
del (dataCopy["Gate"])
resOptics = optics.fit_predict(dataCopy)
dataCopy["optics"] = resOptics

#sns.pairplot(dataCopy, diag_kind="kde", markers="1", hue = "optics")

# Rows per OPTICS label; the result is not stored or printed here.
dataCopy["optics"].value_counts()

# DBSCAN Clustering

from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=121, min_samples=10)

# Fresh copy without the "Gate" column for the DBSCAN run.
dataCopy = data.copy()
del (dataCopy["Gate"])
Beispiel #29
0
# Six 2D sample points.
X = array([
    [1038, 660],
    [1045, 680],
    [1038, 750],
    [897, 750],
    [807, 780],
    [805, 850],
])

# OPTICS model for the dataset
model = OPTICS(
    eps=0.3,
    min_samples=3,
)

# Fit once and get the label of every point (clusters 0.. and -1 for noise).
# A separate fit() before fit_predict() would only repeat the same work.
y = model.fit_predict(X)
y

clusters = unique(y)

# One scatter series per label.
for cluster in clusters:
    row_ix = where(y == cluster)
    plt.scatter(
        X[row_ix, 0],
        X[row_ix, 1],
    )
plt.show()
Beispiel #30
0
    def smart_group_clashes(self, clash_sets, max_clustering_distance):
        """Group the clashes of every clash set by position using OPTICS.

        Each clash gets a ``"smart_group"`` entry: its OPTICS label, or a
        unique number above all labels when it could not be grouped. The
        groups are then collected per clash set and renamed to
        ``"<clash set name> - <n>"``.

        :param clash_sets: list of dicts with a ``"name"`` and optionally a
            ``"clashes"`` dict whose values carry ``"position"``,
            ``"a_global_id"`` and ``"b_global_id"``
        :param max_clustering_distance: ``max_eps`` for OPTICS; 3 is used
            when the value is not positive
        :return: mapping of clash set name to a list holding one dict of
            ``smart group name -> [[a_global_id, b_global_id], ...]``
        """
        from sklearn.cluster import OPTICS
        from collections import defaultdict

        count_of_input_clashes = 0
        count_of_smart_groups = 0
        count_of_clash_sets = len(clash_sets)

        for clash_set in clash_sets:
            if "clashes" not in clash_set or len(clash_set["clashes"]) == 0:
                self.settings.logger.info(
                    f"Skipping clash set [{clash_set['name']}] since it contains no clash results."
                )
                continue
            clashes = clash_set["clashes"]

            count_of_input_clashes += len(clashes)

            data = np.array([clash["position"] for clash in clashes.values()])

            # INPUTS
            # set the desired maximum distance between the grouped points
            if max_clustering_distance > 0:
                max_distance_between_grouped_points = max_clustering_distance
            else:
                max_distance_between_grouped_points = 3

            if len(clashes) < 2:
                # OPTICS with min_samples=2 cannot run on a single sample;
                # treat the lone clash as ungrouped.
                pred = np.full(len(clashes), -1)
            else:
                model = OPTICS(min_samples=2,
                               max_eps=max_distance_between_grouped_points)
                pred = model.fit_predict(data)

            # Insert the smart groups into the clashes
            if len(pred) == len(clashes):
                first_free_group = np.amax(pred).item() + 1
                for i, (clash, label) in enumerate(zip(clashes.values(), pred)):
                    int_prediction = int(label)
                    if int_prediction == -1:
                        # ungroup this clash since it's a single clash that we
                        # were not able to group; offsetting by its position
                        # keeps the number unique.
                        clash["smart_group"] = first_free_group + i
                    else:
                        clash["smart_group"] = int_prediction

        # Create JSON with smart_groups that contain GlobalIDs
        output_clash_sets = defaultdict(list)
        for clash_set in clash_sets:
            if "clashes" not in clash_set:
                continue
            smart_groups = defaultdict(list)
            for content in clash_set["clashes"].values():
                if "smart_group" in content:
                    # Clash has been grouped, let's extract it.
                    smart_groups[content["smart_group"]].append(
                        [content["a_global_id"], content["b_global_id"]])
            count_of_smart_groups += len(smart_groups)
            output_clash_sets[clash_set["name"]].append(smart_groups)

        # Rename the clash groups to something more sensible
        for clash_set_name, smart_groups in output_clash_sets.items():
            # Only the first entry of each name is renamed. Iterate over a
            # snapshot of the keys because the dict is modified in the loop.
            groups = smart_groups[0]
            for i, smart_group in enumerate(list(groups), start=1):
                groups[f"{clash_set_name} - {i}"] = groups.pop(smart_group)

        count_of_final_clash_sets = len(output_clash_sets)
        self.settings.logger.info(
            f"Took {count_of_input_clashes} clashes in {count_of_clash_sets} clash sets and turned them into {count_of_smart_groups} smart groups in {count_of_final_clash_sets} clash sets"
        )

        return output_clash_sets