def main():
    """Cartoonize an image: SLIC superpixels recoloured by Mean Shift.

    Reads ``IMG_2805.jpg``, keeps every second row and column, averages the
    colour of each SLIC superpixel, clusters those averages with Mean Shift
    and paints every superpixel with the centre of its colour cluster.  The
    result is printed, shown in a window and written to
    ``CartoonedImage.jpg``.

    Raises:
        FileNotFoundError: if the input image cannot be read.
    """
    img = cv2.imread("IMG_2805.jpg")
    # img = cv2.imread("Picture1.png")
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError("IMG_2805.jpg")
    img = img[::2, ::2]

    raw_segments = slic(img, n_segments=500, sigma=5)
    # segments = felzenszwalb(img_float, 100, sigma=5, min_size=50)

    # Remap the label ids onto 0..n-1.  slic does not guarantee that labels
    # start at 0 (recent scikit-image releases start at 1 by default), and
    # both the loop below and the final lookup need dense zero-based ids.
    segment_ids, segments = np.unique(raw_segments, return_inverse=True)
    segments = segments.reshape(raw_segments.shape)
    number_of_segments = len(segment_ids)
    print("SLIC is done", number_of_segments)

    points = np.zeros((number_of_segments, 3))
    for segment in range(number_of_segments):
        # Mean colour of the superpixel, truncated to whole numbers.
        points[segment] = img[segments == segment].mean(axis=0).astype(int)
    print("Points are made")

    ms = MeanShift(bandwidth=1)
    ms.fit_predict(points)
    print("Mean Shift done")

    # superpixel id -> colour cluster id -> cluster centre colour
    output = ms.cluster_centers_[ms.labels_[segments]]
    print(output)
    cartoon = output.astype(np.uint8)
    cv2.imshow("output", cartoon)
    cv2.imwrite("CartoonedImage.jpg", cartoon)
    cv2.waitKey()
    cv2.destroyAllWindows()
Exemple #2
0
def get_ft_field(zeta_res, model, zeta_scope, mode, ft_fields, meth):
    """Cluster the vectors of one half of the zeta words and queue the centres.

    Args:
        zeta_res: DataFrame indexed by word; column ``zeta_scope`` holds the
            scores used for the split.
        model: object exposing ``get_word_vector(word)``.
        zeta_scope: name of the score column.
        mode: 0 selects the words with a positive score, any other value the
            words with a negative score.  Also written to the "Category"
            column of the result.
        ft_fields: queue-like object; receives the result through ``put()``
            and is closed afterwards.
        meth: clustering method, one of "MS", "AP" or "Birch".

    Raises:
        ValueError: if ``meth`` is not one of the supported methods.
    """
    scores = zeta_res[zeta_scope]
    selected = scores[scores > 0] if mode == 0 else scores[scores < 0]

    # A plain ndarray instead of np.matrix: np.matrix is discouraged by NumPy
    # and rejected by recent scikit-learn releases.
    word_matrix = np.array(
        [model.get_word_vector(str(word)) for word in selected.index])

    if meth == "MS":
        clu = MeanShift(n_jobs=-1)
    elif meth == "AP":
        # One preference per word: its own score.
        clu = AffinityPropagation(preference=selected)
    elif meth == "Birch":
        clu = Birch(n_clusters=None)
    else:
        raise ValueError("unknown clustering method: %r" % (meth,))

    clu.fit_predict(word_matrix)
    # Estimators without cluster_centers_ fall back to subcluster_centers_.
    if hasattr(clu, "cluster_centers_"):
        cluster_frame1 = pd.DataFrame(clu.cluster_centers_)
    else:
        cluster_frame1 = pd.DataFrame(clu.subcluster_centers_)

    cluster_frame1["Category"] = mode

    ft_fields.put(cluster_frame1)
    ft_fields.close()
Exemple #3
0
def evaluate_learners(X):
    '''
    Cluster X with several learners, one after the other, so their
    results can be compared.

    This is a generator: each learner is built and fitted only when its
    item is requested.  Every item is a tuple of
        (title, predicted classes)
    '''

    from sklearn.cluster import (AgglomerativeClustering, MeanShift,
                                 MiniBatchKMeans, SpectralClustering)

    # (title, estimator class, constructor arguments) in execution order.
    configurations = (
        # bandwidth=None: let the learner use its own heuristic for
        # determining the number of clusters to create
        ('Mean Shift clusters', MeanShift, {'bandwidth': None}),
        ('K Means clusters', MiniBatchKMeans, {'n_clusters': 2}),
        ('Spectral clusters', SpectralClustering, {'n_clusters': 2}),
        ('Agglomerative clusters (N=2)', AgglomerativeClustering,
         {'n_clusters': 2}),
        ('Agglomerative clusters (N=5)', AgglomerativeClustering,
         {'n_clusters': 5}),
    )

    for title, estimator_cls, params in configurations:
        yield title, estimator_cls(**params).fit_predict(X)
Exemple #4
0
    def CombinedMeanShift(self, h, alpha,
                          PrincComp=None,
                          njobs=-2,
                          mbf=1):
        """Performs the scikit-learn Mean Shift clustering.

        The instance data and the weighted principal components are stacked
        row-wise and clustered together.  Labels, transposed cluster centres
        and per-cluster sizes are stored on the instance.

        Arguments:

        h -- the bandwidth
        alpha -- the weight of the principal components as compared
        to the spatial data.
        PrincComp -- used to pass already-computed principal components;
        when None they are obtained from self.ShapePCA(2)
        njobs -- the number of processes to be used (default: n. of CPU - 1)
        mbf -- the minimum number of items in a seed"""

        MS = MeanShift(bin_seeding=True, bandwidth=h, cluster_all=True,
                       min_bin_freq=mbf, n_jobs=njobs)
        if PrincComp is None:
            PrincComp = self.ShapePCA(2)
        print("Starting sklearn Mean Shift... ")
        stdout.flush()
        fourvector = np.vstack((self.__data, alpha * PrincComp))
        # One sample per row is expected, hence the transpose.
        MS.fit_predict(fourvector.T)
        self.__ClusterID = MS.labels_
        self.__c = MS.cluster_centers_.T
        # Number of members per cluster, ordered by label.  np.unique
        # replaces scipy.stats.itemfreq, which SciPy has deprecated.
        self.__clsizes = np.unique(self.__ClusterID, return_counts=True)[1]
        print("done.")
        stdout.flush()
    def CombinedMeanShift(self, h, alpha,
                          PrincComp=None,
                          njobs=-2,
                          mbf=1):
        """Run scikit-learn Mean Shift on the data stacked with shape components.

        Arguments:

        h -- bandwidth handed to the estimator
        alpha -- factor applied to the principal components before they
        are stacked under the instance data
        PrincComp -- precomputed principal components; when None they are
        obtained from self.ShapePCA(2)
        njobs -- forwarded as n_jobs (default -2)
        mbf -- forwarded as min_bin_freq

        The resulting labels and the transposed cluster centres are stored
        on the instance; nothing is returned."""

        clusterer = MeanShift(bin_seeding=True,
                              bandwidth=h,
                              cluster_all=True,
                              min_bin_freq=mbf,
                              n_jobs=njobs)
        components = self.ShapePCA(2) if PrincComp is None else PrincComp

        print("Starting sklearn Mean Shift... ")
        stdout.flush()

        # Rows are features here, so the samples go in transposed.
        stacked = np.vstack((self.__data, alpha * components))
        clusterer.fit_predict(stacked.T)

        self.__ClusterID = clusterer.labels_
        self.__c = clusterer.cluster_centers_.T

        print("done.")
        stdout.flush()
def meanshift(data):
    """Tune Mean Shift with hyperopt (TPE) and fit the best configuration.

    The bandwidth is searched within +/-50% of the value returned by
    ``estimate_bandwidth``; when that estimate is not positive the fixed
    range 0.1..1.5 is used instead.  ``min_bin_freq`` is searched over
    1..29.  100 evaluations are run for fewer than 1000 samples, 30
    otherwise.

    Returns:
        A tuple ``(best, labels, silhouette, model)``: the raw result of
        ``fmin``, the predicted labels, their silhouette score and the
        fitted estimator.
    """
    bandwidth = estimate_bandwidth(data)
    if bandwidth <= 0:
        low, high = 0.1, 1.5
    else:
        low, high = bandwidth - bandwidth / 2, bandwidth + bandwidth / 2
    space = {
        'bandwidth': hp.uniform('bandwidth', low, high),
        'min_bin_freq': hp.choice('min_bin_freq', range(1, 30)),
    }

    algo = partial(tpe.suggest, n_startup_jobs=10)
    max_evals = 100 if data.shape[0] < 1000 else 30
    best = fmin(hyper_meanshift, space, algo=algo, max_evals=max_evals)

    # For hp.choice, fmin reports the index of the chosen option rather than
    # its value; the options start at 1, hence the + 1.
    model = MeanShift(bandwidth=best['bandwidth'],
                      min_bin_freq=int(best['min_bin_freq'] + 1))
    # Fit once and reuse the labels instead of refitting for every element
    # of the returned tuple.
    labels = model.fit_predict(data)
    return best, labels, sil_score(data, labels), model
Exemple #7
0
def _cluster_w2v_words(scores, model, meth, category):
    """Cluster the vectors of the words indexing ``scores``; return the centres."""
    vecs = []
    kept = []  # positions of the words that have a vector
    for position, word in enumerate(scores.index):
        try:
            vecs.append(model[word])
        except KeyError:
            # Word unknown to the embedding model: leave it out.
            continue
        kept.append(position)
    # A plain ndarray instead of np.matrix: np.matrix is discouraged by NumPy
    # and rejected by recent scikit-learn releases.
    word_matrix = np.array(vecs)

    if meth == "MS":
        clu = MeanShift(bandwidth=1, n_jobs=-1)
    elif meth == "AP":
        # Preferences must line up with the rows of word_matrix, so only
        # the scores of the words that were actually embedded are passed.
        clu = AffinityPropagation(preference=scores.iloc[kept])
    elif meth == "Birch":
        clu = Birch(n_clusters=None)
    else:
        raise ValueError("unknown clustering method: %r" % (meth,))

    clu.fit_predict(word_matrix)
    # Estimators without cluster_centers_ fall back to subcluster_centers_.
    if hasattr(clu, "cluster_centers_"):
        frame = pd.DataFrame(clu.cluster_centers_)
    else:
        frame = pd.DataFrame(clu.subcluster_centers_)
    frame["category"] = category
    return frame


def get_w2v_fields(zeta_res, model, zeta_scope, meth):
    """Cluster positive- and negative-scored words separately.

    Args:
        zeta_res: DataFrame indexed by word; column ``zeta_scope`` holds the
            scores used for the split.
        model: mapping-like embedding model; ``model[word]`` yields a vector.
            Words it does not know are skipped.
        zeta_scope: name of the score column.
        meth: clustering method, one of "MS", "AP" or "Birch".

    Returns:
        DataFrame of cluster centres with a "category" column (0 for the
        positive words, 1 for the negative ones).  The index is reset, so
        the per-category row numbers end up in an "index" column.

    Raises:
        ValueError: if ``meth`` is not one of the supported methods.
    """
    scores = zeta_res[zeta_scope]
    positive = scores[scores > 0]
    negative = scores[scores < 0]

    # Diagnostic only; guard against an empty negative set.
    ratio = len(positive) / len(negative) if len(negative) else float("inf")
    print(ratio)

    cluster_frame1 = _cluster_w2v_words(positive, model, meth, 0)
    cluster_frame2 = _cluster_w2v_words(negative, model, meth, 1)

    cluster_frame = pd.concat([cluster_frame1, cluster_frame2]).reset_index()

    return cluster_frame
Exemple #8
0
 def _run_mean_shift(self, data):
     """Fit Mean Shift on ``data`` and hand back the fitted estimator.

     The bandwidth comes from ``estimate_bandwidth`` (quantile 0.2,
     n_samples 200).  The estimator is created with ``cluster_all=False``
     and ``bin_seeding=True``.
     """
     clusterer = MeanShift(
         bandwidth=estimate_bandwidth(data, quantile=0.2, n_samples=200),
         cluster_all=False,
         bin_seeding=True,
     )
     clusterer.fit_predict(data)
     return clusterer
def runMeanShift(argsdict, data, inlbl, fPath, fName, fileN, i, sampleType):
    """Time a Mean Shift run on ``data`` and pass its labels to runRawAnalysis.

    The bandwidth is estimated from ``data`` with quantile 0.2.  The timed
    span covers estimator construction (including the bandwidth estimate)
    and fitting.  ``sampleType`` is accepted but not used here.

    Returns:
        Whatever ``runRawAnalysis`` returns.
    """
    started = time.time()
    estimator = MeanShift(bandwidth=estimate_bandwidth(data, quantile=0.2))
    estimator.fit_predict(data)
    elapsed = time.time() - started

    results_name = fileN + '.Results'
    signature_csv = fPath + fName + str(i) + '_SIG.csv'
    return runRawAnalysis(argsdict, inlbl, estimator.labels_, results_name,
                          signature_csv, elapsed)
Exemple #10
0
def MShift(X):
    """Fit Mean Shift on ``X`` and derive labels, centres and cluster count.

    The estimator is built with ``bandwidth=None``, ``cluster_all=False``
    and ``bin_seeding=True``.

    NOTE(review): nothing is returned and every derived value stays local,
    so as written the function has no observable result.  The body looks
    truncated; confirm against the original source.
    """
    # The following bandwidth can be automatically detected using
    #bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=samples/2)

    ms = MeanShift(bandwidth=None, cluster_all=False,
                   bin_seeding=True)  #bandwidth=bandwidth, bin_seeding=True)
    ms.fit_predict(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    # Count the distinct label values.
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
def _mean_shift(corpus, labels):
    """Cluster a text corpus with Mean Shift and print its NMI against ``labels``.

    The corpus is turned into TF-IDF features and densified before being
    clustered with a fixed bandwidth of 0.65 and bin seeding.
    """
    tfidf = TfidfVectorizer().fit_transform(corpus)
    clusterer = MeanShift(bandwidth=0.65, bin_seeding=True)
    predicted = clusterer.fit_predict(tfidf.toarray())
    score = normalized_mutual_info_score(predicted, labels)
    print('MeanShift:', score)
def cluster(csv,
            output_path=r"C:\Users\Ronald Scheffler\.spyder-py3\meanshiftresult.csv"):
    """Cluster the rows of a CSV file with Mean Shift and save the result.

    Every column except "botname" is used as a feature and standardised
    with ``scale``.  The cluster label is stored in a new "Cluster" column,
    the rows are sorted by it and written to ``output_path``.  The
    silhouette score, the labelled data and a train/test split are printed.

    Args:
        csv: path (or buffer) accepted by ``pandas.read_csv``.
        output_path: destination of the labelled CSV; defaults to the
            location that was previously hard-coded.
    """
    from sklearn.model_selection import train_test_split

    data = pd.read_csv(csv)
    # Features: everything but the bot name.  ``columns=`` is used because
    # pandas 2 no longer accepts the axis as a positional argument.
    X = scale(np.array(data.drop(columns=['botname'])))

    # Mean Shift chooses the number of clusters itself.
    clustering = MeanShift()
    result = clustering.fit_predict(X)

    data['Cluster'] = result
    data = data.sort_values(['Cluster'])

    data.to_csv(output_path)

    # Evaluation: silhouette score of the clustering.
    print(silhouette_score(X, result))
    print(data)

    # Class prediction for the training set.
    # NOTE(review): the feature matrix built here still contains the
    # "Cluster" column that is also used as the target - confirm intent.
    X = np.array(data.drop(columns=['botname']))
    y = data['Cluster']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    print(X_test)
    print(y)
Exemple #13
0
def mean_shift_clustering(principal_components, principal_df):
    """Cluster the principal components with Mean Shift and plot the clusters.

    A copy of ``principal_df`` receives the labels in a "Segment" column and
    has its columns 0, 1, 2 and 'y' renamed to 'PC1', 'PC2', 'PC3' and
    'Race'.  One scatter series per cluster is drawn from the first two
    components, then ``add_race_labels`` and ``calc_silhouette`` are called.

    Returns:
        The labelled, renamed DataFrame.
    """
    result_df = pd.concat([principal_df], axis=1)

    clusterer = MeanShift()
    assignments = clusterer.fit_predict(principal_components)
    cluster_ids = unique(assignments)
    result_df['Segment'] = clusterer.labels_

    # One scatter series per cluster.
    for cluster_id in cluster_ids:
        members = where(assignments == cluster_id)
        plt.scatter(principal_components[members, 0],
                    principal_components[members, 1],
                    s=75)

    column_names = {0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'}
    result_df.rename(column_names, axis=1, inplace=True)
    print(result_df)

    plt.title("Mean Shift Clustering")
    add_race_labels(result_df)
    calc_silhouette(data=principal_components,
                    prediction=assignments,
                    n_clusters=len(cluster_ids))
    return result_df
Exemple #14
0
def meanshift_cluster(bandwidth, vectors):
    """ Cluster ``vectors`` with sklearn's Mean Shift and report timing.

    A falsy ``bandwidth`` leaves the bandwidth choice to the estimator.

    Returns the fitted estimator, the label of every vector and the
    cluster centers.
    """

    started = time.time()
    if not bandwidth:
        print('no bandwidth given, will estimate best.')

    # bandwidth=None is the estimator's own default, so passing it
    # explicitly is the same as leaving the argument out.
    mscluster = MeanShift(bandwidth=bandwidth if bandwidth else None,
                          seeds=None,
                          bin_seeding=False,
                          min_bin_freq=1,
                          cluster_all=True,
                          n_jobs=-1)
    assigned_clusters = mscluster.fit_predict(vectors)
    center = mscluster.cluster_centers_

    elapsed = time.time() - started
    print('MeanShift Clustering took {:.2f} seconds'.format(elapsed))
    print('Found {} clusters with bandwith = {}'.format(
        len(center), bandwidth))
    return mscluster, assigned_clusters, center
Exemple #15
0
    def run(self, ncpus, steps=None):
        """
        Cluster the coordinates of every feature of the simulation.

        Parameters
        ----------
        ncpus : int
            Number of processors; passed to ``get_coords`` and used as
            ``n_jobs`` of the estimator.
        steps : optional
            Passed through unchanged to ``get_coords``.

        Returns
        -------
        dict
            ``self.cluster_dict``, rebuilt from scratch on every call.
        """
        per_feature_coords = self.get_coords(ncpus, steps)
        clusterer = MeanShift(bandwidth=1, n_jobs=ncpus, cluster_all=True)

        self.cluster_dict = {}
        for feature, coords in per_feature_coords.items():
            # presumably tallies how many points each label received;
            # verify against hl.frequency_dict
            counts = {}
            for label in clusterer.fit_predict(coords):
                counts = hl.frequency_dict(counts, label, 1)

            for label, frequency in counts.items():
                entry = Cluster(label, frequency,
                                clusterer.cluster_centers_[label])
                self.cluster_dict = hl.list_dict(self.cluster_dict, feature,
                                                 entry)

        return self.cluster_dict
Exemple #16
0
 def MeanShiftPercentTotal(self):
     '''
     Type: MeanShift
     Y-axis: % Reactions
     X-axis: # Observations

     Clusters self.percentTotal with a bandwidth of 2 and shows a scatter
     plot of clusters 0 and 1, the cluster centres and one annotation per
     entry of self.labels.  Does nothing unless self.authenticated is
     truthy.
     '''
     if not self.authenticated:
         return

     from sklearn.cluster import MeanShift as MS
     algorithm = MS(bandwidth=2)
     categories = algorithm.fit_predict(self.percentTotal)
     centers = algorithm.cluster_centers_

     for category, color in ((0, "green"), (1, "red")):
         plt.scatter(self.percentTotal[categories == category, 0],
                     self.percentTotal[categories == category, 1],
                     c=color)
     plt.scatter(centers[:, 0], centers[:, 1], c="black", marker="*")

     for i, txt in enumerate(self.labels):
         plt.annotate(
             txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
     plt.ylabel("PERCENT")
     plt.xlabel("TOTAL")
     # zip stops at the shorter sequence, so a run that finds a single
     # cluster no longer fails on the missing second centre.
     captions = ("NO INFLAMMATION", "CAUSES INFLAMMATION")
     for caption, center in zip(captions, centers):
         plt.annotate(caption, center)
     plt.title("MeanShift: # Observations, % Reactions")
     plt.show()
Exemple #17
0
    def __get_stations_clusters_meanshift(self, coords, var):
        """Return the Mean Shift label of every row of ``coords``.

        ``var`` is used as the bandwidth.  The estimator is created with
        ``cluster_all=False``.
        """
        # n_jobs=1: faster than multi-threaded, still prevents parallel
        # execution unfortunately...
        return MeanShift(cluster_all=False, bandwidth=var,
                         n_jobs=1).fit_predict(coords)
def clusterMeanShift(ndf):
    """Cluster a CSV file with Mean Shift in a 2-D PCA space and plot it.

    The file is read with ISO-8859-1 encoding, reduced to two principal
    components and L2-normalised row by row.  The bandwidth is estimated
    (quantile 0.3) on that reduced data, i.e. in the same space that is
    clustered.  Every cluster found is drawn together with the centres.

    Args:
        ndf: path (or buffer) accepted by ``pandas.read_csv``.
    """
    df = pd.read_csv(ndf, encoding="ISO-8859-1")

    reduced_data = PCA(n_components=2).fit_transform(df)
    reduced_data = normalize(reduced_data,
                             norm='l2',
                             axis=1,
                             copy=True,
                             return_norm=False)

    # The bandwidth has to be measured on the data that is clustered; an
    # estimate taken on the raw frame lives on a different scale.
    bandwidth = estimate_bandwidth(reduced_data, quantile=0.3)
    clusters = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms = clusters.fit_predict(reduced_data)

    for cluster_id in sorted(set(ms)):
        plt.scatter(reduced_data[ms == cluster_id, 0],
                    reduced_data[ms == cluster_id, 1],
                    s=50,
                    # first cluster keeps its colour; the others take the
                    # next colours of the default cycle
                    c='lightgreen' if cluster_id == 0 else None,
                    edgecolor='black',
                    marker='o',
                    label='cluster %d' % (cluster_id + 1))
    plt.scatter(clusters.cluster_centers_[:, 0],
                clusters.cluster_centers_[:, 1],
                s=80,
                c='red',
                marker='*',
                label='centroides')

    plt.legend()
    plt.grid()
    plt.show()
Exemple #19
0
def customer_clustering():
    """Cluster the customer data with Mean Shift and print an accuracy figure.

    The samples are treated as four consecutive groups of 200 (everything
    from index 600 on belongs to the last group).  For each group the most
    frequent predicted label counts as correct, and the sum is divided by
    800.

    Returns:
        A tuple ``(data, predict_labels)``.
    """
    data = data_helpers.read_feature_data(file_path='./data/customer_data')
    ms_model = MeanShift(bandwidth=0.18)
    predict_labels = ms_model.fit_predict(data)

    total_guess_num = 800
    group_size = 200
    n_groups = 4

    # One column per predicted label.  Mean Shift may find more than four
    # clusters, so the width follows the labels instead of being fixed.
    n_labels = max(int(max(predict_labels, default=-1)) + 1, n_groups)
    predict_label_count_matrix = [[0] * n_labels for _ in range(n_groups)]
    for id_, label_ in enumerate(predict_labels):
        group = min(id_ // group_size, n_groups - 1)
        predict_label_count_matrix[group][label_] += 1

    correct_guess_num = sum(max(row) for row in predict_label_count_matrix)

    accuracy = float(correct_guess_num) / float(total_guess_num)
    print('accuracy:' + str(accuracy))
    return data, predict_labels
Exemple #20
0
def test():
    """Show scikit-learn's Mean Shift segmentation next to ``mean_shift``.

    The grayscale house image is segmented twice with a bandwidth of 20,
    once with ``sklearn.cluster.MeanShift`` and once with this project's
    ``mean_shift``.  Each result is displayed beside the original.

    NOTE(review): ``as_gray=True`` can yield floats in [0, 1]; a bandwidth
    of 20 only makes sense for 0..255 intensities - confirm the input.
    """
    from skimage import io
    from sklearn.cluster import MeanShift

    def show_segmentation(image, segmentation, title):
        # Original on the left, segmentation on the right.
        f, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
        for ax in (ax1, ax2):
            ax.set_axis_off()
        ax1.imshow(image, cmap='gray')
        ax1.set_title('original')
        ax2.imshow(segmentation, cmap='gray')
        ax2.set_title('segmentation')
        f.suptitle(title, fontsize=16)
        plt.show()

    house = io.imread('lab4/images/house.jpg', as_gray=True)
    n_row, n_col = house.shape
    data = house.reshape(-1, 1)  # transform to feature space (1D for grayscale)
    bandwidth = 20

    # segmentation using the scikit-learn estimator
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    clusters = ms.fit_predict(data)
    show_segmentation(house, clusters.reshape((n_row, n_col)),
                      "scikit-learn library function")

    # segmentation using our algorithm
    clusters = mean_shift(house, bandwidth=bandwidth)
    show_segmentation(house, clusters.reshape((n_row, n_col)),
                      f"our algorithm (bandwidth = {bandwidth})")
Exemple #21
0
def test_meanshift_predict(global_dtype):
    """predict() on the training data must reproduce fit_predict()'s labels."""
    estimator = MeanShift(bandwidth=1.2)
    samples = X.astype(global_dtype, copy=False)
    fitted_labels = estimator.fit_predict(samples)
    assert_array_equal(fitted_labels, estimator.predict(samples))
Exemple #22
0
class MeanShiftClustering(ModelBase):
    """Mean Shift wrapper that remembers the sample closest to each centre.

    The bandwidth is estimated from the data given to the constructor
    (quantile 0.25).  After ``fit`` the instance holds the labels, the
    cluster centres and, per cluster, the index of its nearest sample.
    """

    def __init__(self, X):
        self.bw = self.find_bandwidth(X)
        # Spelled "cluster_lables" before, which never matched the
        # attribute that fit() assigns.
        self.cluster_labels = None
        self.centroid = None
        self.model = MeanShift(bandwidth=self.bw)

    def _reset(self):
        """Clear the bandwidth and the fitted results."""
        self.bw = None
        self.cluster_labels = None
        self.centroid = None

    def find_bandwidth(self, X):
        """Return the bandwidth estimated from ``X`` with quantile 0.25."""
        return estimate_bandwidth(X, quantile=0.25)

    def fit(self, X):
        """Cluster ``X`` and record the sample nearest to every centre.

        Returns:
            self
        """
        self.cluster_labels = self.model.fit_predict(X)
        self.centroid = self.model.cluster_centers_

        # label -> [(distance to the centre, sample index), ...]
        members = defaultdict(list)
        for i, label in enumerate(self.cluster_labels):
            distance = cdist([self.centroid[label]], [X[i]],
                             'euclidean')[0][0]
            members[label].append((distance, i))

        # Clusters appear in order of first occurrence in the labels.
        self.near_metric_idx = [min(pairs)[1] for pairs in members.values()]
        return self

    def get_closest_samples(self, labels):
        """Pick from ``labels`` the entries of the nearest samples."""
        return [labels[idx] for idx in self.near_metric_idx]
def user_rating_clustering():
    """Cluster the user/movie rating features with Mean Shift; print the labels."""
    ratings = data_helpers.read_feature_data(
        file_path='../data/user_movie_rating')
    print(MeanShift().fit_predict(ratings))
Exemple #24
0
 def cluster_list(self, prob_list, bandwidth=-1):
     """Return the mean of the highest-valued Mean Shift cluster.

     The values are clustered in one dimension.  With a single cluster the
     mean of all values is returned; otherwise the largest of the
     per-cluster means.

     Args:
         prob_list: sequence of numbers.
         bandwidth: Mean Shift bandwidth; a value <= 0 leaves the choice to
             the estimator.
     """
     if bandwidth <= 0:
         cluster_model = MeanShift()
     else:
         cluster_model = MeanShift(bandwidth=bandwidth)
     label_list = cluster_model.fit_predict(
         np.array(prob_list).reshape(-1, 1))

     if np.max(label_list) + 1 == 1:
         return np.mean(prob_list)

     groups = {}
     for label, prob in zip(label_list, prob_list):
         groups.setdefault(label, []).append(prob)
     # max() needs neither a sentinel start value nor gap-free label ids.
     return max(np.mean(members) for members in groups.values())
Exemple #25
0
def visual(c, X, y):
    """Cluster ``X`` with Mean Shift, evaluate it and plot truth vs. result.

    Prints the cluster labels, calls ``evaluation_labels`` and
    ``evaluation``, then shows two scatter plots of the first two feature
    columns: one grouped by ``y`` and one grouped by the predicted
    clusters.  ``c`` is accepted but not used here.
    """
    from sklearn.cluster import MeanShift
    cluster_object = MeanShift()
    y_pred = cluster_object.fit_predict(X)
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    for cluster in np.unique(y):
        row_idx = np.where(y == cluster)
        # A label per series gives plt.legend() something to show.
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()

    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
    plt.title('Clusters')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
Exemple #26
0
 def MeanShiftRatio(self):
     '''
     Type: MeanShift
     Y-axis: No Reaction
     X-axis: Reaction

     Clusters self.allCoord with a bandwidth of 2 and shows a scatter plot
     of clusters 0 and 1, the cluster centres and one annotation per entry
     of self.labels.  Does nothing unless self.authenticated is truthy.
     '''
     if not self.authenticated:
         return

     from sklearn.cluster import MeanShift as MS
     algorithm = MS(bandwidth=2)
     categories = algorithm.fit_predict(self.allCoord)
     centers = algorithm.cluster_centers_

     for category, color in ((0, "green"), (1, "red")):
         plt.scatter(self.allCoord[categories == category, 0],
                     self.allCoord[categories == category, 1],
                     c=color)
     plt.scatter(centers[:, 0], centers[:, 1], c="black", marker="*")

     for i, txt in enumerate(self.labels):
         plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
     plt.ylabel("NO REACTION")
     plt.xlabel("REACTION")
     # zip stops at the shorter sequence, so a run that finds a single
     # cluster no longer fails on the missing second centre.
     captions = ("NO INFLAMMATION", "CAUSES INFLAMMATION")
     for caption, center in zip(captions, centers):
         plt.annotate(caption, center)
     plt.title("MeanShift: Reaction, No Reaction")
     plt.show()
def hyper_meanshift(args):
    """Objective for the hyper-parameter search: negated silhouette score.

    Reads the samples from the module-level ``data_file.data``.  ``args``
    supplies 'bandwidth' and 'min_bin_freq' (the latter is cast to int).
    """
    clusterer = MeanShift(bandwidth=args['bandwidth'],
                          min_bin_freq=int(args['min_bin_freq']),
                          n_jobs=-1)
    predicted = clusterer.fit_predict(data_file.data)
    # Negated so that a minimiser maximises the silhouette score.
    return -sil_score(data_file.data, predicted)
def hyper_meanshift(args):
    """Objective for the hyper-parameter search: negated silhouette score.

    Clusters the module-level ``basic_data`` and scores the resulting
    labels against the module-level ``all_data``.  ``args`` supplies
    'bandwidth' and 'min_bin_freq' (the latter is cast to int).
    """
    clusterer = MeanShift(bandwidth=args['bandwidth'],
                          min_bin_freq=int(args['min_bin_freq']))
    predicted = clusterer.fit_predict(basic_data)
    # Negated so that a minimiser maximises the silhouette score.
    return -sil_score(all_data, predicted)
Exemple #29
0
def rgbMeanShiftImageSeg(img, num_clusters, defaultColors, difThres):
    """Segment a three-channel image by Mean Shift on its pixel values.

    The bandwidth is estimated from the pixels (quantile 0.2, n_samples
    500).  ``num_clusters``, ``defaultColors`` and ``difThres`` are accepted
    but not used here.

    Returns:
        A tuple ``(img_labels, centers)``: the label of every pixel in the
        image's two leading dimensions, and the cluster centres.
    """
    rows, cols = img.shape[0], img.shape[1]
    pixels = img.reshape(rows * cols, 3)

    clusterer = MeanShift(
        bandwidth=estimate_bandwidth(pixels, quantile=0.2, n_samples=500),
        bin_seeding=True)
    label_image = clusterer.fit_predict(pixels).reshape(img.shape[:2])
    return label_image, clusterer.cluster_centers_
Exemple #30
0
def meanshift(data, k, right_labels):
    """Cluster ``data`` with Mean Shift and score it against ``right_labels``.

    ``k`` is used as ``min_bin_freq``; the bandwidth is estimated from the
    data.

    Returns:
        A tuple ``(labels, score)`` where ``score`` is whatever
        ``accuracy(labels, right_labels)`` yields (presumably an adjusted
        Rand score - verify against ``accuracy``).
    """
    clusterer = MeanShift(bandwidth=estimate_bandwidth(data),
                          bin_seeding=True,
                          min_bin_freq=k,
                          n_jobs=-1)
    predicted = clusterer.fit_predict(data)
    return predicted, accuracy(predicted, right_labels)
def mean_shift_other(df, target_columns):
    """Add a 'cluster' column with the Mean Shift label of every row.

    Only ``target_columns`` are used as features.  ``df`` is modified in
    place and also returned.
    """
    df['cluster'] = MeanShift().fit_predict(df[target_columns])
    return df
def main():
    """Load image, collect pixels, cluster, create segment images, plot.

    The coffee sample image is resized to 256x256 and converted to HSV.
    Every pixel becomes a 5-D point (three HSV channels plus the scaled row
    and column position), which is clustered with Mean Shift.  The original
    image and up to eight segments, largest first, are plotted.
    """
    # scipy.misc.imresize is gone from current SciPy releases.
    from skimage.transform import resize

    # load image; resize() already returns floats in [0, 1]
    img_rgb = resize(data.coffee(), (256, 256))
    img = color.rgb2hsv(img_rgb)
    height, width, channels = img.shape
    print("Image shape is: ", img.shape)

    # collect pixels as rows of (h, s, v, y, x), positions scaled to [0, 2)
    print("Collecting pixels...")
    ys, xs = np.mgrid[0:height, 0:width]
    pixels = np.column_stack([
        img[..., :3].reshape(-1, 3),
        (ys.ravel() / height) * 2.0,
        (xs.ravel() / width) * 2.0,
    ])
    print("Found %d pixels to cluster" % (len(pixels)))

    # cluster the pixels using mean shift
    print("Clustering...")
    bandwidth = estimate_bandwidth(pixels, quantile=0.05, n_samples=500)
    clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    labels = clusterer.fit_predict(pixels)

    # rank the labels by the number of pixels they cover, largest first
    counts = np.bincount(labels)
    labels_by_size = [label for label in np.argsort(-counts, kind="stable")
                      if counts[label] > 0]
    nb_clusters = len(labels_by_size)
    print("Found %d clusters" % (nb_clusters))
    print(labels.shape)

    # darkened copy of the image per segment, first channel of its pixels
    # set to full intensity
    print("Creating images of segments...")
    label_map = labels.reshape(height, width)
    img_segments = {}
    for label in labels_by_size:
        segment = np.copy(img_rgb) * 0.25
        segment[label_map == label, 0] = 1.0
        img_segments[label] = segment

    print("Plotting...")
    images = [img_rgb]
    titles = ["Image"]
    for label in labels_by_size[:8]:
        images.append(img_segments[label])
        titles.append("Segment %d" % (label))

    plot_images(images, titles)
Exemple #33
0
def evaluate_learners(X):
    '''
    Fit a series of clustering learners on X, one at a time, to compare
    how the different configurations behave.

    Generator yielding, for each learner, a tuple of
        (title, predicted classes)
    '''

    from sklearn.cluster import (AgglomerativeClustering, MeanShift,
                                 MiniBatchKMeans, SpectralClustering)

    def predicted(estimator):
        # Fit the estimator on X and return the label of every sample.
        return estimator.fit_predict(X)

    # bandwidth=None: let the learner use its own heuristic for
    # determining the number of clusters to create
    yield 'Mean Shift clusters', predicted(MeanShift(bandwidth=None))

    yield 'K Means clusters', predicted(MiniBatchKMeans(n_clusters=2))

    yield 'Spectral clusters', predicted(SpectralClustering(n_clusters=2))

    for count in (2, 5):
        yield ('Agglomerative clusters (N=%d)' % count,
               predicted(AgglomerativeClustering(n_clusters=count)))
Exemple #34
0
   def obtainClusters(self, hist):
      """Standardise ``hist`` and return the Mean Shift label of every row.

      The input is converted to a float array and scaled with
      StandardScaler.  The bandwidth is estimated from the scaled data with
      quantile 0.3.
      """

      # print() with a single argument behaves the same on Python 2 and 3.
      print('Obatining clusters using MeanShift from skilean...')

      hist = np.array(hist)
      hist = hist.astype(float)
      scaled_vec = StandardScaler().fit_transform(hist)

      bandwidth = estimate_bandwidth(scaled_vec, quantile=0.3)
      ms = MEANSHIFT(bandwidth=bandwidth, bin_seeding=True)

      clusters = ms.fit_predict(scaled_vec)

      print('Clusters obtained using MeanShift')

      return clusters
def mean_shift(data,metric):
    """Run Mean Shift on ``data`` and report cluster count, score and time.

    The bandwidth is estimated from all samples with quantile 0.2 and
    passed to the estimator.

    Returns:
        A tuple ``('Mean Shift', n_clusters, score, seconds)``.  ``score``
        is the string 'None' when every sample received label 0, otherwise
        the result of ``accuracy.getAccuracy``.
    """
    t0 = time()
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(data))
    # The estimate used to be computed and then dropped.  A non-positive
    # estimate is not usable, so the estimator's default is kept for it.
    model = MeanShift(bandwidth=bandwidth if bandwidth > 0 else None,
                      cluster_all=True)
    labels = model.fit_predict(data)

    # Any non-zero label means more than one cluster was found.
    if np.count_nonzero(labels) != 0:
        score = accuracy.getAccuracy(data, labels, len(data), metric)
    else:
        score = 'None'

    t1 = time()

    n_clusters_ = len(np.unique(labels))

    return ('Mean Shift', n_clusters_, score, t1 - t0)
Exemple #36
0
def compute_clusters():
    '''
    Calculates the centroid centers based on the reports
    on the database.

    Latitude/longitude pairs of all reports are clustered with Mean Shift
    (bandwidth settings.THRESHOLD).  The (label, center) pairs and the
    (label, category) pairs of the reports are passed to _update_clusters.
    '''
    data = Report.objects.all().values('latitude', 'longitude', 'category')
    X = np.array([np.array([d['latitude'], d['longitude']]) for d in data])

    model = MeanShift(bandwidth=settings.THRESHOLD)

    # Getting metrics for each cluster; lists, so that the consumer can
    # iterate them more than once.
    labels = model.fit_predict(X)
    categories = [d['category'] for d in data]
    label_metrics = list(zip(labels, categories))

    # Look every center up by its label instead of pairing centers with a
    # set of labels, whose order and length are not tied to the centers.
    clusters = [(label, model.cluster_centers_[label])
                for label in np.unique(model.labels_)]

    _update_clusters(clusters, label_metrics)
def mean_shift():
    """
    MeanShift discovers blobs in a smooth density of samples. It is a centroid
    algorithm which works by updating candidates for centroids to be the mean
    of the positions within a given region. These candidates are then filtered
    in a post-processing stage to eliminate near-duplicates and form the final
    list of centroids.

    This demo clusters 3000 samples drawn around three centers, prints the
    number of centroids found and the accuracy against the generated
    targets, and shows a scatter plot coloured by cluster.
    """
    from collections import Counter

    # Set a generic data sample.
    centers = [[-1., 0.], [0., 1.], [1., 0.]]
    n_samples = 3000
    std = 0.5
    seed = 0
    data, target = make_blobs(n_samples=n_samples, centers=centers,
                              random_state=seed, cluster_std=std)

    # Set bandwidth for the mean shift classifier.
    width = estimate_bandwidth(data, quantile=0.2,
                               n_samples=int(n_samples / 5))
    # Setup the classifier.
    clf = MeanShift(bandwidth=width, bin_seeding=True)
    ms_y = clf.fit_predict(data)

    # Evaluate accuracy against the true blobs.  Cluster ids are arbitrary,
    # so every cluster is credited with its most frequent true target.
    votes = {}
    for predicted, actual in zip(ms_y, target):
        votes.setdefault(predicted, Counter())[actual] += 1
    correct = sum(max(tally.values()) for tally in votes.values())
    acc = float(correct) / float(n_samples)

    # Print results.
    print('Approximated number of centroids ', len(clf.cluster_centers_))
    print('Accuracy ', acc)

    # Plot clusters.
    plt.figure(figsize=(8, 8))
    plt.scatter(data[:, 0], data[:, 1], c=ms_y, s=30)
    plt.title('Clusters found with the Mean-shift method')
    plt.show()
Exemple #38
0
def predictMeanShift(X, labels):
	"""Cluster X with MeanShift and show the clusters on a 2-D PCA projection.

	X is the sample matrix handed to estimate_bandwidth, MeanShift and PCA.
	labels is accepted for interface compatibility only; it is not read.

	Prints the predicted label of every sample and the number of clusters
	found, then displays a scatter plot coloured by cluster.  Returns nothing.
	"""
	# Estimate the kernel bandwidth from the data instead of hard-coding it.
	bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)

	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	results = ms.fit_predict(X)
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print(list(results))

	n_clusters_ = len(np.unique(results))

	print("number of estimated clusters : %d" % n_clusters_)
	# Create a PCA model.
	pca_2 = PCA(2)
	# Project the samples onto their first two principal components.
	plot_columns = pca_2.fit_transform(X)
	# Make a scatter plot of each sample, shaded according to cluster assignment.
	plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=results)
	# Report the number of clusters actually found rather than a fixed "4".
	plt.title("Mean Shift- %d clusters" % n_clusters_)
	# Show the plot.
	plt.show()
    # NOTE(review): the enclosing function header is not visible here; the
    # comments below describe only what these lines do.
    # Standardise the features into a separate array; featMatrix itself is
    # left unchanged.
    featMatrixSTD=StandardScaler().fit_transform(featMatrix)
    # No-op assignment: the shift to strictly positive values is disabled
    # (see the trailing comment).
    featMatrixSTD=featMatrixSTD#+np.abs(featMatrixSTD.min())+1.e-15
    print(featMatrixSTD.min())
    #featMatrix=RobustScaler(with_centering=False).fit_transform(featMatrix)

    # Despite the "nmf" names this is a 10-component TruncatedSVD of the
    # standardised features.
    # NOTE(review): nmfFeats and dfTest are not used in the lines visible
    # here - confirm they are needed further down.
    nmfTrf=TruncatedSVD(n_components=10)
    nmfFeats=nmfTrf.fit_transform(featMatrixSTD)
    dfTest=paDataFrame(featMatrixSTD[:,:10])

    # Dot product of every row of the *unscaled* featMatrix with every other
    # row; no normalisation is applied here even though the name is "corr".
    corr=np.dot(featMatrix,featMatrix.T)
    print(corr.shape)

    # The bandwidth is estimated from the unscaled featMatrix as well; the
    # model runs with 70% of the estimate while the print shows the full
    # estimate.
    bandwidth = estimate_bandwidth(featMatrix, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth*0.7, bin_seeding=True)
    print('bandwidth',bandwidth)
    labels=ms.fit_predict(featMatrix)


    # db = DBSCAN(eps=0.2, min_samples=10,metric='precomputed')
    # dMat=1.-corr
    # labels=db.fit_predict(dMat)
    print(np.unique(labels))
    # Reorder the rows and then the columns of corr by label so that samples
    # sharing a label sit next to each other.
    sorted_labels=np.argsort(labels)
    print(sorted_labels)
    corrSorted=corr[sorted_labels,:]
    corrSorted=corrSorted[:,sorted_labels]
    print(corr.shape,corrSorted.shape)


    # Sample indices assigned to label 1 and to label 2.
    lab1=np.where(labels==1)[0]
    lab2=np.where(labels==2)[0]
Exemple #40
0
        # NOTE(review): the enclosing function header is not visible here; the
        # comments below describe only what these lines do.
        scenes = get_joly_scenes_sementation(frames, nb_std_above_mean_th=2.)#get_scenes_segmentation(diffs, nb_std_above_mean_th=2.5)
        del frames #drop the frames as soon as possible so they do not hold memory for nothing
        # One hash per scene, built from the slice L[s:e] of each (s, e) pair.
        scenes_hashes = [get_hash_of_hashes(L[s:e]) for s, e in scenes]
        
        #tqdm.write(pformat(Counter(scenes_hashes)))

        distance_matrix = np.zeros([len(scenes_hashes)] * 2)
        #compute the distance between every pair of scene hashes (full N x N loop)
        for i in trange(len(scenes_hashes)):
            for j in range(len(scenes_hashes)):
                distance_matrix[i, j] = hamming(scenes_hashes[i], scenes_hashes[j])
        #flag pairs of scenes whose hash distance is unusually small compared to the others
        # (boolean matrix; presumably 64 is the hash length in bits - TODO confirm)
        similar_scenes_matrix = distance_matrix < 64 - (distance_matrix.mean() + distance_matrix.std() * 3)
        #cluster the rows of the boolean similarity matrix with MeanShift
        cluster_builder = MeanShift(bandwidth=1)
        scenes_clusters = cluster_builder.fit_predict(similar_scenes_matrix)
        #find the clusters holding 'too many' scenes compared to the others
        # (more than mean + 2.5 * std of the cluster sizes)
        clusters_counter = Counter(scenes_clusters)
        clusters_freq = np.array(list(clusters_counter.values()))
        clusters_freq_th = clusters_freq.mean() + clusters_freq.std() * 2.5
        frequent_clusters_id = list(filter(lambda k: clusters_counter[k] > clusters_freq_th, clusters_counter))
        #find the hashes belonging to these clusters
        scenes_hashes_idx = np.array(list(map(lambda v: v in frequent_clusters_id, scenes_clusters)))
        generics_scenes_hashes = np.array(scenes_hashes, dtype=np.uint64)[scenes_hashes_idx]
        #get the indexes of every scene whose hash is one of the selected hashes
        generics_scenes_idx = []
        for i, h in enumerate(scenes_hashes):
            if h in generics_scenes_hashes:
                generics_scenes_idx.append(i)
        #get the boundaries of the generic scenes
        generics_scenes = list(map(lambda i: scenes[i], generics_scenes_idx))
Exemple #41
0
def mean_shift_clustering(features, labels):
  """Cluster ``features`` with a default MeanShift and report the result.

  Prints the value of ``get_impurity(predictions, labels)`` and then calls
  ``plot_clustering(features, labels, predictions)``.  Returns nothing.
  """
  model = MeanShift()
  predictions = model.fit_predict(features)

  # Single-argument print(...) behaves the same on Python 2 and Python 3.
  print(get_impurity(predictions, labels))
  plot_clustering(features, labels, predictions)
Exemple #42
0
print(result)

# Quantiles to try when estimating the MeanShift bandwidth.
index = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

for quantile in index:
    print("quantile : %f" % quantile)
    # Use the quantile of this iteration; a fixed index[1] would repeat the
    # 0.2 estimate on every pass.
    bandwidth = estimate_bandwidth(data, quantile=quantile, n_samples=len(data))

    print("bandwidth : %f" % bandwidth)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # Fit once; fit() returns the estimator, so this also prints its settings.
    print(ms.fit(data))

    # Labels of the fit above; fit_predict would only refit the same model.
    labels = ms.labels_

    print("labels : ", labels)

    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
map_sizes

# <codecell>

from sklearn.cluster import MeanShift


cluster_data = DataFrame(columns = ['Patient ID', 'Visit Number', 'TFName', 'Start', 'Cluster'])

# Cluster the 'Start' values of each TF separately (MeanShift, bandwidth 10)
# and tag every row with its TF name and cluster id.  The per-TF frames are
# collected and concatenated once after the loop instead of growing
# cluster_data on every iteration.
frames = [cluster_data]
for tf in tf_counts.index:

    # .loc replaces the removed .ix indexer for this label lookup.
    data = tf_grouped.loc[tf].reset_index()
    data['TFName'] = tf
    clust = MeanShift(bandwidth = 10)
    data['Cluster'] = clust.fit_predict(data[['Start']].values)
    frames.append(data)

cluster_data = concat(frames, axis = 0, ignore_index = True)



# <codecell>

# Rows: (patient, visit); columns: (TF, cluster).  The two groupings are
# passed positionally because the rows=/cols= keywords no longer exist in
# current pandas (they are now index=/columns=).
res = crosstab([cluster_data['Patient ID'], cluster_data['Visit Number']], [cluster_data['TFName'], cluster_data['Cluster']])

# <codecell>

from sklearn.cluster import k_means, mean_shift

# Cluster the rows of the count table with the functional MeanShift API.
centroids, labels = mean_shift(res.values)
def test_meanshift_predict():
    """Check that ``predict`` on the training data matches ``fit_predict``."""
    estimator = MeanShift(bandwidth=1.2)
    fitted_labels = estimator.fit_predict(X)
    # Predicting the same samples again must reproduce the fitted labels.
    predicted_labels = estimator.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
Exemple #45
0
# Output file for the report; it is not closed within the lines visible here.
Html_file = open("clustering_files/meanshift.html", "w")

# Keep only 10000 rows, because MeanShift is expensive on large inputs.
# The mask holds 10000 True values followed by False values and is then
# shuffled (presumably sklearn.utils.shuffle, which returns a shuffled copy
# - TODO confirm).
# NOTE(review): if X has fewer than 10000 rows the mask is longer than X
# and the boolean indexing below will not match - confirm X is large enough.
ind = np.array(10000 * [1] + (X.shape[0] - 10000) * [0]).astype(bool)
ind = shuffle(ind)
# Unscaled copy of the selected rows, with the column names of `data`.
data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

# Standardise all rows first, then keep the same selected rows.
scaler = StandardScaler()
X = scaler.fit_transform(X)

X = X[ind]


# Named "km" but this is a MeanShift model.
km = MeanShift(cluster_all=False)
preds = km.fit_predict(X)
# Move label -1 to a new id after the largest label so that np.bincount
# below accepts the array (presumably -1 marks the samples that
# cluster_all=False leaves unassigned - TODO confirm).
preds[preds == -1] = max(preds) + 1

print "components", set(preds)
print np.bincount(preds)

data_thr10['preds'] = pd.Series(preds).astype("category")
# Ten colours repeated twice, i.e. twenty entries.
color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"] * 2

# The title is the per-label sample count.
title = str(np.bincount(preds))
# Settings not used within the lines visible here (presumably for the plot
# written to Html_file further down).
TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
plot_width = 900
plot_height = 300
x_name = 'rateCA'
y_name = 'rate'