Example #1
from time import time
from sklearn.cluster import MeanShift

def mean_shift(model_data, prediction_data=None):
    t0 = time()
    ms = MeanShift().fit(model_data)
    if prediction_data is None:
        labels = ms.predict(model_data)
    else:
        labels = ms.predict(prediction_data)
    means = ms.cluster_centers_
    print("Number of Means:", means.shape[0])
    print("Mean Shift Time: %0.3f" % (time() - t0))
    return labels, means
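A minimal usage sketch for the example above, assuming the imports shown and synthetic data from scikit-learn's make_blobs:

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels, means = mean_shift(X)  # fit on X and label the same points
print(labels[:10], means.shape)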
Example #2
from time import time
from sklearn.cluster import MeanShift

def mean_shift(model_data, prediction_data=None):
    t0 = time()
    ms = MeanShift().fit(model_data)
    if prediction_data is None:
        labels = ms.predict(model_data)
    else:
        labels = ms.predict(prediction_data)
    means = ms.cluster_centers_
    print("Number of Means:", means.shape[0])
    print("Mean Shift Time: %0.3f" % (time() - t0))
    return labels, means
Example #3
def meanshift(df):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.cluster import MeanShift
    from sklearn.decomposition import PCA
    meanshift = MeanShift()
    meanshift.fit(df)
    labels = meanshift.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    # Predict the cluster for all the samples
    P = meanshift.predict(df)
    # Project to 2-D with PCA and plot, coloured by predicted cluster
    pca = PCA(n_components=2, random_state=40)
    reduced_features = pca.fit_transform(df)
    plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=P)
    plt.show()
Example #4
class TMeanshiftClus(Discretize):
    def __init__(self, bandwidth):
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    def fit_transform(self, created_at):
        indi = pd.DatetimeIndex(created_at)
        lts = indi.values.astype(np.int64)

        dates = [disc_time_to_sec_day(ts) for ts in lts]

        dates = np.array(dates)
        self.ms.fit(dates)

        return ["temp_" + str(centroid) for centroid in list(self.ms.labels_)]

    def transform(self, created_at):
        indi = pd.DatetimeIndex(created_at)
        lts = indi.values.astype(np.int64)

        dates = [disc_time_to_sec_day(ts) for ts in lts]

        return [
            "temp_" + str(centroid)
            for centroid in list(self.ms.predict(dates))
        ]
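disc_time_to_sec_day is not defined in this snippet; a plausible sketch (an assumption, not the original helper) converts a nanosecond epoch timestamp to its second-of-day, wrapped in a list so that np.array(dates) becomes the 2-D array MeanShift expects:

def disc_time_to_sec_day(ts_ns):
    # hypothetical helper: nanoseconds since epoch -> second within the day,
    # returned as a 1-element list so the stacked array is 2-D
    return [(ts_ns // 10**9) % 86400]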
Example #5
def _select(X, bandwidth=None, min_bin_freq=1):
    min_ = min(x[0] for x in X)
    max_ = max(x[0] for x in X)
    if min_ == max_:
        return [min_, min_ + 1]
    if bandwidth is None:
        bandwidth = estimate_bandwidth(X, quantile=0.1)
    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=True,
                   min_bin_freq=min_bin_freq)
    try:
        ms.fit(X)
        # Keep the smallest value seen in each cluster as a split point
        split_points = {}
        for x in X:
            label = ms.predict([x])[0]
            val = x[0]
            if label not in split_points:
                split_points[label] = val
            else:
                split_points[label] = min(val, split_points[label])
        sp = list(split_points.values())
    except Exception:
        sp = [min_]
    sp += [max_ + 1]
    return sorted(sp)
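A hedged usage sketch: X is a sequence of 1-tuples (or 1-element rows), and the result is a sorted list of bin edges, the smallest value of each detected cluster plus max_ + 1:

X = [(v,) for v in [1.0, 1.1, 1.2, 5.0, 5.2, 9.7, 9.8]]
print(_select(X))  # e.g. [1.0, 5.0, 9.7, 10.8] (exact edges depend on the estimated bandwidth)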
Example #6
def mean_shift(x_train, y_train, x_test, y_test, range_bandwidth):
    for n_bandwidth in range_bandwidth:
        ms = MeanShift(bandwidth=n_bandwidth)
        ms.fit(x_train)  # MeanShift is unsupervised; y_train only feeds the f1 score below
        y_pred = ms.predict(x_test)
        print('mean shift n_bandwidth = {}, f1_score = {}'.format(
            n_bandwidth, str(f1_score(y_test, y_pred, average='micro'))))
Example #7
 def __call__(self, data, n):
     bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
     ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
     ms.fit(data)
     y_pred = ms.predict(data)
     clusters = {i: np.where(y_pred == i)[0] for i in np.unique(y_pred)}
     return clusters
Example #8
def mean_shift(im):
    tmp = im.shape
    im = im.reshape((-1, 3))
    bandwidth = estimate_bandwidth(im, quantile=0.1, n_samples=1500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(im)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print "number of estimated cluster :%d" % n_clusters_

    imNew = np.zeros(im.shape)
    l = ms.predict(im)

    area = np.zeros((n_clusters_, 1))
    cnt = 0
    for i in range(len(l)):
        imNew[i] = cluster_centers[l[i]]
        area[l[i]] += 1

    imNew = imNew.reshape(tmp)
    area = area * 1.0 / area.sum() * 100
    #node_labels = zip(cluster_centers , area)
    scipy.misc.imsave('outfile.jpg', imNew)  # removed in SciPy >= 1.2; imageio.imwrite is the usual replacement

    labels = labels.reshape((-1, tmp[1]))
    return labels, area, cluster_centers, ms, imNew
Example #9
def cluster(features):
    model = MeanShift().fit(features)
    meanshift_labels = model.predict(features)
    print(meanshift_labels)
    np.save('meanshift_labels.npy', meanshift_labels)
    with open('meanshift.pkl', 'wb') as f:
        pickle.dump(model, f)
Example #10
def cropImage(image):
    croppedImages = []
    img = image.copy()
    height, width = img.shape[:2]
    sf = float(height) / float(11675)

    histogram = pd.Series([
        height - cv2.countNonZero(img[:, i]) for i in list(range(width))
    ]).rolling(5).mean()
    ax = histogram.plot()
    #ax.set_ylim([0,200])
    plt.savefig('histogram.pdf', bbox_inches='tight')
    dip_df = histogram[histogram < sf * 150].to_frame().rename(
        columns={0: 'count'})
    dip_df.loc[dip_df['count'] < sf * 25, 'count'] = 0
    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
    ms = MeanShift()
    ms.fit(indices)
    dip_group = ms.predict(indices)
    dip_df = dip_df.assign(group=dip_group)
    cut_points = [0] + sorted(
        dip_df.groupby('group').apply(
            lambda x: max(x[x['count'] == 0].index)).tolist())[1:-1] + [width]
    for i in list(range(len(cut_points) - 1)):
        croppedImages.append(img[0:height, cut_points[i]:cut_points[i + 1]])
    return croppedImages
Example #11
def extract_texts(
    blocks_dict: Dict[int,
                      List[TextBlockInfo]]) -> Tuple[List[str], List[int]]:
    """
    Reconstructs texts from each group of text blocks; computes lines in each group
    :param blocks_dict: result returned by calling sift_ocr
    :return: tuple of (texts, lines) for each group
    """
    texts = []
    # How many lines are in each group
    lines = []
    # Start from group 1, since group 0 is every group combined
    for grp in range(1, len(blocks_dict)):
        blocks = blocks_dict[grp]
        # Mean-shift cluster text blocks to normalize rows
        model = MeanShift(bandwidth=5)
        model.fit(np.array([x.bounds.y for x in blocks]).reshape(-1, 1))
        centers = model.cluster_centers_
        lines.append(len(centers))
        # Sort blocks by row (the clustered y centre), then by x, to reconstruct texts in reading order
        blocks.sort(key=lambda x:
                    (centers[model.predict([[x.bounds.y]])[0]][0], x.bounds.x))
        words = [x.text for x in blocks]
        separator = ' '
        sent = separator.join(words).lower()
        texts.append(sent)
    return texts, lines
Example #12
def clusterMeanshift(placeDb):

    # Because scikit-learn needs plain array data as mean-shift input, we
    # (1) extract the dicts to a list, (2) run mean shift, (3) use predict to map results back onto the dicts.
    # (1)
    coordList = []
    for node in placeDb:
        coordList.append((node['x'], node['y']))

    # (2)
    X = coordList
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    if bandwidth <= 0:
        bandwidth = 100
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    #print("number of estimated clusters : %d" % n_clusters_)
    #print(cluster_centers)

    # (3)
    #print(ms.predict(X))
    predictions = ms.predict(X)  # predict once instead of re-running it on every loop iteration
    for i in range(len(placeDb)):
        placeDb[i]['cluster_id'] = predictions[i]

    placeDb.sort(key=lambda s: s['y'])  # not necessary; keeps points within each cluster sorted by y
    placeDb.sort(key=lambda s: s['cluster_id'])
Example #13
def test_meanshift_predict(global_dtype):
    # Test MeanShift.predict
    ms = MeanShift(bandwidth=1.2)
    X_with_global_dtype = X.astype(global_dtype, copy=False)
    labels = ms.fit_predict(X_with_global_dtype)
    labels2 = ms.predict(X_with_global_dtype)
    assert_array_equal(labels, labels2)
Example #14
def getMS_repx_data(data_x, data_y):
    pca_x = PCA_mars.getPcaComponent(data_x, n_components=0.9)

    old_x_train, old_x_test, old_y_train, old_y_test = train_test_split(
        data_x, data_y, test_size=0.3, random_state=0, shuffle=False)
    # #############################################################################
    # Compute clustering with MeanShift
    x_train, x_test, y_train, y_test = train_test_split(pca_x,
                                                        data_y,
                                                        test_size=0.3,
                                                        random_state=0,
                                                        shuffle=False)
    # The bandwidth can be detected automatically with estimate_bandwidth
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(x_train)
    predict = ms.predict(x_test)
    labels = ms.labels_
    global error_number
    error_number = labels[labels != 0].size + predict[predict != 0].size
    # Replace the X values where unusual clusters appear in the training set
    deal_train_x = replace_Cluster(old_x_train, labels)

    deal_test_x = replace_Cluster(old_x_test, predict)

    return (deal_train_x, deal_test_x, old_y_train, old_y_test)
Example #15
class MSSelector:
    def __init__(self, traces, bandwidth=None, min_bin_freq=None):
        min_bin_freq = min_bin_freq or traces.count() * 0.01
        self.traces = traces
        self.ms = MeanShift(bandwidth=bandwidth,
                            bin_seeding=True,
                            min_bin_freq=min_bin_freq)

    def select(self, col):
        it = map(itemgetter(col), self.traces.select(col).collect())
        X = np.fromiter(it, float).reshape(-1, 1)

        self.ms.fit(X)
        split_points = {}
        for x in X:
            label = self.ms.predict([x])[0]
            val = x[0]
            if label not in split_points:
                split_points[label] = val
            else:
                split_points[label] = min(val, split_points[label])
        max_ = self.traces.select(col).rdd.max()[0]
        sp = list(split_points.values())
        sp += [max_ + 1]
        return sorted(sp)

    def select_foreach(self, cols):
        return {c: self.select(c) for c in cols}
Example #16
def cluster(X, number_cluster, bandwidth=None, alg="kmeans"):
    X = X.astype(np.float32)
    if alg == "kmeans":
        y_pred = KMeans(n_clusters=number_cluster,
                        random_state=random_state).fit_predict(X)

    elif alg == "spectral":
        y_pred = SpectralClustering(n_clusters=number_cluster,
                                    random_state=random_state,
                                    n_jobs=10).fit_predict(X)

    elif alg == "meanshift":
        # Note: the bandwidth estimate is somewhat dependent on the number of
        # neighbors used in the dynamic graph network.
        if not bandwidth:
            bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
        seeds = X[np.random.choice(np.arange(X.shape[0]), 5000)]
        # y_pred = MeanShift(bandwidth=bandwidth).fit_predict(X)
        clustering = MeanShift(bandwidth=bandwidth, seeds=seeds,
                               n_jobs=32).fit(X)
        y_pred = clustering.predict(X)

    if alg == "meanshift":
        return y_pred, clustering.cluster_centers_, bandwidth
    else:
        return y_pred
Example #17
def get_cluster_assignments(data):
    meanshift = MeanShift(bin_seeding=True).fit(data)
    labels = meanshift.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    P = meanshift.predict(data)
    return P
Example #18
def assign_variants_to_clonal_cluster(vafs, ids):
    
    # Mark which clusters are clonal: walk the clusters in decreasing mean VAF
    # and return them once at least minClonalMuts mutations have accumulated.
    def assign_clonal_subclonal_clusters(df, minClonalMuts=10):
        l = []
        for cluster in set(df['cluster']):
            clusterDf = df[df['cluster'] == cluster]
            l.append((np.nanmean(clusterDf['vaf']), clusterDf.shape[0], cluster))
        runningMutSum = 0
        clonalClusters = []
        for meanVaf, nMut, cluster in sorted(l, reverse=True):
            clonalClusters.append(cluster)
            runningMutSum += nMut
            if runningMutSum >= minClonalMuts:
                return clonalClusters
    
    a = np.array(vafs).reshape(-1, 1)
    clustering = MeanShift().fit(a)
    prediction = clustering.predict(a)
    
    #We make a dataframe 
    listOfDicts = []
    la = list(a)
    lp = list(prediction)
    for i in range(len(la)):
        listOfDicts.append({
            'vaf': la[i], 'cluster': lp[i], 'varUuid': ids[i]
        })
    df = pd.DataFrame(listOfDicts)
    
    minCMut = max(.1 * df.shape[0], 10)  # at least 10% of mutations (and never fewer than 10) are called clonal
    clonalClusters = assign_clonal_subclonal_clusters(df, minClonalMuts = minCMut)
    df['clonal'] = df['cluster'].apply(lambda x: True if x in clonalClusters else False)
    return dict(zip(df['varUuid'], df['clonal']))
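A hedged usage sketch with illustrative values; note that the inner assign_clonal_subclonal_clusters falls through and returns None when fewer than minClonalMuts mutations exist, so the input needs enough variants:

vafs = [0.44, 0.45, 0.46, 0.44, 0.45, 0.46, 0.44, 0.45, 0.46, 0.45, 0.05, 0.06]
ids = ['v%d' % i for i in range(len(vafs))]
clonal = assign_variants_to_clonal_cluster(vafs, ids)
# e.g. {'v0': True, ..., 'v10': False, 'v11': False} -- the high-VAF cluster is clonal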
Example #19
def meanshift(data, bandwidth, min_bin_freq):
    # metric_list = ['euclidean', 'manhattan', 'chebyshev']
    db = MeanShift(bandwidth=bandwidth, min_bin_freq=min_bin_freq, n_jobs=-1)
    db.fit(data)
    pred = db.predict(data)
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
Example #20
def ClusterDetection(df_preprocessed):
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(df_preprocessed)

    MS = MeanShift()
    MS.fit(reduced_data)

    labels = MS.predict(reduced_data)
    return labels
Example #21
def cropImage(image, file, do_plots):
    croppedImages = []
    img = image.copy()
    height, width = img.shape[:2]
    sf = float(height) / 11675.0
    sfw = float(width) / 7820.0

    # list of rolling means of black pixels
    histogram = pd.Series([
        height - cv2.countNonZero(img[:, i]) for i in list(range(width))
    ]).rolling(5, center=True).mean()

    # prints out plots of the pixel count histogram and a smoothed version of the histogram
    if do_plots:
        fig = plt.figure()
        ax = histogram.plot()
        ax.set_ylim([0, 200])
        fig.savefig(file.partition('.png')[0] + '.histogram.pdf',
                    bbox_inches='tight')
        plt.close(fig)
        fig = plt.figure()
        ax = histogram.rolling(50, center=True).mean().rolling(
            10, center=True).mean().plot()
        ax.set_ylim([0, 200])
        fig.savefig(file.partition('.png')[0] + '.histogram.smooth.pdf',
                    bbox_inches='tight')
        plt.close(fig)

    # takes all instances where black pixel count < 150
    dip_df = histogram[histogram < sf * 150].to_frame().rename(
        columns={0: 'count'})

    # sets all instances of just 50 (factored to scale) to 0.
    dip_df.loc[dip_df['count'] < sf * 50, 'count'] = 0
    histogram.iloc[0] = 0
    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)

    # predicts the best place to cut the columns
    ms = MeanShift()
    ms.fit(indices)
    dip_group = ms.predict(indices)
    dip_df = dip_df.assign(group=dip_group)

    # picks the rightmost place to cut the columns. might not work if image is tilted.
    try:
        cut_points = [0] + sorted(
            dip_df.groupby('group').apply(lambda x: max(x[x[
                'count'] == 0].index - int(sfw * 35.0))).tolist())[1:-1] + [
                    width
                ]
    except Exception:
        cut_points = [0]

    # returns points to cut.
    for i in list(range(len(cut_points) - 1)):
        croppedImages.append(img[0:height, cut_points[i]:cut_points[i + 1]])
    return croppedImages
Example #22
class mean_shift_algo_wrapper:
    def __init__(self):
        self.wrapped = MeanShift()

    def fit(self, data):
        return self.wrapped.fit(data)

    def predict(self, data):
        return self.wrapped.predict(data)
Example #23
def meanshiftt(data):
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=10)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    idx = ms.predict(data)
    ctrs = ms.cluster_centers_
    return idx, ctrs
Example #24
    def _test_mean_shift(self, bandwidth=None, backend="torch", extra_config={}):
        for cluster_all in [True, False]:
            model = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
            np.random.seed(0)
            X = np.random.rand(100, 200)
            X = np.array(X, dtype=np.float32)

            model.fit(X)
            torch_model = hummingbird.ml.convert(model, backend, X, extra_config=extra_config)
            self.assertTrue(torch_model is not None)
            np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)
Example #25
def clustering_mean_shift(data_res, b):
    """
    Executes the mean shift model from sklearn
    """
    ms = MeanShift(bandwidth=b)
    ms.fit(data_res)

    predictions = ms.predict(data_res)
    cluster_centers = ms.cluster_centers_

    return predictions, cluster_centers
Example #26
 def get_final_ans(self, X_test_proba, h = 0.3):
     ans = np.zeros(X_test_proba.shape[0])
     for i, pred in enumerate(X_test_proba.values):
         ms = MeanShift(bandwidth=h)
         sam = pred.reshape(-1, 1)
         ms.fit(sam)
         a = ms.predict(sam)
         unique, counts = np.unique(a, return_counts=True)
         cluster = unique[np.where(counts == counts.max())[0][0]]
         ans[i] = pred[a == cluster].mean()
     return ans
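The method above treats each row of X_test_proba as one sample's predictions from several models, keeps the densest mean-shift cluster of those predictions and averages it; a standalone sketch of that core step (values are illustrative):

import numpy as np
from sklearn.cluster import MeanShift

preds = np.array([0.71, 0.69, 0.70, 0.15])  # four models' probabilities for one sample
labels = MeanShift(bandwidth=0.3).fit_predict(preds.reshape(-1, 1))
unique, counts = np.unique(labels, return_counts=True)
densest = unique[counts.argmax()]
print(preds[labels == densest].mean())  # ~0.70; the outlier 0.15 is discarded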
Example #27
def evaluate_learners(trainData, testData):
    '''
    Run multiple times with different learners to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, predicted classes)
    for each learner.
    '''

    from sklearn.cluster import (MeanShift, MiniBatchKMeans,
                                 SpectralClustering, AgglomerativeClustering)

    learner = MeanShift(
        # Let the learner use its own heuristic for determining the
        # number of clusters to create
        bandwidth=None)
    y = learner.fit_predict(trainData)
    yield 'Mean Shift clusters train', y, 0
    y = learner.predict(testData)
    yield 'Mean Shift clusters test', y, 1

    learner = MiniBatchKMeans(n_clusters=2)
    y = learner.fit_predict(trainData)
    yield 'K Means clusters train', y, 0
    y = learner.predict(testData)
    yield 'K Means clusters test', y, 1

    learner = SpectralClustering(n_clusters=2)
    y = learner.fit_predict(trainData)
    yield 'Spectral clusters train', y, 0

    learner = AgglomerativeClustering(n_clusters=2)
    y = learner.fit_predict(trainData)
    yield 'Agglo clusters (N=2) train', y, 0

    learner = AgglomerativeClustering(n_clusters=5)
    y = learner.fit_predict(trainData)
    yield 'Agglo clusters (N=5) train', y, 0
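Since evaluate_learners is a generator of (title, predicted labels, train/test flag) triples, a caller drains it; a hedged sketch, assuming trainData and testData are already-loaded feature arrays:

import numpy as np

for title, y_pred, is_test in evaluate_learners(trainData, testData):
    print(title, np.bincount(y_pred))  # cluster sizes per learner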
Example #28
 def inner_band_ratio(self):
     '''inner_band_ratio returns the energy band given data'''
     from sklearn.cluster import MeanShift, estimate_bandwidth
     N, a = 30, np.zeros(self.data.shape)
     for i in range(len(self.data)):
         a[i] += 1
     energy_band = [a[i % 5] * i for i in range(N)]
     energy_band = np.asarray(energy_band).reshape((1, -1))
     bandwidth = estimate_bandwidth(energy_band, quantile=0.1)
     ms = MeanShift(bandwidth=bandwidth + 0.1)
     ms.fit(energy_band)
     ys = ms.predict(energy_band + 0.2)
     return np.median(ys + 0.3)
Example #29
class MeanshiftClus(Clus):
    def __init__(self, pd, bandwidth, kernel_bandwidth):
        super(MeanshiftClus, self).__init__(pd)
        self.kernel_bandwidth = kernel_bandwidth
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    def fit(self, X):
        X = np.array(X)
        self.ms.fit(X)
        self.centroids = self.ms.cluster_centers_
        self.nbrs.fit(self.centroids)  # self.nbrs (a neighbours index) is assumed to come from the Clus base class

    def predict(self, x):
        return self.ms.predict([x])[0]
Example #30
def clustering(emb):
    temp = scaler.fit_transform(emb)
    Y = TSNE(n_components=2).fit_transform(temp)
    cluster_ms = MeanShift(bandwidth=3, max_iter=200,
                           cluster_all=False).fit(Y)
    y_ms = cluster_ms.predict(Y)

    plt.figure()
    plt.scatter(Y[:, 0], Y[:, 1], c=y_ms, s=50, cmap='viridis')
    #centers = kmeans.cluster_centers_
    #plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=1)
    plt.show()

    return y_ms
Example #31
def detect_text_meanShift(file_names, image_path):
    for name in tqdm(file_names):
        imageNameFile = image_path + "/" + name
        image = cv.imread(imageNameFile)
        image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        imageArray = np.reshape(image, (-1, 1))
        clustering = MeanShift(bandwidth=3).fit(imageArray)

        print(clustering.labels_)

        imageLabels = clustering.predict(imageArray)

        # thr2 = cv.resize(thr2,None, fx=0.5, fy=0.5)
        cv.imshow('blackthr', image)
        cv.waitKey()
Example #32
class TMeanshiftClus(Discretize):
    def __init__(self, bandwidth):
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    def fit_transform(self, X):
        X = np.array(X)
        self.ms.fit(X)
        print(len(list(self.ms.labels_)))
        return ["coord_" + str(centroid) for centroid in list(self.ms.labels_)]

    def transform(self, x):
        return [
            "coord_" + str(centroid) for centroid in list(self.ms.predict(x))
        ]
Example #33
def executaMeanShift2(cluster_atual, similaridade, kmer):
    # print("#### MeanShift Rodando...")
    x = []
    # print ("Cluster atual contém: ", len(cluster_atual.cluster))
    for k in cluster_atual.cluster:
        if (k != cluster_atual.centroid):
            aux = np.array(k.histo.T[1:2][0])
            for i in range(len(aux)):
                if ([i, aux[i]] not in x):
                    x.append([i, aux[i]])

    aux = cluster_atual.centroid.histo.T[1:2][0]
    p = []
    for i in range(len(aux)):
        p.append([i, aux[i]])

    ms = MeanShift(bandwidth=2, bin_seeding=True)
    ms.fit(x)
    predit = np.array([ms.predict(p)])

    centroid_novo = cluster_atual.centroid
    x = cluster_atual.centroid.histo.T[1:2][0]
    p = predit[0]
    aux = []
    for i in range(len(x)):
        if (x[i] == 0):
            aux.append(0)
        else:
            aux.append(p[i])

    maior = return_intersection(
        np.array(aux)[0], cluster_atual.centroid.histo.T[1:2][0])
    for k in cluster_atual.cluster:
        x = k.histo.T[1:2][0]

        aux = []
        for i in range(len(x)):
            if (x[i] == 0):
                aux.append(0)
            else:
                aux.append(p[i])
        x = k.histo.T[1:2]

        r = return_intersection(np.array(aux)[0], x[0])
        if (r > maior):
            centroid_novo = k  # update the centroid
            maior = r
    return centroid_novo
Example #34
def pipeline(chunks, directory, chunks_file_name, chunks_centers_file_name, n_clus, bw):
    """
    Main pipeline for the first phase of data mining.
    Chunks clustering.
    """
    # calculate the proportion of events
    chunks = calc_proportions(chunks)

    print('Clustering first model...')
    first_model = KMeans(n_clusters=n_clus, n_jobs=8)
    first_model.fit(chunks.iloc[:, 15:25])
    centers = first_model.cluster_centers_

    print('Clustering second model...')
    second_model = MeanShift(bandwidth=bw)
    second_model.fit(centers)
    print("Final number of clusters of chunks with MeanShift: " + str(len(second_model.cluster_centers_)))

    chunks['label'] = second_model.predict(chunks.iloc[:, 15:25])

    centers = DataFrame(second_model.cluster_centers_, columns=TIME_SERIES_NAMES)
    centers.to_csv(directory + chunks_centers_file_name, index=False)

    chunks.to_csv(directory + chunks_file_name, index=False)
Example #35
def test_meanshift_predict():
    """Test MeanShift.predict"""
    ms = MeanShift(bandwidth=1.2)
    labels = ms.fit_predict(X)
    labels2 = ms.predict(X)
    assert_array_equal(labels, labels2)

Example #36
for classification in clf.classifications:
	color = colors[classification]
	for featureset in clf.classifications[classification]:
		plt.scatter(featureset[0], featureset[1], marker='x', color=color, s=150, linewidths=5)


unknowns = np.array([[1,3],
					 [8,9],
					 [0,3],
					 [5,4],
					 [6,4]])

for unknown in unknowns:
	classification = clf.predict(unknown)
	plt.scatter(unknown[0], unknown[1], marker='*', color=colors[classification])

plt.show()
Example #37
kmeans2 = KMeans(n_clusters=3, init=km_clcentr[:])
kmeans2.fit(seed_data)
for i in range(datacount):
	kmeans2.labels_[i] += 1

print(kmeans2.labels_[:])
# meanshift clustering
bw = estimate_bandwidth(seed_data, quantile=0.2)
#print("MeanShift bandwidth:", bw)
ms = MeanShift(bandwidth=bw, bin_seeding=True)
ms.fit(seed_data)
#print(ms.labels_[:])

#print(seed_res)
pred = ms.predict(seed_data)
for i in range(datacount):
	if pred[i] == 0:
		pred[i] = 3

print(ms.labels_[:])

print "seedresult-Kmeans accuracy:", accuracy_score(seed_res, kmeans2.labels_)
print "seedresult-Meanshift accuracy:", accuracy_score(seed_res, pred)
print "Kmeans-Meanshift accuracy:", accuracy_score(kmeans2.labels_, pred)

#compdict = []
#for i in range(datacount):
#	compdict.append([seed_res[i], pred[i]])