def template_clustering_random_points_performance(cluster_length,
                                                  amount_clusters, ccore_flag):
    sample = [[random.random(), random.random()]
              for _ in range(cluster_length)]
    for index in range(1, amount_clusters):
        default_offset = 5
        sample += [[
            random.random() + default_offset * index,
            random.random() + default_offset * index
        ] for _ in range(cluster_length)]

    initial_center = [[random.random(), random.random()],
                      [random.random(), random.random()]]

    ticks_array = []
    amount_measures = 5

    for _ in range(amount_measures):
        xmeans_instance = xmeans(sample, initial_center, 20, 0.25,
                                 splitting_type.BAYESIAN_INFORMATION_CRITERION,
                                 ccore_flag)
        (ticks, _) = timedcall(xmeans_instance.process)

        ticks_array.append(ticks)

    print("Random sample (size: " + str(len(sample)) + "), average execution time:",
          sum(ticks_array) / amount_measures)

    def random_state(ccore_flag, kinitial, kmax, random_state):
        data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE5)

        initial_centers = random_center_initializer(data, kinitial, random_state=random_state).initialize()
        xmeans_instance1 = xmeans(data, initial_centers, kmax, ccore=ccore_flag, random_state=random_state).process()
        xmeans_instance2 = xmeans(data, initial_centers, kmax, ccore=ccore_flag, random_state=random_state).process()

        assertion.eq(xmeans_instance1.get_total_wce(), xmeans_instance2.get_total_wce())
        assertion.eq(xmeans_instance1.get_centers(), xmeans_instance2.get_centers())
        assertion.eq(xmeans_instance1.get_clusters(), xmeans_instance2.get_clusters())
Example #3
def x_means(X, num_init_clusters=8, visualize=True):
    from pyclustering.cluster.xmeans import xmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster import cluster_visualizer_multidim

    X = list(X)

    start_centers = kmeans_plusplus_initializer(
        X, num_init_clusters).initialize()

    # criterion=0 corresponds to splitting_type.BAYESIAN_INFORMATION_CRITERION
    xmeans_instance = xmeans(X, start_centers, 32, ccore=True, criterion=0)

    # Run cluster analysis and obtain results.
    xmeans_instance.process()
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print('Number of cluster centers calculated:', len(centers))

    if visualize:
        visualizer = cluster_visualizer_multidim()
        visualizer.append_clusters(clusters, X)
        visualizer.show()
    return centers, clusters
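
# A minimal usage sketch for x_means above, on synthetic 2-D data; it assumes
# pyclustering is installed, and the blob layout here is illustrative only.
import random

data = [[random.gauss(cx, 0.5), random.gauss(cy, 0.5)]
        for cx, cy in [(0, 0), (5, 5), (10, 0)] for _ in range(50)]
centers, clusters = x_means(data, num_init_clusters=3, visualize=False)
print('Centers found:', len(centers))
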
    def xmeans_cluster(self, domain_features):
        final_centers = None
        final_radiuses = None
        final_clusters = None
        for attempt in range(5):
            initial_centers = kmeans_plusplus_initializer(domain_features,
                                                          2).initialize()
            # Create an instance of the X-Means algorithm. It starts the
            # analysis from 2 clusters; the maximum number of clusters that
            # can be allocated is half the number of feature vectors.
            max_num = int(len(domain_features) / 2)
            xmeans_instance = xmeans(domain_features, initial_centers, max_num)
            xmeans_instance.process()
            centers = xmeans_instance.get_centers()
            if attempt == 0 or len(centers) > len(final_centers):
                radiuses = []
                for cluster_index, cluster in enumerate(
                        xmeans_instance.get_clusters()):
                    radius_total = 0.0
                    for point_index in cluster:
                        dist = np.linalg.norm(domain_features[point_index] -
                                              centers[cluster_index])
                        radius_total += dist
                    radiuses.append(radius_total / len(cluster))
                final_centers = centers
                final_radiuses = radiuses
                final_clusters = xmeans_instance.get_clusters()

        return final_centers, final_radiuses, final_clusters
def template_clustering(
        start_centers,
        path,
        tolerance=0.025,
        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
        ccore=True):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample,
                             start_centers,
                             20,
                             tolerance,
                             criterion,
                             ccore,
                             repeat=5)
    (ticks, _) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if (criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif (criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
        criterion_string = "MINIMUM NOISELESS DESCRIPTION_LENGTH"

    print("Sample: ", ntpath.basename(path), "\nInitial centers: '",
          (start_centers is not None), "', Execution time: '", ticks,
          "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    visualizer = cluster_visualizer()
    visualizer.set_canvas_title(criterion_string)
    visualizer.append_clusters(clusters, sample)
    visualizer.append_cluster(centers, None, marker='*')
    visualizer.show()
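
# A hedged usage sketch for template_clustering above; SIMPLE_SAMPLES is
# assumed to come from pyclustering.samples.definitions, as elsewhere on this
# page, and the two initial centers are arbitrary.
from pyclustering.samples.definitions import SIMPLE_SAMPLES

template_clustering([[3.7, 5.5], [6.7, 7.5]], SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
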
Example #7
    def templateMaxAllocatedClusters(ccore_flag, amount_clusters, size_cluster,
                                     offset, kinitial, kmax):
        input_data = []
        for index in range(amount_clusters):
            for _ in range(size_cluster):
                input_data.append([
                    random.random() * index * offset,
                    random.random() * index * offset
                ])

        initial_centers = random_center_initializer(input_data,
                                                    kinitial).initialize()
        xmeans_instance = xmeans(input_data, initial_centers, kmax, 0.025,
                                 splitting_type.BAYESIAN_INFORMATION_CRITERION,
                                 ccore_flag)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        if len(clusters) != len(centers):
            print(input_data)
            print(initial_centers)

        assertion.ge(kmax, len(clusters))
        assertion.ge(kmax, len(centers))
        assertion.eq(len(clusters), len(centers))
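
# A small driver for templateMaxAllocatedClusters, sketching the kind of call
# the surrounding test suite presumably makes; the argument values are
# illustrative, not taken from the original tests.
templateMaxAllocatedClusters(ccore_flag=False, amount_clusters=4,
                             size_cluster=10, offset=5, kinitial=3, kmax=5)
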
Example #8
def template_clustering(
        start_centers,
        path,
        tolerance=0.025,
        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
        ccore=False):
    # Note: this snippet reads a fixed file rather than the `path` argument.
    sample = read_sample(
        '/home/tengmo/crawler_to_server_set_time/crawler/source_code_python2.7/cluster/test.txt'
    )

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion,
                             ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if (criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif (criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
        criterion_string = "MINIMUM NOISELESS DESCRIPTION_LENGTH"

    #print("Sample: ", path, "\nInitial centers: '", (start_centers is not None), "', Execution time: '", ticks, "', Number of clusters:", len(clusters), ",", criterion_string, "\n");
    print {'length': len(clusters), 'clus': clusters, 'cen': centers}
Example #9
    def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore, **kwargs):
        if isinstance(input_sample, str):
            sample = read_sample(input_sample)
        else:
            sample = input_sample

        xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore, **kwargs)
        xmeans_instance.process()
         
        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()
        wce = xmeans_instance.get_total_wce()
    
        obtained_cluster_sizes = [len(cluster) for cluster in clusters]

        assertion.eq(len(sample), sum(obtained_cluster_sizes))
        assertion.eq(len(clusters), len(centers))
        assertion.le(len(centers), kmax)

        expected_wce = 0.0
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
        for index_cluster in range(len(clusters)):
            for index_point in clusters[index_cluster]:
                expected_wce += metric(sample[index_point], centers[index_cluster])

        assertion.eq(expected_wce, wce)

        if expected_cluster_length is not None:
            assertion.eq(len(centers), len(expected_cluster_length))

            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()
            
            assertion.eq(obtained_cluster_sizes, expected_cluster_length)
Example #11
def template_clustering(
        start_centers,
        path,
        tolerance=0.025,
        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
        ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion,
                             ccore)
    (ticks, result) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()

    criterion_string = "UNKNOWN"
    if (criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
        criterion_string = "BAYESIAN_INFORMATION_CRITERION"
    elif (criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
        criterion_string = "MINIMUM_NOISELESS_DESCRIPTION_LENGTH"

    print("Sample: ", path, "\nInitial centers: '",
          (start_centers is not None), "', Execution time: '", ticks,
          "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    draw_clusters(sample, clusters)
def cluster_xMean_FixSize(binsList, amount_initial_centers=3, kmax=10):
    sample = []
    means = []
    variances = []
    slopes = []
    for data_bin in binsList:
        sample.append(
            [data_bin.get_representation(),
             data_bin.get_variance(),
             data_bin.get_slope()])
        means.append(data_bin.get_representation())
        variances.append(data_bin.get_variance())
        slopes.append(data_bin.get_slope())

    # Prepare initial centers - the amount of initial centers defines the
    # number of clusters from which X-Means starts its analysis.
    initial_centers = kmeans_plusplus_initializer(
        sample, amount_initial_centers).initialize()
    # Create an instance of the X-Means algorithm; at most kmax clusters
    # can be allocated.
    xmeans_instance = xmeans(sample, initial_centers, kmax)
    xmeans_instance.process()
    # Extract clustering results: clusters and their centers.
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print(len(clusters))

    return {"clusters": clusters, "centers": centers}
def process_xmeans(sample):
    # The asymmetric "multiplier + 5" in the original looks like a typo;
    # multiplier * 5 on both axes is presumably the intended offset.
    instance = xmeans(
        sample, [[random() + (multiplier * 5),
                  random() + (multiplier * 5)]
                 for multiplier in range(NUMBER_CLUSTERS)])
    (ticks, _) = timedcall(instance.process)
    return ticks
def template_clustering(
        start_centers,
        path,
        tolerance=0.025,
        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION,
        ccore=False):
    sample = read_sample(path)
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion,
                             ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    # clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    end = time.perf_counter()
    print(end - start)

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION_LENGTH"

    print("Sample: ", ntpath.basename(path), "\nInitial centers: '",
          (start_centers is not None), "', Execution time: '", ticks,
          "', Number of clusters:", len(centers), ",", criterion_string, "\n")
Example #15
 def testCoreInterfaceIntInputData(self):
     xmeans_instance = xmeans([[1], [2], [3], [20], [21], [22]],
                              [[2], [21]],
                              5,
                              ccore=True)
     xmeans_instance.process()
     assert len(xmeans_instance.get_clusters()) == 2
Example #16
    def templateLengthProcessData(input_sample, start_centers,
                                  expected_cluster_length, type_splitting,
                                  kmax, ccore):
        if isinstance(input_sample, str):
            sample = read_sample(input_sample)
        else:
            sample = input_sample

        #clusters = xmeans(sample, start_centers, 20, ccore);
        xmeans_instance = xmeans(sample, start_centers, kmax, 0.025,
                                 type_splitting, ccore)
        xmeans_instance.process()

        clusters = xmeans_instance.get_clusters()
        centers = xmeans_instance.get_centers()

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]

        assert len(sample) == sum(obtained_cluster_sizes)
        assert len(clusters) == len(centers)
        assert len(centers) <= kmax

        if expected_cluster_length is not None:
            assert len(centers) == len(expected_cluster_length)

            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()

            assert obtained_cluster_sizes == expected_cluster_length
Example #17
def plot_unnecessary_part_by_clustering(in_mask_path,
                                        in_img_path,
                                        out_img_path,
                                        show_flag=0):
    """ディティールマスク部を画像に矩形として表示する
    【引数】
        in_mask_path: 入力するディティールマスクのパス
        in_img_path: 矩形を重ねたいフレーム画像のパス
        out_img_path: 出力する矩形描画後の画像のパス
        show_flag: マスク画像生成後、画像を表示するかどうか。0だと表示しない。1だと表示する。

    【返り値】
        なし

    """
    in_mask = cv2.imread(in_mask_path, cv2.IMREAD_GRAYSCALE)

    height = in_mask.shape[0]
    width = in_mask.shape[1]

    X = []

    print('Checking unnecessary part...')
    for i in tqdm(range(height)):
        for j in range(width):
            if (in_mask[i, j] == 255):
                X.append([i, j])

    initializer = xmeans.kmeans_plusplus_initializer(data=X, amount_centers=2)
    initial_centers = initializer.initialize()
    xm = xmeans.xmeans(data=X, initial_centers=initial_centers)
    xm.process()
    clusters = xm.get_clusters()

    img_out = cv2.imread(in_img_path)
    mask = create_blank_mask(height, width)
    print('Clustering unnecessary part...')
    for cluster in tqdm(clusters):
        coordinates = []
        for item in cluster:
            coordinates.append(X[item])

        x, y, rect_width, rect_height = cv2.boundingRect(np.array(coordinates))
        img_out = cv2.rectangle(img_out, (y, x), (y + rect_height, x + rect_width),
                                (0, 0, 255), 2)
        for i in range(y, y + rect_height):
            for j in range(x, x + rect_width):
                mask[j, i] = 255

    cv2.imwrite(out_img_path, img_out)
    if show_flag == 0:
        return
    elif show_flag == 1:
        cv2.imshow('window', img_out)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        return
    else:
        return
    def score_embeddings(self, min_length, max_num_speakers, mode):
        """ Score embeddings.

        Args:
            min_length (int): minimal length of segment used for clustering in milliseconds
            max_num_speakers (int): maximal number of speakers
            mode (str): running mode, see examples/diarization.py for details

        Returns:
            dict: dictionary with scores for each file
        """
        result_dict = {}
        logger.info('Scoring using `{}`.'.format(
            'PLDA' if self.plda is not None else 'cosine distance'))
        for embedding_set in self.embeddings:
            name = os.path.normpath(embedding_set.name)
            embeddings_all = embedding_set.get_all_embeddings()
            embeddings_long = embedding_set.get_longer_embeddings(min_length)
            if len(embeddings_long) == 0:
                logger.warning(
                    f'No embeddings found longer than {min_length} for embedding set `{name}`.'
                )
                continue
            size = len(embedding_set)
            if size > 0:
                logger.info(
                    f'Clustering `{name}` using {len(embeddings_long)} long embeddings.'
                )
                if mode == 'diarization':
                    if embedding_set.num_speakers is not None:
                        num_speakers = embedding_set.num_speakers
                    else:
                        xm = xmeans(embeddings_long, kmax=max_num_speakers)
                        xm.process()
                        num_speakers = len(xm.get_clusters())

                    centroids = self.run_clustering(num_speakers,
                                                    embeddings_long)
                    if self.norm is None:
                        if self.plda is None:
                            result_dict[name] = cosine_similarity(
                                embeddings_all, centroids).T
                        else:
                            result_dict[name] = self.plda.score(
                                embeddings_all, centroids)
                    else:
                        result_dict[name] = self.norm.s_norm(
                            embeddings_all, centroids)
                else:
                    clusters = []
                    for k in range(1, MAX_SRE_CLUSTERS + 1):
                        if size >= k:
                            centroids = self.run_clustering(k, embeddings_long)
                            clusters.extend(x for x in centroids)
                    result_dict[name] = np.array(clusters)
            else:
                logger.warning(
                    f'No embeddings to score in `{embedding_set.name}`.')
        return result_dict
Example #19
def cl_xmeans(sample):
    initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
    xmeans_instance = xmeans(sample, initial_centers, 20)
    xmeans_instance.process()
    return xmeans_instance.get_clusters()


# slc: single linkage clustering
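
# Usage sketch for cl_xmeans, assuming pyclustering's bundled FCPS sample
# files; any 2-D point list would work equally well.
from pyclustering.samples.definitions import FCPS_SAMPLES
from pyclustering.utils import read_sample

clusters = cl_xmeans(read_sample(FCPS_SAMPLES.SAMPLE_LSUN))
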
    def xmeansRoutine(self):

        self.initial_centers = kmeans_plusplus_initializer(
            self.datalist, self.amount_initial_centers).initialize()
        self.xmeans_instance = xmeans(self.datalist, self.initial_centers,
                                      self.amount_max_centers)
        self.xmeans_instance.process()
        self.clusters = self.xmeans_instance.get_clusters()
        self.centers = self.xmeans_instance.get_centers()
Example #21
    def score_ivec(self, min_length, max_num_speakers, num_threads):
        """ Score i-vectors.

        Args:
            min_length (int): minimal length of segment used for clustering in milliseconds
            max_num_speakers (int): maximal number of speakers
            num_threads (int): number of threads to use

        Returns:
            dict: dictionary with scores for each file
        """
        scores_dict = {}
        for ivecset in self.ivecs:
            name = os.path.normpath(ivecset.name)
            ivecs_all = ivecset.get_all()
            ivecs_long = ivecset.get_longer(min_length)
            loginfo('Scoring {} ...'.format(name))
            size = ivecset.size()
            if size > 0:
                if ivecset.num_speakers is not None:
                    num_speakers = ivecset.num_speakers
                    sklearnkmeans = sklearnKMeans(
                        n_clusters=num_speakers,
                        n_init=100,
                        n_jobs=num_threads).fit(ivecs_long)
                    if self.plda is None:
                        centroids = sklearnkmeans.cluster_centers_
                    else:
                        centroids = PLDAKMeans(sklearnkmeans.cluster_centers_,
                                               num_speakers,
                                               self.plda).fit(ivecs_long)
                else:
                    xm = xmeans(ivecs_long, kmax=max_num_speakers)
                    xm.process()
                    num_speakers = len(xm.get_clusters())
                    sklearnkmeans = sklearnKMeans(
                        n_clusters=num_speakers,
                        n_init=100,
                        n_jobs=num_threads).fit(ivecs_long)
                    centroids = sklearnkmeans.cluster_centers_
                if self.norm is None:
                    if self.plda is None:
                        ivecs_all = Utils.l2_norm(ivecs_all)
                        centroids = Utils.l2_norm(centroids)
                        scores_dict[name] = cosine_similarity(
                            ivecs_all, centroids).T
                    else:
                        scores_dict[name] = self.plda.score(
                            ivecs_all, centroids)
                else:
                    ivecs_all = Utils.l2_norm(ivecs_all)
                    centroids = Utils.l2_norm(centroids)
                    scores_dict[name] = self.norm.s_norm(ivecs_all, centroids)
            else:
                logwarning('No i-vectors to score in {}.'.format(ivecset.name))
        return scores_dict
 def templateClusterAllocationOneDimensionData(self, ccore_flag):
     input_data = [[0.0] for _ in range(10)] + [[5.0] for _ in range(10)] + [[10.0] for _ in range(10)] + [[15.0] for _ in range(10)]

     xmeans_instance = xmeans(input_data, [[0.5], [5.5], [10.5], [15.5]], 20, 0.025, splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
     xmeans_instance.process()
     clusters = xmeans_instance.get_clusters()

     assert len(clusters) == 4
     for cluster in clusters:
         assert len(cluster) == 10
Example #25
    def __run_feature_xmeans(self, features, num_init_centers = 10, max_centers = 30, \
        clust_size_threshold = 1, dist_threshold = 10) -> list:

        # run xmeans algorithm
        initial_centers = kmeans_plusplus_initializer(
            features, num_init_centers).initialize()
        algo = xmeans(features,
                      initial_centers=initial_centers,
                      kmax=max_centers)
        algo.process()
        centroids, clusters = algo.get_centers(), algo.get_clusters()

        # pre-process centroids
        p_centroids = []
        for coord in centroids:
            row, col = coord[0], coord[1]
            p_centroids.append((int(round(row)), int(round(col))))

        # determine close centroids
        comb_indices = set()
        for comb in itertools.combinations(range(len(p_centroids)), 2):
            cen, c_cen = p_centroids[comb[0]], p_centroids[comb[1]]
            dist = math.sqrt((cen[0] - c_cen[0])**2 + (cen[1] - c_cen[1])**2)
            if dist <= dist_threshold:
                comb_indices.add(frozenset(comb))

        # find transitive centroid clusters
        trans_centroids = []
        for comb in comb_indices:
            added_flag = False
            for i in range(len(trans_centroids)):
                if len(trans_centroids[i].intersection(comb)):
                    trans_centroids[i] = trans_centroids[i].union(comb)
                    added_flag = True
                    break
            if not added_flag:
                trans_centroids.append(frozenset(comb))

        # combine close transitive centroids sets
        c_centroids, added_indices = [], set()
        for combs in trans_centroids:
            n_centroid = [0, 0]
            for c_idx in combs:
                added_indices.add(c_idx)
                n_centroid[0] += centroids[c_idx][0]
                n_centroid[1] += centroids[c_idx][1]
            n_centroid[0] /= len(combs)
            n_centroid[1] /= len(combs)
            c_centroids.append(n_centroid)

        # purge under-sized clusters
        for c_idx in range(len(centroids)):
            if c_idx in added_indices or len(clusters[c_idx]) \
                <= clust_size_threshold:
                continue
            c_centroids.append(centroids[c_idx])
        return c_centroids
def template_clustering_performance(start_centers, path, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print("Sample: ", ntpath.basename(path), "', Execution time: '", ticks, "',", criterion_string)
def est_num_clusters(embs, max_num, init_num):
    """Use xmeans to estimate number of speakers."""

    embs_list = embs.tolist()
    initial_centers = kmeans_plusplus_initializer(embs_list, init_num).initialize()
    xm = xmeans(embs_list, initial_centers, kmax=max_num, ccore=True)
    xm.process()
    num_speakers = len(xm.get_clusters())
    print('Estimated number of speakers: ' + str(num_speakers))

    return num_speakers
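
# Usage sketch for est_num_clusters with synthetic "embeddings": three shifted
# Gaussian blobs stand in for real speaker embeddings, and the shapes are
# illustrative only.
import numpy as np

embs = np.vstack([np.random.randn(20, 8) + shift for shift in (0.0, 6.0, 12.0)])
num_speakers = est_num_clusters(embs, max_num=10, init_num=2)
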
    def fit(self, X, y=None):

        initial_number_of_cluster_centers = self.min_clusters

        initial_centers = kmeans_plusplus_initializer(
            X, initial_number_of_cluster_centers).initialize()

        self.xmeans_instance = xmeans(X, initial_centers, self.max_clusters)

        self.xmeans_instance.process()

        return self
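
# The fit method above reads self.min_clusters and self.max_clusters, so it
# presumably lives on a small estimator-style wrapper; this host class is a
# hypothetical reconstruction for illustration only.
class XMeansEstimator:
    def __init__(self, min_clusters=2, max_clusters=20):
        self.min_clusters = min_clusters
        self.max_clusters = max_clusters
        self.xmeans_instance = None

    def fit(self, X, y=None):
        initial_centers = kmeans_plusplus_initializer(
            X, self.min_clusters).initialize()
        self.xmeans_instance = xmeans(X, initial_centers, self.max_clusters)
        self.xmeans_instance.process()
        return self

# e.g. clusters = XMeansEstimator(2, 20).fit(data).xmeans_instance.get_clusters()
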
Example #30
    def templatePredict(path_to_file, initial_centers, points, expected_amount, expected_closest_clusters, ccore, **kwargs):
        sample = read_sample(path_to_file)

        kmax = kwargs.get('kmax', 20)

        xmeans_instance = xmeans(sample, initial_centers, kmax, 0.025, splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore)
        xmeans_instance.process()

        closest_clusters = xmeans_instance.predict(points)
        assertion.eq(expected_amount, len(xmeans_instance.get_clusters()))
        assertion.eq(len(expected_closest_clusters), len(closest_clusters))
        assertion.true(numpy.array_equal(numpy.array(expected_closest_clusters), closest_clusters))
Example #31
def make_range_mask(in_mask_path, out_mask_path, show_flag=0):
    """x-meansを使ってレンジマスクを生成する
    【引数】
        in_mask_path: 入力するディティールマスク画像のパス
        out_mask_path: 出力するレンジマスク画像のパス
        show_flag: マスク画像生成後、画像を表示するかどうか。0だと表示しない。1だと表示する。

    【返り値】
        なし

    """
    in_mask = cv2.imread(in_mask_path, cv2.IMREAD_GRAYSCALE)

    height = in_mask.shape[0]
    width = in_mask.shape[1]

    X = []

    print('Checking unnecessary part...')
    for i in tqdm(range(height)):
        for j in range(width):
            if (in_mask[i, j] == 255):
                X.append([i, j])

    initializer = xmeans.kmeans_plusplus_initializer(data=X, amount_centers=2)
    initial_centers = initializer.initialize()
    xm = xmeans.xmeans(data=X, initial_centers=initial_centers)
    xm.process()
    clusters = xm.get_clusters()

    mask = create_blank_mask(height, width)
    print('Clustering unnecessary part...')
    for cluster in tqdm(clusters):
        coordinates = []
        for item in cluster:
            coordinates.append(X[item])

        x, y, rect_width, rect_height = cv2.boundingRect(np.array(coordinates))

        for i in range(y, y + rect_height):
            for j in range(x, x + rect_width):
                mask[j, i] = 255

    cv2.imwrite(out_mask_path, mask)
    if show_flag == 0:
        return
    elif show_flag == 1:
        cv2.imshow('window', mask)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        return
    else:
        return
    def score_embeddings(self, min_length, max_num_speakers):
        """ Score embeddings.

        Args:
            min_length (int): minimal length of segment used for clustering in milliseconds
            max_num_speakers (int): maximal number of speakers

        Returns:
            dict: dictionary with scores for each file
        """
        scores_dict = {}
        logger.info('Scoring using `{}`.'.format('PLDA' if self.plda is not None else 'cosine distance'))
        for embedding_set in self.embeddings:
            name = os.path.normpath(embedding_set.name)
            embeddings_all = embedding_set.get_all_embeddings()
            embeddings_long = embedding_set.get_longer_embeddings(min_length)
            if len(embeddings_long) == 0:
                logger.warning(
                    'No embeddings found longer than {} for embedding set `{}`.'.format(min_length, name))
                continue
            size = len(embedding_set)
            if size > 0:
                logger.info('Clustering `{}` using {} long embeddings.'.format(name, len(embeddings_long)))
                if embedding_set.num_speakers is not None:
                    num_speakers = embedding_set.num_speakers
                    if self.use_l2_norm:
                        kmeans_clustering = SphericalKMeans(
                            n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                    else:
                        kmeans_clustering = sklearnKMeans(
                            n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                    if self.plda is None:
                        centroids = kmeans_clustering.cluster_centers_
                    else:
                        centroids = PLDAKMeans(
                            kmeans_clustering.cluster_centers_, num_speakers, self.plda).fit(embeddings_long)
                else:
                    xm = xmeans(embeddings_long, kmax=max_num_speakers)
                    xm.process()
                    num_speakers = len(xm.get_clusters())
                    kmeans_clustering = sklearnKMeans(
                        n_clusters=num_speakers, n_init=100, n_jobs=1).fit(embeddings_long)
                    centroids = kmeans_clustering.cluster_centers_
                if self.norm is None:
                    if self.plda is None:
                        scores_dict[name] = cosine_similarity(embeddings_all, centroids).T
                    else:
                        scores_dict[name] = self.plda.score(embeddings_all, centroids)
                else:
                    scores_dict[name] = self.norm.s_norm(embeddings_all, centroids)
            else:
                logger.warning('No embeddings to score in `{}`.'.format(embedding_set.name))
        return scores_dict
Example #33
 def xmeans_model(self, sample):
     amount_initial_centers = 2
     initial_centers = kmeans_plusplus_initializer(
         sample, amount_initial_centers).initialize()
     # Create an instance of the X-Means algorithm. It starts the analysis
     # from 2 clusters; the maximum number of clusters is half the sample size.
     max_num = int(len(sample) / 2)
     xmeans_instance = xmeans(sample, initial_centers, max_num)
     xmeans_instance.process()
     # Extract clustering results: clusters and their centers
     clusters = xmeans_instance.get_clusters()
     centers = xmeans_instance.get_centers()
     return clusters, centers
Example #34
def get_x_clusters(doc_vectors):
    # Prepare initial centers - amount of initial centers defines amount of clusters from which X-Means will
    # start analysis.
    amount_initial_centers = 2
    initial_centers = kmeans_plusplus_initializer(
        doc_vectors, amount_initial_centers).initialize()
    # Create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters, the maximum
    # number of clusters that can be allocated is 20.
    xmeans_instance = xmeans(doc_vectors, initial_centers, 20)
    xmeans_instance.process()
    # Extract clustering results: clusters and their centers
    clusters = xmeans_instance.get_clusters()
    return clusters
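
# Usage sketch for get_x_clusters: random vectors stand in for real document
# embeddings here, purely for illustration.
import random

doc_vectors = [[random.random() for _ in range(16)] for _ in range(100)]
clusters = get_x_clusters(doc_vectors)
print('Allocated clusters:', len(clusters))
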
def template_clustering(start_centers, path, tolerance=0.025, criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN_INFORMATION_CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM_NOISELESS_DESCRIPTION_LENGTH"

    print("Sample: ", path, "\tExecution time: ", ticks, "Number of clusters: ", len(clusters), criterion_string, "\n")

    draw_clusters(sample, clusters)
 def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length, type_splitting, ccore=False):
     sample = read_sample(path_to_file)

     xmeans_instance = xmeans(sample, start_centers, 20, 0.025, type_splitting, ccore)
     xmeans_instance.process()

     clusters = xmeans_instance.get_clusters()

     obtained_cluster_sizes = [len(cluster) for cluster in clusters]
     assert len(sample) == sum(obtained_cluster_sizes)

     obtained_cluster_sizes.sort()
     expected_cluster_length.sort()
     assert obtained_cluster_sizes == expected_cluster_length
def main():
    file_path = "/home/joli/Downloads/Av_CP343-CP348_ft.sp"
    frequencies, intensities = read_data(file_path)
    max_x = max(frequencies)
    max_y = max(intensities)
    avg = sum(intensities) / len(intensities)

    # Marie's Function
    plot.plot(frequencies, intensities, color='grey')
    # test(peak_finder, frequencies, intensities, 0.2)
    #
    # # Kyle's Function
    # plot.plot(frequencies, intensities, color='grey')
    # test(k_peak_finder, frequencies, intensities, 3)
    # # Using Peak Utils
    # plot.axhline(linewidth=4,y=avg, color='green', xmin=0, xmax=max_x)
    # plot.plot(frequencies, intensities, 'bo', color='grey')
    # plot.plot(frequencies, intensities, 'bo', color='grey')
    # test(peak_finder_peakutils, frequencies, intensities, 0.0001)

    # Optimized
    # test(optimized_peak_finder, frequencies, intensities)


    # Test Difference
    # f1,i1 = peak_finder(frequencies, intensities, 1)
    f2, i2 = k_peak_finder(frequencies, intensities, 5)
    sample = read_sample("/home/joli/PycharmProjects/Experiments/168-175_pzf1.sp")
    instance = xmeans.xmeans(sample, [3.7, 5.5])
    instance.process()
    clusters = instance.get_clusters()
    print clusters
    # A = set(f1)
    # B = set(f2)
    # C = A&B
    #
    # print len(f1)
    # print len(f2)
    # print len(C)
    # plot.bar(f1, i1, color="purple")
    plot.bar(f2, i2, color="green", width=1)
    # plot.bar(C, [1] * len(C), color="yellow", bottom=-1)
    plot.show()
def template_clustering(start_centers, path, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
    sample = read_sample(path)
    
    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)
    
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if (criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION): criterion_string = "BAYESIAN INFORMATION CRITERION";
    elif (criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH): criterion_string = "MINIMUM NOISELESS DESCRIPTION_LENGTH";
    
    print("Sample: ", ntpath.basename(path), "\nInitial centers: '", (start_centers is not None), "', Execution time: '", ticks, "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    visualizer = cluster_visualizer()
    visualizer.set_canvas_title(criterion_string)
    visualizer.append_clusters(clusters, sample)
    visualizer.append_cluster(centers, None, marker = '*')
    visualizer.show()