Example #1
    def templateClusterAllocationTheSameObjects(number_objects, number_clusters, ccore_flag = False):
        value = random()
        input_data = [ [value] ] * number_objects
        
        initial_medoids = []
        step = int(math.floor(number_objects / number_clusters))
        for i in range(number_clusters):
            initial_medoids.append(i * step)
        
        kmedoids_instance = kmedoids(input_data, initial_medoids, ccore=ccore_flag)
        kmedoids_instance.process()
        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        assertion.eq(len(clusters), len(medoids))
        assertion.eq(len(set(medoids)), len(medoids))
        
        object_mark = [False] * number_objects
        allocated_number_objects = 0
        
        for cluster in clusters:
            for index_object in cluster: 
                assertion.eq(False, object_mark[index_object])    # one object can be in only one cluster.
                
                object_mark[index_object] = True
                allocated_number_objects += 1
            
        assertion.eq(number_objects, allocated_number_objects)    # number of allocated objects should be the same.
Example #2
    def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # the center initializer is randomized, so allow several attempts
            testing_attempts = 10

        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
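
For orientation, the constructor this template drives is kmedoids(data, initial_index_medoids, tolerance, ccore, metric=..., data_type=..., itermax=...). A minimal standalone run with a custom metric might look like the sketch below; the data and medoid indices are illustrative, not from the test suite.

from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import distance_metric, type_metric

# Two well-separated groups; indices 0 and 3 are illustrative initial medoids.
data = [[1.0, 1.0], [1.2, 0.9], [0.8, 1.1], [5.0, 5.2], [5.1, 4.9], [4.8, 5.0]]
metric = distance_metric(type_metric.MANHATTAN)

instance = kmedoids(data, [0, 3], tolerance=0.001, metric=metric)
instance.process()
print(instance.get_clusters())   # e.g. [[0, 1, 2], [3, 4, 5]]
print(instance.get_medoids())    # final medoid indices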
Example #3
 def templateClusterAllocationOneDimensionData(self):
     input_data = [[random()] for i in range(10)] + [[random() + 3] for i in range(10)] + [[random() + 5] for i in range(10)] + [[random() + 8] for i in range(10)]

     kmedoids_instance = kmedoids(input_data, [5, 15, 25, 35], 0.025)
     kmedoids_instance.process()
     clusters = kmedoids_instance.get_clusters()

     assert len(clusters) == 4
     for cluster in clusters:
         assert len(cluster) == 10
Example #4
 def templateClusterAllocationOneDimensionData(ccore_flag):
     input_data = [[random()] for i in range(10)] + [ [random() + 3] for i in range(10) ] + [ [random() + 5] for i in range(10) ] + [ [random() + 8] for i in range(10) ]
      
     kmedoids_instance = kmedoids(input_data, [ 5, 15, 25, 35 ], 0.025, ccore_flag)
     kmedoids_instance.process()
     clusters = kmedoids_instance.get_clusters()
      
     assertion.eq(4, len(clusters))
     for cluster in clusters:
         assertion.eq(10, len(cluster))
Example #5
def template_clustering(start_medoids, path, tolerance=0.25):
    sample = read_sample(path)

    kmedoids_instance = kmedoids(sample, start_medoids, tolerance)
    (ticks, result) = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    draw_clusters(sample, clusters)
Example #6
 def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length):
     sample = read_sample(path_to_file)

     kmedoids_instance = kmedoids(sample, start_centers, 0.025)
     kmedoids_instance.process()

     clusters = kmedoids_instance.get_clusters()

     obtained_cluster_sizes = [len(cluster) for cluster in clusters]
     assert len(sample) == sum(obtained_cluster_sizes)

     obtained_cluster_sizes.sort()
     expected_cluster_length.sort()
     assert obtained_cluster_sizes == expected_cluster_length
Example #7
def template_clustering(start_medoids, path, tolerance=0.25, show=True):
    sample = read_sample(path)

    kmedoids_instance = kmedoids(sample, start_medoids, tolerance)
    (ticks, result) = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is True:
        visualizer = cluster_visualizer(1)
        visualizer.append_clusters(clusters, sample, 0)
        visualizer.show()

    return (sample, clusters)
Example #8
def template_clustering(start_medoids, path, tolerance = 0.25, show = True):
    sample = read_sample(path)
    
    kmedoids_instance = kmedoids(sample, start_medoids, tolerance)
    (ticks, result) = timedcall(kmedoids_instance.process)
    
    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is True:
        visualizer = cluster_visualizer(1)
        visualizer.append_clusters(clusters, sample, 0)
        visualizer.append_cluster([sample[index] for index in start_medoids], marker='*', markersize=15)  # initial medoids
        visualizer.append_cluster(medoids, data=sample, marker='*', markersize=15)  # final medoids
        visualizer.show()
    
    return sample, clusters
Example #9
 def templateAllocateRequestedClusterAmount(data, amount_clusters, initial_medoids, ccore_flag):
     if initial_medoids is None:
         initial_medoids = []
         for _ in range(amount_clusters):
             index_point = randint(0, len(data) - 1)
             while index_point in initial_medoids:
                 index_point = randint(0, len(data) - 1)
             
             initial_medoids.append(index_point)
         
     kmedoids_instance = kmedoids(data, initial_medoids, 0.025, ccore = ccore_flag)
     kmedoids_instance.process()
     clusters = kmedoids_instance.get_clusters()
     
     assertion.eq(len(clusters), amount_clusters)
     amount_objects = 0
     for cluster in clusters:
         amount_objects += len(cluster)
     
     assertion.eq(amount_objects, len(data))
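
The rejection loop above draws indices until they are unique; random.sample expresses the same intent in one call. A sketch under that observation (the helper name is ours, not from the test):

import random

def pick_initial_medoids(amount_clusters, data_length):
    # random.sample returns unique indices, so no rejection loop is needed.
    return random.sample(range(data_length), amount_clusters)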
Example #10
 def templateClusterAllocationTheSameObjects(self, number_objects, number_clusters, ccore_flag = False):
     value = random()
     input_data = [[value]] * number_objects

     initial_medoids = []
     step = math.floor(number_objects / number_clusters)
     for i in range(number_clusters):
         initial_medoids.append(i * step)

     kmedoids_instance = kmedoids(input_data, initial_medoids)
     kmedoids_instance.process()
     clusters = kmedoids_instance.get_clusters()

     object_mark = [False] * number_objects
     allocated_number_objects = 0

     for cluster in clusters:
         for index_object in cluster:
             assert object_mark[index_object] == False    # one object can be in only one cluster.

             object_mark[index_object] = True
             allocated_number_objects += 1

     assert number_objects == allocated_number_objects    # number of allocated objects should be the same.
Example #11
def registration_icp(static,
                     moving,
                     points=20,
                     pca=True,
                     maxiter=100000,
                     affine=[0, 0, 0, 0, 0, 0, 1],
                     clustering=None,
                     medoids=[0, 1, 2],
                     k=3,
                     beta=999,
                     max_dist=40,
                     dist='pc'):
    options = {
        'maxcor': 10,
        'ftol': 1e-7,
        'gtol': 1e-5,
        'eps': 1e-8,
        'maxiter': maxiter
    }
    #options1 = {'xtol': 1e-6, 'ftol': 1e-6, 'maxiter': 1e6}
    if pca:
        moving = pca_transform_norm(static, moving, max_dist)
    else:
        mean_m = np.mean(np.concatenate(moving), axis=0)
        mean_s = np.mean(np.concatenate(static), axis=0)
        moving = [i - mean_m + mean_s for i in moving]

    original_moving = moving.copy()
    static = set_number_of_points(static, points)
    moving = set_number_of_points(moving, points)

    if clustering == 'kmeans':
        kmeans = KMeans(k).fit(np.concatenate(moving))
        idx = {i: np.where(kmeans.labels_ == i)[0] for i in range(k)}
        #dist = Clustering().distance_pc_clustering_mean
        if dist == 'pc':
            dist_fun = distance_pc_clustering_mean
        else:
            dist_fun = distance_tract_clustering_mean
        args = (static, moving, kmeans, idx, beta, max_dist)
        print('kmeans')
    elif clustering == 'kmedoids':
        k_medoids = kmedoids(np.concatenate(moving), medoids)
        k_medoids.process()
        #dist = Clustering().distance_pc_clustering_medoids
        if dist == 'pc':
            dist_fun = distance_pc_clustering_medoids
        else:
            dist_fun = distance_tract_clustering_medoids
        args = (static, moving, k_medoids, beta, max_dist)
        print('kmedoids')
    else:
        if dist == 'pc':
            dist_fun = distance_pc
            args = (static, moving, beta, max_dist)
        else:
            dist_fun = distance_mdf
            args = (static, moving)
        print('Without Clustering')

    # available optimizer methods: 'L-BFGS-B', 'Powell'
    m = Optimizer(dist_fun,
                  affine,
                  args=args,
                  method='L-BFGS-B',
                  options=options)
    #m = Optimizer(dist, affine,args=args,method='Powell',options=options1)
    m.print_summary()
    mat = compose_matrix44(m.xopt)
    return transform_streamlines(original_moving, mat)
Example #12
def build_clusterer(data, nclusters, method, **kwargs):
    """
      A simple wrapper to various clustering approaches.
      Cluster the given data into nclusters by using the
      specified method. Depending on the specified method
      different packages may be required and different
      arguments are expected in the kwargs dict.
    """

    features = copy.deepcopy(kwargs["config"]["features"])
    print("{0} cluster features used {1}".format(INFO, features))

    windows = []

    has_gc = False
    if 'gc' in features:
        features.pop(features.index('gc'))
        has_gc = True

    has_mean_ratio = False
    if 'mean_ratio' in features:
        features.pop(features.index('mean_ratio'))
        has_mean_ratio = True

    has_wga_mean = False
    if 'wga_mean' in features:
        features.pop(features.index('wga_mean'))
        has_wga_mean = True

    has_no_wga_mean = False
    if 'no_wga_mean' in features:
        features.pop(features.index('no_wga_mean'))
        has_no_wga_mean = True

    for window in data:

        if has_wga_mean:
            window_values = [window.get_feature(feature='mean', name=WindowType.WGA)]
        elif has_no_wga_mean:
            window_values = [window.get_feature(feature='mean', name=WindowType.NO_WGA)]
        else:
            window_values = window.get_features(features=features)

        if has_gc:
            window_values.append(window.get_feature(feature='gc', name=WindowType.WGA))

        if has_mean_ratio:
            means = window.get_features(features=['mean'])
            ratio = (means[0] + 1) / (means[1] + 1)
            window_values.append(ratio)

        windows.append(window_values)

    if method == "kmeans":

        from sklearn.cluster import KMeans
        clusterer = KMeans(n_clusters=nclusters)

        clusterer.fit(windows)
        return clusterer
    elif method == "kmedoids":

        from pyclustering.cluster.kmedoids import kmedoids

        metric = get_distance_metric(dist_metric=kwargs["config"]["metric"].upper(),
                                     degree=kwargs["config"]["metric_degree"]
                                     if 'metric_degree' in kwargs["config"] else 0)

        initial_index_medoids = []
        if kwargs["config"]["init_cluster_idx"] == "random_from_data":
            import random

            for c in range(nclusters):
                idx = random.randint(0, len(windows) - 1)

                if idx in initial_index_medoids:

                    # try ten times before quitting
                    for time in range(10):
                        idx = random.randint(0, len(windows) - 1)

                        if idx in initial_index_medoids:
                            continue
                        else:
                            initial_index_medoids.append(idx)
                            break

                else:
                    initial_index_medoids.append(idx)
        else:
            initial_index_medoids = kwargs["config"]["init_cluster_idx"]

        clusterer = kmedoids(data=windows,
                             initial_index_medoids=initial_index_medoids,
                             metric=metric)
        clusterer.process()
        return clusterer, initial_index_medoids

    raise Error("Invalid clustering method: " + method)
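
A hypothetical call of this wrapper, assuming a config dict shaped like the keys the function reads; the feature names, metric, and the windows_data variable below are illustrative, not from the source:

config = {
    "features": ["mean", "gc"],            # illustrative feature names
    "metric": "manhattan",                 # resolved by get_distance_metric after .upper()
    "init_cluster_idx": "random_from_data",
}

# `windows_data` stands for the iterable of window objects this wrapper expects.
clusterer, medoid_idx = build_clusterer(data=windows_data,
                                        nclusters=3,
                                        method="kmedoids",
                                        config=config)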
Example #13
def LPAM(graph, k=2, threshold=0.5, distance="amp", seed=0):
    """
    Link Partitioning Around Medoids

    :param graph: a networkx object
    :param k: number of clusters
    :param threshold: merging threshold in [0,1], default 0.5
    :param distance: type of distance: "amp" - amplified commute distance, or
    "cm" - commute distance, or distance matrix between all edges as np ndarray
    :param seed: random seed for k-medoid heuristic

    :return: NodeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.lpam(G, k=2, threshold=0.4, distance = "amp")

    :References:
    Link Partitioning Around Medoids https://arxiv.org/abs/1907.08731
    Alexander Ponomarenko, Leonidas Pitsoulis, Marat Shamshetdinov
    """
    def getCommuteDistance(G):
        """
        Returns the commute distance matrix
        """
        verts = list(G.nodes)
        n = len(verts)
        vol = nx.volume(G, verts)

        # use NetworkX to get Laplacian
        L = nx.laplacian_matrix(G)
        L = L.todense()
        Gamma = L + (1 / n) * np.ones([n, n])
        CM = np.zeros([n, n])

        # get Moore-Penrose pseudo inverse
        Gamma_pinv = np.linalg.pinv(Gamma, rcond=1e-4)
        for i in range(n):
            for j in range(i + 1, n):
                CM[i, j] = vol * (Gamma_pinv[i, i] + Gamma_pinv[j, j] -
                                  2 * Gamma_pinv[i, j])
                CM[j, i] = CM[i, j]
        return CM

    def getAmp(G):
        """
        Returns amplified commute distance matrix
        """
        verts = list(G.nodes)
        n = len(verts)

        # get adj matrix
        A = nx.adjacency_matrix(G)
        A = A.todense()

        # use NetworkX to get Laplacian
        L = nx.laplacian_matrix(G)
        L = L.todense()
        Gamma = L + (1 / n) * np.ones([n, n])
        C_AMP = np.zeros([n, n])

        # get Moore-Penrose pseudo inverse
        Gamma_pinv = np.linalg.pinv(Gamma, rcond=1e-4)
        for i in range(n):
            for j in range(i + 1, n):
                r_ij = (Gamma_pinv[i, i] + Gamma_pinv[j, j] -
                        2 * Gamma_pinv[i, j])  # resistance dist
                d_i = G.degree(list(G.nodes())[i])
                d_j = G.degree(list(G.nodes())[j])
                if d_i != 0 and d_j != 0:
                    s_ij = r_ij - (1 / d_i) - (1 / d_j)
                    w_ij = A[i, j]
                    w_ii = A[i, i]
                    w_jj = A[j, j]
                    u_ij = (((2 * w_ij) / (d_i * d_j)) - (w_ii / (d_i**2)) -
                            (w_jj / (d_j**2)))
                    C_AMP[i, j] = s_ij + u_ij
                    C_AMP[j, i] = s_ij + u_ij
                else:
                    C_AMP[i, j] = np.NaN
                    C_AMP[j, i] = np.NaN
        return C_AMP

    line_graph = nx.line_graph(graph)
    D = None
    distance_name = distance
    if distance == "amp":
        D = getAmp(line_graph)
    if distance == "cm":
        D = getCommuteDistace
    if isinstance(distance, np.ndarray):
        D = distance
        distance_name = "custom"
    if D is None:
        raise TypeError(
            'Parameter distance should be "amp"/"cm", or numpy.ndarray')
    _n = len(line_graph.nodes())
    np.random.seed(seed)
    initial_medoids = np.random.choice(_n, k, replace=False)
    kmedoids_instance = kmedoids(D,
                                 initial_medoids,
                                 data_type="distance_matrix")
    # run cluster analysis and obtain results
    kmedoids_instance.process()

    clusters = kmedoids_instance.get_clusters()

    final_clusters = {}
    for c_i, c in enumerate(clusters):
        for line_vertex in c:
            source, target = list(line_graph.nodes())[line_vertex]
            if source not in final_clusters:
                final_clusters[source] = []
            final_clusters[source].append(c_i)
            if target not in final_clusters:
                final_clusters[target] = []

            final_clusters[target].append(c_i)

    res_clusters = {}
    for v, l in final_clusters.items():
        degree = len(l)
        res = defaultdict(list)
        for x in l:
            res[x].append(x)
        covering = np.zeros(k)
        for c_i, _l in res.items():
            covering[c_i] = len(_l) / degree

        res_clusters[v] = covering

    _res_clusters = [[] for i in range(k)]

    for v, l in res_clusters.items():
        for i in range(k):
            if l[i] >= threshold:
                _res_clusters[i].append(v)

    return NodeClustering(
        communities=[c for c in _res_clusters if len(c) > 0],
        graph=graph,
        method_name="lpam " + distance_name,
        method_parameters={
            "k": k,
            "threshold": threshold,
            "distance": distance_name,
            "seed": seed,
        },
        overlap=True,
    )
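
LPAM hands kmedoids a precomputed matrix via data_type='distance_matrix'; stripped to its essentials, that pattern is the sketch below (matrix values illustrative):

import numpy as np
from pyclustering.cluster.kmedoids import kmedoids

# Symmetric pairwise distances between four objects (illustrative values).
D = np.array([[0.0, 1.0, 4.0, 5.0],
              [1.0, 0.0, 3.0, 4.0],
              [4.0, 3.0, 0.0, 1.0],
              [5.0, 4.0, 1.0, 0.0]])

instance = kmedoids(D, [0, 2], data_type='distance_matrix')
instance.process()
print(instance.get_clusters())   # e.g. [[0, 1], [2, 3]]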
Example #14
    def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
        data_type = kwargs.get('data_type', 'points')
        metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

        original_data = read_sample(data_file)
        data = original_data

        if data_type == 'distance_matrix':
            data = calculate_distance_matrix(original_data, metric)

        reader = answer_reader(answer_file)

        amount_medoids = len(reader.get_clusters())

        initial_medoids = kmeans_plusplus_initializer(
            data, amount_medoids, **kwargs).initialize(return_index=True)
        kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore,
                                     **kwargs)

        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        expected_length_clusters = sorted(reader.get_cluster_lengths())

        assertion.eq(len(expected_length_clusters), len(medoids))
        assertion.eq(len(data), sum([len(cluster) for cluster in clusters]))
        assertion.eq(sum(expected_length_clusters),
                     sum([len(cluster) for cluster in clusters]))

        unique_medoids = set()
        for medoid in medoids:
            assertion.false(
                medoid in unique_medoids,
                message="Medoids '%s' is not unique (actual medoids: '%s')" %
                (str(medoid), str(unique_medoids)))
            unique_medoids.add(medoid)

        unique_points = set()
        for cluster in clusters:
            for point in cluster:
                assertion.false(
                    point in unique_points,
                    message=
                    "Point '%s' is already assigned to one of the clusters." %
                    str(point))
                unique_points.add(point)

        assertion.eq(expected_length_clusters,
                     sorted([len(cluster) for cluster in clusters]))

        expected_clusters = reader.get_clusters()
        for actual_cluster in clusters:
            cluster_found = False
            for expected_cluster in expected_clusters:
                if actual_cluster == expected_cluster:
                    cluster_found = True

            assertion.true(
                cluster_found,
                message="Actual cluster '%s' is not found among expected." %
                str(actual_cluster))
Example #15
    while len(medoidsToInit) < k_clusters:
        number = random.randrange(0, points_amount)
        if number not in medoidsToInit:
            medoidsToInit.append(number)

    return medoidsToInit


def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    data = read_sample(str(root) + '\\' + nameData)
    
    kClusters = canoc(data, kmin, kmax)
    
    initial_medoids = rci(data, kClusters).initialize()

    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)

    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)
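
Assuming dbs and chs are the scikit-learn Davies-Bouldin and Calinski-Harabasz scorers aliased at import time (the snippet's imports are not shown), the scoring calls correspond to:

import numpy as np
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

X = np.random.rand(50, 2)                   # illustrative data
labels = np.random.randint(0, 3, size=50)   # illustrative cluster labels

print(davies_bouldin_score(X, labels))      # lower is better
print(calinski_harabasz_score(X, labels))   # higher is better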
Example #16
def process_kmedoids(sample):
    instance = kmedoids(sample, [ CURRENT_CLUSTER_SIZE * multiplier for multiplier in range(NUMBER_CLUSTERS) ])
    (ticks, _) = timedcall(instance.process)
    return ticks
Example #17
            'wine_servings'] == column['spirit_servings']:
        nova_coluna_numerica.append(3)
        nova_coluna_nominal.append('none')

# Adding the new columns
bebida_mundo['most_consumed_number'] = nova_coluna_numerica
bebida_mundo['most_consumed_nominal'] = nova_coluna_nominal

# Create a variable with the numeric columns for the amount consumed of each class
bebida = bebida_mundo.iloc[:, 1:4].values

# Create a variable with the classes of the records
bebida_numero = bebida_mundo.iloc[:, 5].values

# Finds the k-medoids automatically (run the 2 commands together)
cluster = kmedoids(bebida, [117, 68, 61])
cluster.get_medoids()

# Run the clustering process
cluster.process()

# The previsoes variable holds the clusters already computed by cluster.process()
previsoes = cluster.get_clusters()

# The medoides variable holds each medoid (the center of a cluster)
medoides = cluster.get_medoids()

# Draw a chart with the 3 groups, where * marks the medoid centers (run the 4 commands together)
v = cluster_visualizer()
v.append_clusters(previsoes, bebida)
v.append_cluster(medoides, bebida, marker='*', markersize=100)
Example #18
    nx.draw(G, node_color=colors, with_labels=True)
    #plt.show()

    np_MST = np.array(MST)
    line_count = []
    for i in range(noa):
        line = np.count_nonzero(np_MST[i])
        line += np.count_nonzero(np_MST[:, i])
        line_count.append(line)
    '''   
    add_all=0        
    for a in line_count:
        add_all+=a
    '''
    kmedoids_instance = kmedoids(distance_matrix,
                                 [random.randrange(0, noa) for i in range(10)],
                                 data_type='distance_matrix')
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()

    one_edge_in_each_cluster = []

    for cluster in clusters:
        one_edge = []
        for i in cluster:
            if line_count[i] == 1:
                one_edge.append(i)
        one_edge_in_each_cluster.append(one_edge)

    final_10_stocks = []
Example #19
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES
# Load list of points for cluster analysis.
sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
lines = open("t4.8k", "r")
inp = []
for line in lines:
    cords = line.split()
    if len(cords) != 2:
        continue
    inp.append([float(cords[0]), float(cords[1])])

# Set random initial medoids.
initial_medoids = [1, 800, 1400, 672, 763, 926]

# Create instance of K-Medoids algorithm.
kmedoids_instance = kmedoids(inp, initial_medoids)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

# Display clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, inp)
visualizer.show()
Example #20
# define K initial medoids randomly
print('Choosing', K, 'initial medoids randomly...')
start = time.time()
initial_medoids = [
    int(np.random.uniform(0, distance_matrix.shape[0])) for i in range(K)
]
stop = time.time()
print('Random medoids selected', '[', round(stop - start, 2), 'seconds ]')
print('Random medoids are', initial_medoids)

# execute the K-Medoids algorithm
print('Creating K-Medoids instance...')
start = time.time()
kmedoids_instance = kmedoids(distance_matrix,
                             initial_medoids,
                             data_type='distance_matrix')
stop = time.time()
print('Created K-Medoids instance', '[', round(stop - start, 2), 'seconds ]')

# get actual K medoids and clusters
print('Get clusters and medoids using K-Medoids algorithm...')
start = time.time()
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()
stop = time.time()
print('Found clusters and medoids', '[', round(stop - start, 2), 'seconds ]')

print('Final medoids are', medoids)
Example #21
def run_kmedoids(element_maps_with_binary_lead_l, binarized_lead_l_map,
                 num_clusters, num_clusters_in_a_cluster):
    '''
    This function creates 2 clusters based on a binarized lead_l map: one with areas where lead is present,
    and the other with areas where lead is not present (under the threshold level).
    '''
    row = binarized_lead_l_map.shape[0]
    col = binarized_lead_l_map.shape[1]

    combined_maps = np.zeros((row, col))

    for map_name in list(element_maps_with_binary_lead_l.keys()):
        combined_maps = np.dstack(
            (combined_maps, element_maps_with_binary_lead_l[map_name]))

    unnormalized_data = combined_maps[:, :, 1:]
    num_chnl = unnormalized_data.shape[2]

    #this part normalizes each map in the patch
    normalized_data = np.zeros(
        (row, col, num_chnl)
    )  #each patch is normalized with different minimum and maximum values
    for i in range(num_chnl):
        normalized_data[:, :, i] = normalize(unnormalized_data[:, :, i])

    # reshape the data to run k-medoids
    data2D = np.reshape(normalized_data, (row * col, num_chnl))
    #data2D_PCA = PCA(data2D, 1) #reduces  data2D to have 1 dimension only so that it can be given to the kmedoids function
    #data 2d size : 1345410,11
    initial_index_medoids = [1, 30000]
    kmed_round1 = kmed.kmedoids(data2D, initial_index_medoids)
    kmed_round1.process()
    result_1 = kmed_round1.get_clusters()
    classified_result_1 = np.full((row * col), 255)
    for i in range(len(result_1)):
        for j in range(len(result_1[i])):
            classified_result_1[result_1[i][j]] = i
    classified_result_1 = np.reshape(classified_result_1, (row, col))

    cluster_dict = dict()
    pixel_location_dict = dict()

    # stores in each of the two dictionaries above classification information for each pixel in the first
    # round of clustering and the location of the pixel
    for i in range(classified_result_1.shape[0]):
        for j in range(classified_result_1.shape[1]):
            if classified_result_1[i, j] not in cluster_dict:
                cluster_dict[classified_result_1[i, j]] = [
                    [] for num in range(num_chnl)
                ]
                pixel_location_dict[classified_result_1[i, j]] = []
            pixel_location_dict[classified_result_1[i, j]].append([i, j])
            for k in range(num_chnl):
                cluster_dict[classified_result_1[i, j]][k].append(
                    normalized_data[i, j, k])

    #runs the second round of classification on each of the clusters formed from the first classification
    for cluster in list(cluster_dict.keys()):
        cluster_dict[cluster] = np.array(cluster_dict[cluster],
                                         dtype='float32')
        cluster_dict[cluster] = np.transpose(
            cluster_dict[cluster]
        )  #each value for a cluster is a row*col , num_chnl
        clusters_in_a_cluster = cluster_clusters(cluster_dict[cluster],
                                                 cluster,
                                                 num_clusters_in_a_cluster)
        cluster_dict[cluster] = clusters_in_a_cluster

    #builds the classification map based on the second classification results
    for cluster in list(cluster_dict.keys()):
        classification_map = np.full((row, col), 255)
        for i in range(len(cluster_dict[cluster])):
            idx_pair = pixel_location_dict[cluster][i]
            row_idx = idx_pair[0]
            col_idx = idx_pair[1]
            classification_map[row_idx, col_idx] = cluster_dict[cluster][i]
        build_map(classification_map, cluster, num_clusters_in_a_cluster)

    return result_1
Example #22
def compute_kmedoids(bboxes,
                     cls,
                     option='pyclustering',
                     indices=15,
                     max_clusters=35,
                     max_limit=5000):
    print("Performing clustering using", option)
    clustering = [{} for _ in range(indices)]

    bboxes = centralize_bbox(bboxes)

    # subsample the number of bounding boxes so that it can fit in memory and is faster
    if bboxes.shape[0] > max_limit:
        sub_ind = np.random.choice(np.arange(bboxes.shape[0]),
                                   size=max_limit,
                                   replace=False)
        bboxes = bboxes[sub_ind]

    distances_cache = Path('distances_{0}.jbl'.format(cls))
    if distances_cache.exists():
        print("Loading distances")
        dist = joblib.load(distances_cache)
    else:
        dist = compute_distances(bboxes)
        joblib.dump(dist, distances_cache, compress=5)

    if option == 'pyclustering':
        for k in range(indices, max_clusters + 1):
            print(k, "clusters")

            initial_medoids = np.random.choice(bboxes.shape[0],
                                               size=k,
                                               replace=False)

            kmedoids_instance = kmedoids(dist,
                                         initial_medoids,
                                         ccore=True,
                                         data_type='distance_matrix')

            print("Running KMedoids")
            t1 = datetime.now()
            kmedoids_instance.process()
            dt = datetime.now() - t1
            print("Total time taken for clustering {k} medoids: {0}min:{1}s".
                  format(dt.seconds // 60, dt.seconds % 60, k=k))

            medoids_idx = kmedoids_instance.get_medoids()
            medoids = bboxes[medoids_idx]

            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })

    elif option == 'pyclust':

        for k in range(indices, max_clusters + 1):
            print(k, "clusters")
            kmd = KMedoids(n_clusters=k,
                           distance=rect_dist,
                           n_trials=1,
                           max_iter=2)
            t1 = datetime.now()
            kmd.fit(bboxes)
            dt = datetime.now() - t1
            print("Total time taken for clustering {k} medoids: {0}min:{1}s".
                  format(dt.seconds // 60, dt.seconds % 60, k=k))

            medoids = kmd.centers_

            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })

    elif option == 'local':

        for k in range(indices, max_clusters + 1):
            print(k, "clusters")
            curr_medoids, cluster_idxs = kMedoids(dist, k=k)
            medoids = []
            for m in curr_medoids:
                medoids.append(bboxes[m, :])
            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })

    return clustering
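
A hypothetical invocation of the wrapper above; the box array and class label are illustrative, and centralize_bbox / compute_distances are assumed to come from the surrounding module:

import numpy as np

# 1000 illustrative bounding boxes in (x, y, w, h) form.
bboxes = np.random.rand(1000, 4) * 100

clustering = compute_kmedoids(bboxes, cls='car',
                              option='pyclustering',
                              indices=15, max_clusters=20)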
Example #23
def main():
    st.title('Similarity Recommender')
    st.markdown("---")
    st.text("This is a lead generator according to a company's portfolio.")

    Choices = st.sidebar.selectbox(
        "Do you have a client you wish to generate leads from?",
        [" ", "Yes", "No"])

    if Choices == "Yes":
        st.sidebar.title("Lead Generator")
        st.sidebar.markdown("---")
        loading_portfolios = st.sidebar.text('Loading the portfolios...')
        portfolios = load_portfolios()
        loading_portfolios.text(
            'Loading complete!\nNow you can start using the app!')
        portfolio = st.sidebar.selectbox(
            "Select the portfolio of the company you want to look for leads.",
            list(portfolios.keys()))

        if portfolios[portfolio] is not None:
            load_database = st.text('Loading the database...')
            market_ID = load_market()
            load_database.text('Loading complete!')
            st.subheader("Market Database")
            st.dataframe(market_ID.head(5))
            df_target = portfolios[portfolio]
            values = df_target.index.tolist()
            options = df_target['id'].tolist()
            dic = dict(zip(options, values))
            Id = st.selectbox('Choose a client',
                              options,
                              format_func=lambda x: dic[x])
            st.write(" **Id**: " + Id)
            n_top = st.slider(
                'Select the number of leads you want to look for', 0, 5)
            st.text(
                'For showcase purposes the maximum amount of leads was set to 5.'
            )
            if n_top > 0:
                data_load_state = st.text(
                    'Searching for the nearest neighbours, this may take a while...'
                )
                NN_ID, leads = neighbours_search(Id, market_ID, df_target,
                                                 n_top)
                data_load_state.text('Found them!')
                for i in range(0, n_top):
                    st.subheader("Lead " + str(i + 1))
                    st.markdown('**Index**: ' + str(NN_ID.get('index')[i]))

                    st.markdown('**Id**: ' + str(leads[i]))

                    st.markdown('**Dissimilarity**: ' +
                                str(round(NN_ID.get('values')[i], 5)))

    if Choices == "No":
        st.sidebar.title("Cluster Generator")
        st.sidebar.markdown("---")
        loading_portfolios = st.sidebar.text('Loading the portfolios...')
        portfolios = load_portfolios()
        loading_portfolios.text(
            'Loading complete!\nNow you can start using the app!')
        portfolio = st.sidebar.selectbox(
            "Select the portfolio of the company to generate clusters.",
            list(portfolios.keys()))

        if portfolios[portfolio] is not None:
            load_database = st.text('Loading the database...')
            market_ID = load_market()
            load_database.text('Loading complete!')
            st.subheader("Market Database")
            st.dataframe(market_ID.head(5))

            calculating = st.text(
                'Calculating the dissimilarity matrix! This may take a while...'
            )
            dissimilarity_matrix = calculate_distance(portfolios[portfolio])
            calculating.text('Phew, we finally finished the calculus!')
            X = dissimilarity_matrix

            metrics = st.text('Generating plots for evaluation metrics...')

            # creating the lists we'll want to save values to
            medoids_per_k = []  # medoids for each number of clusters
            clusters_per_k = []  # clusters for each number of clusters
            k_scores = []  # average silhouette score of k clusters
            wss = []  # the sum of dissimilarity of each cluster

            random.seed(42)
            for i, k in enumerate([2, 3, 4, 5, 6, 7]):

                # the k-medoids algorithm requires initial medoids, so we pick them here
                initial_medoids_km = random.sample(
                    range(1, portfolios[portfolio].shape[0]), k)

                # Run the K-Medoids algorithm
                km = kmedoids(X,
                              initial_medoids_km,
                              data_type='distance_matrix')
                km.process()

                # saving the created clusters into a list
                clusters_km = km.get_clusters()
                clusters_per_k.append(clusters_km)

                # saving the medoids that were found
                medoids_km = km.get_medoids()

                # saving the medoids that were found per each number of clusters into a list
                medoids_per_k.append(medoids_km)

                # creating a dataframe with the labels of each cluster
                labels_km = pd.Series(0,
                                      index=range(
                                          0, portfolios[portfolio].shape[0]))
                for i in range(0, len(clusters_km)):
                    for n in range(0, len(clusters_km[i])):
                        index = clusters_km[i][n]
                        labels_km.iloc[index] = i

                # getting the sum of the dissimilarity per cluster
                clusters_distances = []
                for n in range(0, len(clusters_km)):
                    clusters_distances.append(X[medoids_km[n]][labels_km[
                        labels_km == n].index].sum())

                # total sum of the dissimilarity
                wss.append(sum(clusters_distances))

                # Get silhouette samples
                silhouette_vals = silhouette_samples(X,
                                                     labels_km,
                                                     metric='precomputed')

                # Silhouette plot
                fig = go.Figure()
                fig.update_layout(title={
                    'text': 'Silhouette plot for ' + str(k) + ' clusters',
                    'x': 0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'
                },
                                  xaxis_title='Silhouette coefficient values',
                                  yaxis_title='Cluster labels',
                                  font=dict(family="Courier New, monospace",
                                            size=16,
                                            color="RebeccaPurple"),
                                  autosize=False,
                                  width=1000,
                                  height=600,
                                  margin=dict(l=50, r=50, b=100, t=100, pad=4),
                                  paper_bgcolor="LightGrey")
                y_lower, y_upper = 0, 0
                annotations = []
                for i, cluster in enumerate(np.unique(labels_km)):
                    cluster_silhouette_vals = silhouette_vals[labels_km ==
                                                              cluster]
                    cluster_silhouette_vals.sort()
                    y_upper += len(cluster_silhouette_vals)

                    fig.add_trace(
                        go.Bar(x=cluster_silhouette_vals,
                               y=np.array((range(y_lower, y_upper))),
                               name=str(i + 1),
                               orientation='h',
                               showlegend=False))

                    annotations.append(
                        dict(x=-0.03,
                             y=(y_lower + y_upper) / 2,
                             text=str(i + 1),
                             showarrow=False))
                    y_lower += len(cluster_silhouette_vals)
                fig.update_layout(annotations=annotations)

                # Get the average silhouette score
                avg_score = np.mean(silhouette_vals)

                # saving the average silhouette score of k clusters in a list
                k_scores.append(avg_score)

                # plotting the average silhouette score
                fig.update_layout(shapes=[
                    dict(type='line',
                         yref='paper',
                         y0=0,
                         y1=1,
                         xref='x',
                         x0=avg_score,
                         x1=avg_score,
                         line=dict(color='green', width=2, dash='dash'))
                ])
                fig.update_yaxes(showticklabels=False)

                # plotting the graphs created in streamlit
                st.plotly_chart(fig)

            fig_wss = go.Figure()
            fig_wss.update_layout(title={
                'text': 'Dissimilarity plot - The Elbow Method',
                'x': 0.5,
                'xanchor': 'center',
                'yanchor': 'top'
            },
                                  xaxis_title='Number of Clusters',
                                  yaxis_title='Dissimilarity',
                                  font=dict(family="Courier New, monospace",
                                            size=16,
                                            color="RebeccaPurple"),
                                  autosize=False,
                                  width=1000,
                                  height=600,
                                  margin=dict(l=50, r=50, b=100, t=100, pad=4),
                                  paper_bgcolor="LightGrey")
            fig_wss.add_trace(
                go.Scatter(x=list(range(2, 8)), y=wss, mode='lines+markers'))
            st.plotly_chart(fig_wss)

            metrics.text("Metrics' plots generated.")

            st.markdown(
                "Now comes the fun part: I am going to challenge you to choose the best "
                "number of clusters!<br/>"
                "However, I am going to help you by giving you a few tips:\n"
                " * The Silhouette Coefficient is bounded between -1 for incorrect clustering "
                "and +1 for highly dense "
                "clustering.<br/>"
                "Scores around zero indicate overlapping clusters.\n"
                " * You'll want to look for a couple of things in the Silhouette plot:\n"
                "   * The plot with the fewest negative values, which represent incorrectly "
                "labeled clients.\n"
                "   * The plot where the clusters have a greater area above the mean silhouette score, "
                "which means denser clusters, in other words closer (more alike) clients.\n"
                " * The elbow method consists of finding an inflection point in the plot. "
                "That is, if you picture a bent arm, you want to look at the point where the elbow is.<br/>\n"
                "I'll help you with an example: from 2 clusters to 3 the dissimilarity drops by 20k,"
                " but from 3 to 4 it only drops by 5k. "
                "This means that from 3 clusters onwards, the dissimilarity 'gained' "
                "by having more clusters isn't significant.",
                unsafe_allow_html=True)
            list_clusters = [0, 2, 3, 4, 5, 6, 7]
            number_clusters = st.selectbox(
                "How many clusters do you want to use?", list_clusters)
            if number_clusters != 0:
                graphics = st.text("Creating shiny plots...")
                # The medoids and clusters lists start at index 0, which holds the
                # 2-cluster result, and end at index 5 (7 clusters), hence the -2.
                medoids = medoids_per_k[number_clusters - 2]
                clusters = clusters_per_k[number_clusters - 2]
                fit_umap = umap.UMAP(n_neighbors=14,
                                     min_dist=0.1,
                                     n_components=3,
                                     metric='dice',
                                     random_state=42)
                p_umap = fit_umap.fit_transform(
                    portfolios[portfolio].drop(columns=['id']))

                # Visualising the clusters

                fig_umap = go.Figure()
                for i in range(0, number_clusters):
                    fig_umap.add_trace(
                        go.Scatter3d(x=p_umap[clusters[i], 0],
                                     y=p_umap[clusters[i], 1],
                                     z=p_umap[clusters[i], 2],
                                     name='Cluster ' + str(i),
                                     mode='markers'))

                fig_umap.add_trace(
                    go.Scatter3d(x=p_umap[medoids, 0],
                                 y=p_umap[medoids, 1],
                                 z=p_umap[medoids, 2],
                                 name='Medoids',
                                 mode='markers',
                                 marker_color="rgb(255,255,0)",
                                 marker=dict(size=16)))

                fig_umap.update_layout(title={
                    'text': 'Clusters with the Dice Distance',
                    'x': 0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'
                },
                                       font=dict(
                                           family="Courier New, monospace",
                                           size=16,
                                           color="RebeccaPurple"),
                                       autosize=False,
                                       width=1000,
                                       height=600,
                                       margin=dict(l=50,
                                                   r=50,
                                                   b=100,
                                                   t=100,
                                                   pad=4))

                st.plotly_chart(fig_umap)

                fit_umap_man = umap.UMAP(n_neighbors=14,
                                         min_dist=0.1,
                                         n_components=3,
                                         metric='manhattan',
                                         random_state=42)
                p_umap_man = fit_umap_man.fit_transform(
                    portfolios[portfolio].drop(columns=['id']))

                fig_umap_man = go.Figure()
                for i in range(0, number_clusters):
                    fig_umap_man.add_trace(
                        go.Scatter3d(x=p_umap_man[clusters[i], 0],
                                     y=p_umap_man[clusters[i], 1],
                                     z=p_umap_man[clusters[i], 2],
                                     name='Cluster ' + str(i),
                                     mode='markers'))

                fig_umap_man.add_trace(
                    go.Scatter3d(x=p_umap_man[medoids, 0],
                                 y=p_umap_man[medoids, 1],
                                 z=p_umap_man[medoids, 2],
                                 name='Medoids',
                                 mode='markers',
                                 marker_color="rgb(255,255,0)",
                                 marker=dict(size=16)))

                fig_umap_man.update_layout(title={
                    'text': 'Clusters with the Manhattan Distance',
                    'x': 0.5,
                    'xanchor': 'center',
                    'yanchor': 'top'
                },
                                           font=dict(
                                               family="Courier New, monospace",
                                               size=16,
                                               color="RebeccaPurple"),
                                           autosize=False,
                                           width=1000,
                                           height=600,
                                           margin=dict(l=50,
                                                       r=50,
                                                       b=100,
                                                       t=100,
                                                       pad=4))

                st.plotly_chart(fig_umap_man)
                graphics.text('3D clusters visualization complete!')
                st.markdown(
                    "**Developer's notes**: <br/>UMAP doesn't have the Gower distance built in,"
                    " however it has the Dice and Manhattan distances, "
                    "which are the distances used by the Gower distance.<br/>"
                    "So I have shown the 3D visualization using both distances instead of the "
                    "distance used to find the clusters.<br/>"
                    "A future development would be coding the Gower distance as a custom "
                    "distance in the UMAP method.",
                    unsafe_allow_html=True)

                selection = st.selectbox(
                    'Choose a representative client (medoid)', medoids)
                Id = portfolios[portfolio].loc[selection, 'id']
                st.write("**Client " + str(selection) + " ID:** " + Id)
                n_top = st.slider(
                    'Select the number of leads you want to look for', 0, 5)
                st.text(
                    'For showcase purposes the maximum amount of leads was set to 5.'
                )
                df_target = portfolios[portfolio]
                if n_top > 0:
                    data_load_state = st.text(
                        'Searching for the nearest neighbours, this may take a while...'
                    )
                    NN_ID, leads = neighbours_search(Id, market_ID, df_target,
                                                     n_top)
                    data_load_state.text('Found them!')
                    for i in range(0, n_top):
                        st.subheader("Lead " + str(i + 1))
                        st.markdown('**Index**: ' + str(NN_ID.get('index')[i]))

                        st.markdown('**Id**: ' + str(leads[i]))

                        st.markdown('**Dissimilarity**: ' +
                                    str(round(NN_ID.get('values')[i], 5)))

    st.sidebar.title("Useful Links")
    st.sidebar.markdown("---")
    st.sidebar.markdown("[Github]"
                        "(https://github.com/Rpinto02/Similarity_Recommender)")
    st.sidebar.markdown("[Linkedin]" "(https://www.linkedin.com/in/rpinto02/)")
    st.sidebar.markdown("[Codenation]" "(https://codenation.dev)")
Example #24
    def fit(self):
        Final_cluster = []
        Temp_cluster = []
        ToCheck_cluster = []
        #threshold=0.9529#0.01 #0.5
        K = 4  #int(y.max()-y.min()/threshold)
        Final_medoids = []
        Check_medoids = []
        Temp_medoids = []

        kmedoids_instance = kmedoids(self.X.values,
                                     self.initial_medoids,
                                     ccore=False,
                                     data_type='distance_matrix')
        # run cluster analysis and obtain results
        kmedoids_instance.process()
        ToCheck_cluster = kmedoids_instance.get_clusters()
        Check_medoids = kmedoids_instance.get_medoids()
        OC = []

        for i in range(len(Check_medoids)):
            STD = np.std(self.Y.iloc[ToCheck_cluster[i]])  #it is a number
            if STD <= self.threshold:
                Final_cluster.append(ToCheck_cluster[i])
                Final_medoids.append(Check_medoids[i])
            else:
                Temp_cluster.append(ToCheck_cluster[i])
                Temp_medoids.append(Check_medoids[i])
        ToCheck_cluster = Temp_cluster
        Check_medoids = Temp_medoids

        while ToCheck_cluster:
            L = len(ToCheck_cluster)
            Temp_cluster = []
            Temp_medoids = []
            for i in range(0, L):
                members = ToCheck_cluster[i]
                if len(members) == 0:
                    continue
                if len(members) <= 2:
                    Final_cluster.append(members)
                    Final_medoids.append(Check_medoids[i])
                    continue

                OC = self.Y.iloc[members]
                STD = np.std(OC)
                if STD <= self.threshold:
                    Final_cluster.append(members)
                    Final_medoids.append(Check_medoids[i])
                else:
                    data = self.X.iloc[members, members]
                    # note: random.sample assumes len(members) >= K
                    new_medoids = random.sample(range(len(members)), K)
                    kmedoids_instance = kmedoids(data.values,
                                                 new_medoids,
                                                 ccore=False,
                                                 data_type='distance_matrix')
                    # run cluster analysis and obtain results
                    kmedoids_instance.process()
                    cluster = kmedoids_instance.get_clusters()

                    # map local cluster indices back to indices in the full data set
                    for ci in range(len(cluster)):
                        for j in range(len(cluster[ci])):
                            cluster[ci][j] = members[cluster[ci][j]]
                        Temp_cluster.append(cluster[ci])

                    medoids = kmedoids_instance.get_medoids()
                    for mi in range(len(medoids)):
                        Temp_medoids.append(members[medoids[mi]])

            ToCheck_cluster = Temp_cluster
            Check_medoids = Temp_medoids

        return Final_medoids, Final_cluster
Example #25
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

fig, axs = plt.subplots(2, 4, figsize=(14, 10))

colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen']

# X is assumed to be defined earlier in the notebook as an (n, 2) array.
y_means = np.zeros(len(X))

for ncenters, ax in enumerate(axs.reshape(-1), 1):
    initial_medoids = [i for i in range(ncenters)]
    kmedoids_instance = kmedoids(X, initial_medoids)
    # Run cluster analysis and obtain results.
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    # Show allocated clusters.
    medoids = kmedoids_instance.get_medoids()
    for i in range(len(clusters)):
        for j in range(len(clusters[i])):
            x_index = clusters[i][j]
            y_means[x_index] = i
    # Plot the points colored by cluster
    ax.set_title('Centers = {0}'.format(ncenters))
    ax.scatter(X[:, 0], X[:, 1], c=y_means, s=50, cmap='viridis')
    
    centers = []
    for i in range(len(medoids)):
Ejemplo n.º 26
0
    # Feature extraction for each image
    #compute_BOW_descriptors()

    # Cluster images with kmedoids
    X = pd.read_csv(os.path.join(competitors_dir, "bow_images.pd"),
                    index_col=0)

    # Select interesting images
    with open(COCO_train_graphs_subset_json_path) as f:
        graphs = json.load(f)
    selected_names = [f"{g['graph']['name']:012d}.jpg" for g in graphs]
    X = X.loc[selected_names]

    K = 9
    # Draw K distinct rows as initial medoids (duplicates would collapse clusters).
    km = kmedoids(X.to_numpy(), np.random.choice(len(X), K, replace=False))
    start_time = datetime.now()
    print("Start clustering process.")
    km.process()
    med = km.get_medoids()
    end_time = datetime.now()
    print('Done. Duration: ' + str(end_time - start_time))

    images = []
    for m in med:
        img = X.iloc[m].name
        images.append(img)
    print(images)

    with open(os.path.join(competitors_dir, out_file), 'w') as f:
        for el in images:
Ejemplo n.º 27
0
    def testCoreInterfaceIntInputData(self):
        kmedoids_instance = kmedoids([[1], [2], [3], [20], [21], [22]], [2, 5],
                                     0.025, True)
        kmedoids_instance.process()
        assert len(kmedoids_instance.get_clusters()) == 2
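
# For readability: the two positional arguments after the medoid list are the
# stopping tolerance and the C-core flag, so an equivalent keyword form of the
# constructor call in this test would be:
kmedoids_instance = kmedoids([[1], [2], [3], [20], [21], [22]], [2, 5],
                             tolerance=0.025, ccore=True)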
Ejemplo n.º 28
0
import matplotlib.pyplot as plt
from scipy import stats
import sklearn.datasets as datasets
from sklearn.metrics import accuracy_score, confusion_matrix

# pip install pyclustering
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer

iris = datasets.load_iris()
data = iris.data[:, 0:2]
classes = iris.target

# [:, 0:2] -> columns 0 and 1 only
# 3, 12, 20: row indices suggested as the initial medoids
model_cluster = kmedoids(data, [3, 12, 20])
model_cluster.get_medoids()  # still the initial choice: [3, 12, 20]
model_cluster.process()

# predicted clusters: one list of row indices per cluster
previsoes = model_cluster.get_clusters()

# indices of the medoids found after convergence
medoids = model_cluster.get_medoids()  # [7, 67, 112]

v = cluster_visualizer()
v.append_clusters(previsoes, data)
v.append_cluster(medoids, data=data, marker='*', markersize=15)
v.show()

n_lst = []
Ejemplo n.º 29
0
 def __init__(self, data, initial_medoids):
     self.kmedoids_ = kmedoids(data, initial_medoids)
Ejemplo n.º 30
0
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer, cluster_visualizer_multidim
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES, FAMOUS_SAMPLES
# Load list of points for cluster analysis.
# sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
sample = read_sample(FAMOUS_SAMPLES.SAMPLE_IRIS)
inp = []
with open("t4.8k", "r") as lines:
    for line in lines:
        cords = line.split()
        if len(cords) != 2:
            continue
        inp.append([float(cords[0]), float(cords[1])])

# Set random initial medoids.
# initial_medoids = [1, 800, 1400, 672, 763, 926]
initial_medoids = [1, 500]

# Create instance of K-Medoids algorithm.
# kmedoids_instance = kmedoids(inp, initial_medoids)
kmedoids_instance = kmedoids(sample, initial_medoids)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

# Display clusters.
visualizer = cluster_visualizer_multidim()
# visualizer.append_clusters(clusters, inp)
visualizer.append_clusters(clusters, sample)
visualizer.show()
Ejemplo n.º 31
0
# Imports assumed by this excerpt; CURRENT_CLUSTER_SIZE and NUMBER_CLUSTERS
# are constants defined elsewhere in the original script.
import pickle
import numpy as np
from scipy.spatial import distance
from scipy.special import softmax, rel_entr
from sklearn.metrics import adjusted_rand_score
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import timedcall

rng = np.random.default_rng()  # assumed; the original may seed this differently


def process_kmedoids(sample):
    instance = kmedoids(sample, [ CURRENT_CLUSTER_SIZE * multiplier for multiplier in range(NUMBER_CLUSTERS) ])
    (ticks, _) = timedcall(instance.process)
    return ticks
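
# A possible driver for the helper above; CURRENT_CLUSTER_SIZE and
# NUMBER_CLUSTERS get illustrative values here so the snippet runs standalone.
from random import random

CURRENT_CLUSTER_SIZE = 50
NUMBER_CLUSTERS = 4

# Four well-separated 1-D blobs of CURRENT_CLUSTER_SIZE points each, laid out
# so the initial medoid indices (0, 50, 100, 150) land one per blob.
sample = [[random() + 3 * c] for c in range(NUMBER_CLUSTERS)
          for _ in range(CURRENT_CLUSTER_SIZE)]
print("process() took", process_kmedoids(sample), "ticks")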
def main(args):
    with open(
            "saved/{0}_{1}_{2}".format(args.name, args.suffix,
                                       args.cluster_label), "rb") as handle:
        plot_data = pickle.load(handle)

    K = len(plot_data["label"])

    # print("full data len: ", K)
    print("dataset name: ", args.name)
    print("approach: ", args.approach)

    tsne_data = np.hstack([
        np.array(plot_data["x"]).reshape(-1, 1),
        np.array(plot_data["y"]).reshape(-1, 1)
    ])

    softmax_pca_data = softmax(plot_data["pca"], axis=1)
    softmax_tsne_data = softmax(tsne_data, axis=1)

    dp = [[0.0 for j in range(K)] for i in range(K)]
    for i in range(K):
        for j in range(i + 1, K):

            if args.approach == "euclid-pca":
                dist = distance.euclidean(plot_data["pca"][i],
                                          plot_data["pca"][j])
            elif args.approach == "euclid-tsne":
                dist = distance.euclidean(tsne_data[i], tsne_data[j])
            elif args.approach == "cosine-pca":
                dist = distance.cosine(plot_data["pca"][i],
                                       plot_data["pca"][j])
            elif args.approach == "cosine-tsne":
                dist = distance.cosine(tsne_data[i], tsne_data[j])
            elif args.approach == "kldiv-pca":
                dist = sum(
                    rel_entr(softmax_pca_data[i], softmax_pca_data[j]) +
                    rel_entr(softmax_pca_data[j], softmax_pca_data[i]))
            elif args.approach == "kldiv-tsne":
                dist = sum(
                    rel_entr(softmax_tsne_data[i], softmax_tsne_data[j]) +
                    rel_entr(softmax_tsne_data[j], softmax_tsne_data[i]))

            dp[i][j] = dp[j][i] = dist
    print("created dist matrix")

    labels = plot_data["label"][:K]

    cluster_count = len(set(labels))
    print("num clusters: ", cluster_count)
    inits = rng.choice(K, size=cluster_count, replace=False)
    # print("cluster inits:", inits)
    print("max iterations: ", args.itermax)
    km_instance = kmedoids(dp,
                           inits,
                           data_type="distance_matrix",
                           itermax=args.itermax)
    # print("running kmedoids")
    km_instance.process()
    # print("getting clusters")
    clusters = km_instance.get_clusters()
    predicts = [-1 for i in range(K)]
    for index, clust in enumerate(clusters):
        for pt in clust:
            predicts[pt] = index

    # print("cluster allocations: ", clusters)
    # print("predictions: ", predicts)
    # print("true labels: ", labels)
    score = adjusted_rand_score(labels, predicts)
    print("adj. rand score: ", score)
    return score
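
# main(args) only reads five attributes (name, suffix, cluster_label,
# approach, itermax). A plausible entry point, with the flag names assumed
# (only the attribute names are fixed by main itself):
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", required=True)
    parser.add_argument("--suffix", required=True)
    parser.add_argument("--cluster-label", dest="cluster_label", required=True)
    parser.add_argument("--approach", default="euclid-pca",
                        choices=["euclid-pca", "euclid-tsne", "cosine-pca",
                                 "cosine-tsne", "kldiv-pca", "kldiv-tsne"])
    parser.add_argument("--itermax", type=int, default=200)
    main(parser.parse_args())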
Ejemplo n.º 33
0
    def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                        expected_cluster_length, metric,
                                        ccore_flag, **kwargs):
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        testing_result = False
        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(
                    sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data,
                                         initial_medoids,
                                         0.001,
                                         ccore=ccore_flag,
                                         metric=metric,
                                         data_type=data_type,
                                         itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            # Reject attempts that produced an empty cluster.
            if any(len(cluster) == 0 for cluster in clusters):
                continue

            if expected_cluster_length is not None:
                obtained_cluster_sizes.sort()
                expected_cluster_length.sort()
                if obtained_cluster_sizes != expected_cluster_length:
                    continue

            testing_result = True

        assertion.true(testing_result)
Ejemplo n.º 34
0
for i in np.arange(m.shape[0]):
    allfeatures[df3['product_title'].
                agg(lambda x: sum([y == m[i] for y in x.split()]) > 0), i] = 1

df4 = df3.iloc[:, 0:1]
Complete_data = pd.concat([df4, pd.DataFrame(allfeatures)], axis=1)

cm = Complete_data.values.tolist()
initial_medoids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
#initial_medoids=[1,2,3,4,5]

#metric=distance.euclidean
metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
#metric = DistanceMetric.get_metric('')

kmedoids_instance = kmedoids.kmedoids(cm, initial_medoids, metric=metric)
kmedoids_instance.process()
cl = kmedoids_instance.get_clusters()

# Wrap the first six clusters in DataFrames tagged with their cluster label.
labeled = []
for label, cluster in enumerate(cl[:6]):
    frame = pd.DataFrame(cluster)
    frame['label'] = label
    labeled.append(frame)
zero, one, two, three, four, five = labeled
Ejemplo n.º 35
0
Seeds = [
    5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700,
    800, 900, 1000
]
for Nseeds in Seeds:
    print("----" + str(Nseeds) + "----")
    #
    initial_medoids = random.sample(range(len(names_dt)), Nseeds)  # distinct, in-range indices
    #
    metric = distance_metric(type_metric.USER_DEFINED,
                             func=descriptors_similarity)
    #
    #initial_medoids = kmeans_plusplus_initializer(dt, 8).initialize(return_index=True)
    # Create an instance of the K-Medoids algorithm with the prepared medoids.
    kmedoids_instance = kmedoids(dt,
                                 initial_medoids,
                                 metric=metric,
                                 itermax=10)
    # Run cluster analysis and obtain results.

    start = time.time()
    kmedoids_instance.process()
    end = time.time()
    print(end - start)
    #
    clusters = kmedoids_instance.get_clusters()
    final_medoids = kmedoids_instance.get_medoids()
    #final_centers = kmedoids_instance.get_centers()
    # run cluster analysis and obtain results
    #
    #names_dt[clusters[0]]
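
# descriptors_similarity is defined elsewhere in the original script; for a
# USER_DEFINED metric, any callable taking two points and returning a
# non-negative distance works. A minimal stand-in to make the loop runnable:
import numpy as np
from pyclustering.utils.metric import distance_metric, type_metric

def descriptors_similarity(a, b):
    # Stand-in: Euclidean distance between two descriptor vectors.
    return float(np.linalg.norm(np.asarray(a) - np.asarray(b)))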
Ejemplo n.º 36
0
def create_clusters(df,
                    keys,
                    score,
                    resp,
                    ncluster=20,
                    w=None,
                    type='kmedoids',
                    tolerance=0.001):
    from pyclustering.cluster.kmedoids import kmedoids
    from sklearn.cluster import AgglomerativeClustering

    grouped = df.groupby(keys)[score].mean()

    if w is not None:
        grouped_w = df.groupby(keys)[w].sum()
        df_vs = keys + [resp, w]
    else:
        w = 'count'
        grouped_w = df.groupby(keys)[score].count()
        grouped_w.name = w
        df_vs = keys + [resp]

    if type == 'kmedoids':
        calculate_init = pd.concat([grouped, grouped_w], axis=1)
        calculate_init['index'] = list(range(len(grouped)))
        calculate_init = calculate_init.sort_values(by=score)
        calculate_init['cw'] = calculate_init[w].cumsum().div(
            calculate_init[w].sum())
        quantiles = np.linspace(0, 1, ncluster + 2)[1:-1]
        init_centroid = list(
            map(
                lambda x: calculate_init[calculate_init['cw'] > x]['index'].
                iloc[0], quantiles))
        clustering = kmedoids(grouped.values.reshape(-1, 1).tolist(),
                              init_centroid,
                              tolerance=tolerance)
        clustering.process()
        clusters = clustering.get_clusters()
        cluster_mapping = {
            index: n
            for n, instance in enumerate(clusters) for index in instance
        }

    elif type == 'ward':
        ff = np.average  #lambda x: np.average(x, w=df[w].iloc[x.index])
        clusters = AgglomerativeClustering(n_clusters=ncluster,
                                           pooling_func=ff)
        cluster_values = clusters.fit_predict(grouped.values.reshape(-1, 1))
        cluster_mapping = dict(zip(range(0, len(grouped)), cluster_values))
        clusters = range(0, ncluster)

    grouped = grouped.to_frame().reset_index()
    grouped['cluster'] = grouped.index.map(
        lambda x: cluster_mapping.get(x, None))
    merged = pd.merge(df[df_vs],
                      grouped,
                      left_on=keys,
                      right_on=keys,
                      how='left').reset_index(drop=False)
    reoder_cluster = {
        i: n
        for n, i in enumerate(
            merged.groupby('cluster')[resp].aggregate(lambda x: np.average(
                x, weights=merged.loc[x.index, w])).sort_values().index)
    }
    return merged['cluster'].map(reoder_cluster).values
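
# A toy invocation of create_clusters with made-up column names, using an
# explicit weight column (the w=None path derives a 'count' weight instead):
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "segment": np.repeat(list("abcdefghij"), 20),   # grouping key, 10 groups
    "score": np.random.rand(200),                   # model score
    "resp": np.random.randint(0, 2, 200),           # observed response
    "wt": np.ones(200),                             # row weights
})
labels = create_clusters(df, keys=["segment"], score="score",
                         resp="resp", ncluster=3, w="wt")
print(len(labels), labels[:10])                     # one ordered cluster id per row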
def build_clusterer(data, nclusters, method, **kwargs):
    """
  A simple wrapper to various clustering approaches.
  Cluster the given data into nclusters by using the
  specified method. Depending on the specified method
  different packages may be required and different
  argumens are expected in the kwargs dict.
  """

    features = kwargs["config"]["features"]
    windows = []

    print("{0} cluster features used {1}".format(INFO, features))

    for window in data:

        window_data = window.get_rd_stats(statistics="all")
        window_values = []

        for feature in features:
            window_values.append(window_data[0][feature])
            window_values.append(window_data[1][feature])

        windows.append(window_values)

    if method == "kmeans":

        from sklearn.cluster import KMeans
        clusterer = KMeans(n_clusters=nclusters)

        clusterer.fit(windows)
        return clusterer
    elif method == "kmedoids":

        from pyclustering.cluster.kmedoids import kmedoids

        metric = get_distance_metric(
            dist_metric=kwargs["config"]["metric"].upper(),
            degree=kwargs["config"]["metric_degree"]
            if 'metric_degree' in kwargs["config"] else 0)

        initial_index_medoids = []
        if kwargs["config"]["init_cluster_idx"] == "random_from_data":
            import random

            for c in range(nclusters):
                idx = random.randint(0, len(windows) - 1)

                if idx in initial_index_medoids:

                    # try ten times before quiting
                    for time in range(10):
                        idx = random.randint(0, len(windows) - 1)

                        if idx in initial_index_medoids:
                            continue
                        else:
                            initial_index_medoids.append(idx)
                            break

                else:
                    initial_index_medoids.append(idx)
        else:
            initial_index_medoids = kwargs["config"]["init_cluster_idx"]

        clusterer = kmedoids(data=windows,
                             initial_index_medoids=initial_index_medoids,
                             metric=metric)
        clusterer.process()
        return clusterer, initial_index_medoids

    raise ValueError("Invalid clustering method: " + method)
Ejemplo n.º 38
0
# k-medoids is a clustering algorithm whose defining trait is that cluster
# centers are initialized on, and remain, real data points, unlike k-means,
# where centers can be arbitrary points in space.

from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer

# Load the data.
iris = datasets.load_iris()
# Configure the k-medoids parameters. Only the first two columns of the
# dataset are used, purely to keep the cluster plots readable (switch to all
# four features if you prefer).
# 3, 12 and 20 are arbitrary record indices used to initialize the medoids.
cluster = kmedoids(
    iris.data[:, 0:2], [3, 12, 20]
)  # [:, 0:2] slices columns 0 up to, but not including, 2: columns 0 and 1
# Inspect the chosen starting records (3, 12, 20); clustering starts from them.
cluster.get_medoids()

# Run the algorithm, obtain the predictions (the group of each record) and the medoids.
cluster.process()
previsoes = cluster.get_clusters()
medoides = cluster.get_medoids()
# A list of 3 elements holding the record indices of each cluster.
previsoes  # three plain index lists; no 0/1 membership or probability vector as in c-means

# Visualize the grouping: plot the clusters together with their medoids.
v = cluster_visualizer()
v.append_clusters(previsoes, iris.data[:, 0:2])
v.append_cluster(medoides, data=iris.data[:, 0:2], marker='*', markersize=20)
Ejemplo n.º 40
0
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer

iris = datasets.load_iris()

cluster = kmedoids(iris.data[:, 0:2], [3, 12, 20])
cluster.get_medoids()
cluster.process()
previsoes = cluster.get_clusters()
medoides = cluster.get_medoids()

v = cluster_visualizer()
v.append_clusters(previsoes, iris.data[:, 0:2])
v.append_cluster(medoides, data = iris.data[:, 0:2], marker = '*', markersize = 15)
v.show()

lista_previsoes = []
lista_real = []
for i in range(len(previsoes)):
    print('----')
    print(i)
    print('----')
    for j in range(len(previsoes[i])):
        #print(j)
        print(previsoes[i][j])
        lista_previsoes.append(i)
        lista_real.append(iris.target[previsoes[i][j]])
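
# The excerpt stops before scoring. With the two lists built above, the
# imported confusion_matrix applies directly (cluster ids are arbitrary, so
# rows and columns may come out permuted relative to the true classes):
matriz = confusion_matrix(lista_real, lista_previsoes)
print(matriz)
# To report accuracy you would first align cluster ids with classes, e.g. by
# taking the majority class inside each cluster.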
        
Ejemplo n.º 41
0
def main():
    argparse.ArgumentParser(description="P4Fuzz Bugs Tamer")

    cnx = mysql.connector.connect(user="******",
                                  password="******",
                                  host="localhost",
                                  database="fuzzer")
    cursor = cnx.cursor()

    caseIds = []
    caseErrors = []

    cursor.execute("DELETE FROM tamed_bugs")
    cnx.commit()

    cursor.execute("SELECT id, error FROM bugs WHERE id < 3200")
    for (id, error) in cursor:
        caseIds.append(id)
        caseErrors.append(str(error))

    dt = datetime.now()
    print "Loading data from database..."

    dt2 = datetime.now()
    diff = dt2 - dt
    print str(diff.total_seconds() * 1000) + " Loaded data from database"

    dist_tuple = p4fuzzclib.calc_distance_matrix(caseErrors)
    dist = [list(x) for x in dist_tuple]

    dt3 = datetime.now()
    diff = dt3 - dt2
    print str(diff.total_seconds()) + " Calculated distances using token"

    # dists = [edit_distance(caseErrors[i], caseErrors[j])
    # 		for i in range(1, len(caseErrors))
    # 		for j in range(0, i)]
    #
    # dt7 = datetime.now()
    # diff = dt7 - dt3
    # print(str(diff.total_seconds()) + " Calculated distances using lev")

    # sys.exit()

    initial_medoids = [0, len(caseErrors) - 1]

    kmedoids_instance = kmedoids(dist,
                                 initial_medoids,
                                 10,
                                 data_type='distance_matrix')
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()

    print "Clustered #1 ..."

    has_large = True
    cnt = 1
    while has_large:
        cnt += 1
        has_large = False
        for i, cluster in enumerate(clusters):
            medoid = medoids[i]
            medoid_distances = dist[medoid]
            max_points = p4fuzzclib.calc_max_distance_cluster(
                dist_tuple, cluster)
            max_dist = dist[max_points[0]][max_points[1]]
            if max_dist > 60:
                has_large = True
                new_medoid = max_points[0] if medoid_distances[max_points[
                    0]] > medoid_distances[max_points[1]] else max_points[1]
                initial_medoids = medoids
                initial_medoids.append(new_medoid)
                kmedoids_instance = kmedoids(dist,
                                             initial_medoids,
                                             100,
                                             data_type='distance_matrix')
                kmedoids_instance.process()
                clusters = kmedoids_instance.get_clusters()
                medoids = kmedoids_instance.get_medoids()
            else:
                print "Cluster " + str(i) + ": " + str(max_dist)
        print "Clustered #" + str(cnt) + " ..."

    dt4 = datetime.now()
    diff = dt4 - dt3
    print str(diff.total_seconds() * 1000) + " Clustering finished"

    for i, cluster in enumerate(clusters):
        medoid = medoids[i]
        for error_index in cluster:
            is_medoid = (medoid == error_index)
            data = (caseIds[error_index], i, is_medoid)
            cursor.execute(
                "INSERT INTO tamed_bugs (`bug_id`, `cluster`, `is_medoid`) VALUES (%s, %s, %s)",
                data)
            cnx.commit()

    dt5 = datetime.now()
    diff = dt5 - dt4
    print(str(diff.total_seconds() * 1000) +
          " Tamed bug clusters inserted into the database. All done!")

    dt6 = datetime.now()
    diff = dt6 - dt
    print "Total time: " + str(diff.total_seconds())
Ejemplo n.º 42
0
def run_test_clustered(
    classes,
    rounds,
    n_aug_sample_points,
    n_train,
    n_jobs,
    cv,
    use_GPU,
    batch_size,
    dataset,
    aug_transformation,
    aug_kw_args,
    logistic_reg__C,
    CNN_extractor_max_iter,
    use_loss,
    experiment_configs,
    results_filename,
    model_filename,
    n_clusters,
    cluster_type="kmeans",
    #cluster_type="kmedoids",
):
    run_params = {
        "classes": classes,
        "rounds": rounds,
        "n_aug_sample_points": n_aug_sample_points,
        "n_train": n_train,
        "n_jobs": n_jobs,
        "cv": cv,
        "use_GPU": use_GPU,
        "batch_size": batch_size,
        "dataset": dataset.name,
        "aug_transformation": aug_transformation.name,
        "aug_kw_args": aug_kw_args,
        "logistic_reg__C": logistic_reg__C,
        "CNN_extractor_max_iter": CNN_extractor_max_iter,
        "use_loss": use_loss,
        "experiment_configs": experiment_configs,
        "results_filename": results_filename,
        "model_filename": model_filename,
        "n_clusters": n_clusters,
        "cluster_type": cluster_type,
    }

    pprint.pprint(run_params)

    assert n_aug_sample_points

    (x_train, y_train), (x_test, y_test) = experiments_util.prepare_dataset(
        dataset,
        classes,
        n_train,
    )
    print("Train class breakdown: {}".format(
        np.unique(y_train, return_counts=True)))
    print("Test class breakdown: {}".format(
        np.unique(y_test, return_counts=True)))

    aug_f = augmentations.get_transformation(aug_transformation)
    (orig_and_auged_x_train,
     orig_and_auged_y_train,
     orig_and_auged_idxs_train) = \
        experiments_util.poison_dataset(x_train,
                                        y_train,
                                        aug_f,
                                        aug_kw_args)
    (orig_and_auged_x_test,
     orig_and_auged_y_test,
     orig_and_auged_idxs_test) = \
        experiments_util.poison_dataset(x_test,
                                        y_test,
                                        aug_f,
                                        aug_kw_args)
    print("x_train", x_train.shape)
    print("orig_and_auged_x_train", orig_and_auged_x_train.shape)

    feature_clf = featurized_classifiers.build_featurized_ResNet_feature_clf(
        CNN_extractor_max_iter,
        use_GPU,
        batch_size,
        model_filename,
    )

    @mem.cache
    def transform_features(x, y, model_filename):
        # We need model filename to invalidate cache on model change
        return feature_clf.fit_transform(x, y=y)

    featurized_x_train = transform_features(
        x=x_train,
        y=y_train,
        model_filename=model_filename,
    )
    featurized_y_train = y_train
    featurized_x_test = transform_features(
        x=x_test,
        y=y_test,
        model_filename=model_filename,
    )
    featurized_y_test = y_test
    orig_and_auged_featurized_x_train = transform_features(
        x=orig_and_auged_x_train,
        y=orig_and_auged_y_train,
        model_filename=model_filename,
    )
    orig_and_auged_featurized_y_train = orig_and_auged_y_train
    orig_and_auged_featurized_x_train_to_source_idxs = \
        orig_and_auged_idxs_train
    orig_and_auged_featurized_x_test = transform_features(
        x=orig_and_auged_x_test,
        y=orig_and_auged_y_test,
        model_filename=model_filename,
    )
    orig_and_auged_featurized_y_test = orig_and_auged_y_test
    orig_and_auged_featurized_x_test_to_source_idxs = orig_and_auged_idxs_test

    if cluster_type == "kmeans":
        clustering_clf = sklearn.cluster.KMeans(n_clusters=n_clusters)
        train_cluster_IDs = clustering_clf.fit_predict(featurized_x_train)
        test_cluster_IDs = clustering_clf.predict(featurized_x_test)
    elif cluster_type == "kmedoids":
        from pyclustering.cluster.kmedoids import kmedoids
        from pyclustering.utils import timedcall
        import scipy.spatial
        # Using some code from kmedoids_examples.py from pyclustering
        # KMeans.fit_predict returns one label per sample, not medoid indices;
        # instead, seed each medoid with the training point nearest to the
        # corresponding k-means center.
        clustering_clf = sklearn.cluster.KMeans(n_clusters=n_clusters)
        clustering_clf.fit(featurized_x_train)
        init_medoids = [
            int(np.argmin(np.linalg.norm(featurized_x_train - center, axis=1)))
            for center in clustering_clf.cluster_centers_
        ]
        #init_medoids = np.random.choice(len(featurized_x_train),
        #                                n_clusters,
        #                                replace=False)
        tolerance = 0.25
        kmedoids_instance = kmedoids(featurized_x_train, init_medoids,
                                     tolerance)
        (ticks, result) = timedcall(kmedoids_instance.process)  # Run
        medoid_idxs = kmedoids_instance.get_medoids()  # indices into the training set
        medoid_points = featurized_x_train[medoid_idxs]
        # Assign every point the ID of its nearest medoid via a KD-tree.
        tree = scipy.spatial.cKDTree(medoid_points)
        _, train_cluster_IDs = tree.query(featurized_x_train, 1)
        _, test_cluster_IDs = tree.query(featurized_x_test, 1)
    else:
        raise ValueError("Unknown cluster_type: {}".format(cluster_type))
    print("Train cluster IDs: {}".format(train_cluster_IDs))
    print("Test cluster IDs: {}".format(test_cluster_IDs))

    clf = featurized_classifiers.build_logistic_reg_clf(
        logistic_reg__C,
        cv,
    )

    svm__C = [0.01, 0.1, 1, 10, 100]
    svm_cv = 4
    is_SV = experiments_util.get_SV_raw(featurized_x_train, featurized_y_train,
                                        CNN_extractor_max_iter, use_GPU,
                                        batch_size, svm__C, svm_cv, n_jobs)
    SVM_losses = experiments_util.get_SVM_losses_raw(featurized_x_train,
                                                     featurized_y_train,
                                                     CNN_extractor_max_iter,
                                                     use_GPU, batch_size,
                                                     svm__C, svm_cv, n_jobs)
    print("Number of support vectors is: {}".format(np.sum(is_SV)))
    SV_idxs = np.where(is_SV)[0]
    orig_and_SV_idxs = np.concatenate([SV_idxs, [-1]])
    print("orig_and_SV_idxs", orig_and_SV_idxs)
    print("orig_and_SV_idxs", orig_and_SV_idxs.shape)
    SV_orig_and_auged_mask = np.isin(orig_and_auged_idxs_train,
                                     orig_and_SV_idxs)
    print("SV_orig_and_auged_mask count {}/{}".format(
        np.sum(SV_orig_and_auged_mask),
        len(SV_orig_and_auged_mask),
    ))
    SV_x_train = orig_and_auged_featurized_x_train[SV_orig_and_auged_mask]
    SV_y_train = orig_and_auged_featurized_y_train[SV_orig_and_auged_mask]
    clf.fit(SV_x_train, SV_y_train)
    VSV_acc = clf.score(orig_and_auged_featurized_x_test,
                        orig_and_auged_featurized_y_test)
    print("VSV acc: {}".format(VSV_acc))

    np_data_dict = {
        "x_train": orig_and_auged_x_train,
        "y_train": orig_and_auged_y_train,
        "train_to_source_idxs": orig_and_auged_idxs_train,
        "featurized_x_train": orig_and_auged_featurized_x_train,
        "featurized_y_train": orig_and_auged_featurized_y_train,
        "x_test": orig_and_auged_x_test,
        "y_test": orig_and_auged_y_test,
        "test_to_source_idxs": orig_and_auged_idxs_test,
        "featurized_x_test": orig_and_auged_featurized_x_test,
        "featurized_y_test": orig_and_auged_featurized_y_test,
        "SV_x_train": orig_and_auged_x_train[SV_orig_and_auged_mask],
        "SV_y_train": orig_and_auged_y_train[SV_orig_and_auged_mask],
        "featurized_SV_x_train": SV_x_train,
        "featurized_SV_y_train": SV_y_train,
    }

    np_data_filename = results_filename + "_data.npz"
    np.savez(np_data_filename, **np_data_dict)

    (no_aug_no_poison_acc, poisoned_acc, all_aug_train_poisoned_acc,
     aug_scores, after_aug_scores, best_params,
     training_total_time) = experiments_util.train_and_score_clf(
         clf,
         featurized_x_train,
         y_train,
         featurized_x_test,
         y_test,
         orig_and_auged_featurized_x_train,
         orig_and_auged_featurized_y_train,
         orig_and_auged_featurized_x_test,
         orig_and_auged_featurized_y_test,
         use_loss,
         cv,
     )
    training_end_time = time.time()

    experiment_results = {}
    for policy_name, update_score, downweight_points in experiment_configs:
        policy_f = selection_policy.get_policy_by_name(policy_name)
        if "deterministic" in policy_name:
            _rounds = 1
        else:
            _rounds = rounds
        acc, idxs = experiments.precomputed_aug_experiment_rounds(
            clf=clf,
            auged_featurized_x_train=orig_and_auged_featurized_x_train,
            auged_featurized_y_train=orig_and_auged_featurized_y_train,
            auged_featurized_x_train_to_source_idxs=
            orig_and_auged_featurized_x_train_to_source_idxs,
            auged_featurized_x_test=orig_and_auged_featurized_x_test,
            auged_featurized_y_test=orig_and_auged_featurized_y_test,
            auged_featurized_x_test_to_source_idxs=
            orig_and_auged_featurized_x_test_to_source_idxs,
            aug_iter=policy_f,
            train_idxs_scores=aug_scores,
            n_aug_sample_points=n_aug_sample_points,
            rounds=_rounds,
            update_scores=update_score,
            weight_aug_samples=downweight_points,
            use_loss=use_loss,
            stratified_sampling_x_train_ks=train_cluster_IDs,
        )

        config_name = [policy_name]
        if update_score:
            config_name.append("update")
        if downweight_points:
            config_name.append("downweight")
        config_name = "_".join(config_name)
        experiment_results[config_name] = acc

    all_results = {
        "no_aug_no_poison_acc": no_aug_no_poison_acc,
        "poisoned_acc": poisoned_acc,
        "all_aug_train_poisoned_acc": all_aug_train_poisoned_acc,
        "is_SV": is_SV,
        "VSV_acc": VSV_acc,
        "best_params": best_params,
        "initial_aug_scores": aug_scores,
        "after_aug_scores": after_aug_scores,
        "experiment_results": experiment_results,
        "n_aug_sample_points": n_aug_sample_points,
        "run_parameters": run_params,
        "n_train": n_train,
        "rounds": rounds,
    }

    tests_total_time = time.time() - training_end_time

    all_results["tests_total_runtime"] = tests_total_time

    pprint.pprint(all_results)
    np.savez(
        results_filename,
        **all_results,
    )

    print("*" * 80)
    print("Training took {} seconds".format(training_total_time))
    print("All tests took {} seconds".format(tests_total_time))
    print("*" * 80)