Example 1
    def time_subcluster(self, locs):
        # Getting subclusters at Mapzen's limit
        cluster_linkage = linkage(locs, method='ward')
        clusters = fcluster(cluster_linkage, 50, criterion='maxclust')

        cluster_means = np.array([np.mean(
            locs[np.where(clusters == i)], axis=0
        ) for i in range(1, 51)])

        mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
        mapzen_matrix = self.mapzen_matrix(mapzen_locs)

        # Cluster labels used for mapping back together
        # Subtracting one to use 0 index
        cl = clusters - 1

        # Get a matching distance matrix of lat/lon distance, get ratios
        cluster_km_dist = squareform(pdist(cluster_means,
                                           (lambda u,v: haversine(u,v))))

        dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix,
                                                    cluster_km_dist))
        # Divide items by mean to normalize a bit
        dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                    dist_ratio_matrix.mean()))

        locs_km_dist = squareform(pdist(locs, (lambda u,v: haversine(u,v))))

        # Iterate through each, updating by ratio in dist_ratio_matrix
        it = np.nditer(locs_km_dist, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            it[0] = it[0] * dist_ratio_matrix[cl[it.multi_index[0]]][cl[it.multi_index[1]]]
            it.iternext()

        return locs_km_dist
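The key move in Example 1 is collapsing arbitrarily many points into at most 50 centroids (the Mapzen matrix-size limit) before querying the routing API. Below is a minimal, self-contained sketch of just that reduction step with made-up coordinates; the Mapzen call itself is omitted.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Made-up lon/lat points standing in for `locs`
rng = np.random.default_rng(0)
locs = rng.uniform(low=[-122.5, 37.7], high=[-122.3, 37.9], size=(500, 2))

cluster_linkage = linkage(locs, method='ward')
clusters = fcluster(cluster_linkage, 50, criterion='maxclust')  # at most 50 flat clusters

# One centroid per cluster, as in the list comprehension above
cluster_means = np.array([locs[clusters == i].mean(axis=0)
                          for i in range(1, clusters.max() + 1)])
print(cluster_means.shape)  # (<= 50, 2)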
Example 2
def writeClusters(results):
    threshold = 0.9
    results = numpy.fromiter(results, dtype=[('pairs', 'i8', 2), ('score', 'f4', 1,)])
    i_to_id, condensed_distances, N = condensedDistance(results)
    linkages = fastcluster.linkage(condensed_distances, method='ward')
    partition = hcluster.fcluster(linkages, threshold, criterion='inconsistent')
    clusters = {}
    for (i, cluster_id) in enumerate(partition):
        clusters.setdefault(cluster_id, []).append(i_to_id[i])
    i = 0
    for cluster in clusters.values():
        images = []
        for index in cluster:
            image_name = all_images[index]
            image_path = os.path.join(imagedir, image_name)
            cluster_path = 'clustered_images/{0}'.format(str(i))

            # There must be a better way to do this
            try:
                os.mkdir(cluster_path)
            except OSError:
                for f in os.listdir(cluster_path):
                    try:
                        os.remove(os.path.join(cluster_path, f))
                    except OSError:
                        pass
            print('writing %s' % image_name)
            with open(image_path, 'rb') as inp:
                with open(os.path.join('clustered_images', str(i), image_name), 'wb') as outp:
                    outp.write(inp.read())
        i += 1
Example 3
def time_series_clusters(Y,ct=0.5,return_clusters=False):
	D = pdist(transpose(Y),'correlation')
	D = abs(D)
	if return_clusters:
		L = linkage(D,method='single',metric='cosine')
		C = fcluster(L,ct,criterion='distance')
		return cluster_sets(C)
	plot_clusters(D,ct)
Example 4
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    D = pdist(transpose(Y), 'correlation')
    D = abs(D)
    if return_clusters:
        L = linkage(D, method='single', metric='cosine')
        C = fcluster(L, ct, criterion='distance')
        return cluster_sets(C)
    plot_clusters(D, ct)
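time_series_clusters treats each column of Y as a series and clusters columns by correlation distance. A self-contained sketch of the same linkage/fcluster steps on toy data follows; cluster_sets and plot_clusters from the original module are not reproduced here.

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

# Toy matrix: rows are time points, columns are series; two correlated pairs
rng = np.random.default_rng(0)
base = rng.normal(size=(200, 2))
noise = 0.01 * rng.normal(size=(200, 2))
Y = np.column_stack([base[:, 0], base[:, 0] + noise[:, 0],
                     base[:, 1], base[:, 1] + noise[:, 1]])

D = np.abs(pdist(Y.T, 'correlation'))   # pairwise correlation distance between columns
L = linkage(D, method='single')         # D is already a condensed distance vector
C = fcluster(L, 0.5, criterion='distance')
print(C)                                # e.g. [1 1 2 2]: columns 0/1 and 2/3 group together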
Example 5
    def __call__(self):
        # Can continue to play around with these
        self.cluster_linkage = linkage(self.point_arr, method='ward')
        self.clusters = fcluster(self.cluster_linkage,
                                 self.num_clusters,
                                 criterion='maxclust')

        [p[0].update({'group': p[1]}) for p in zip(self.locations, self.clusters.tolist())]

        return self.locations
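Example 5 attaches the flat cluster label to each location dict by zipping the locations with the fcluster output. A minimal sketch of the same pattern with hypothetical location dicts, using an explicit loop in place of the side-effect list comprehension:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Hypothetical locations and their coordinate array (stands in for point_arr)
locations = [{'name': 'a', 'lon': 0.0, 'lat': 0.0},
             {'name': 'b', 'lon': 0.1, 'lat': 0.1},
             {'name': 'c', 'lon': 5.0, 'lat': 5.0}]
point_arr = np.array([[p['lon'], p['lat']] for p in locations])

clusters = fcluster(linkage(point_arr, method='ward'), 2, criterion='maxclust')
for loc, group in zip(locations, clusters.tolist()):
    loc['group'] = group
print(locations)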
Example 6
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            (i_to_id, condensed_distances) = condensedDistance(sub_graph)
            N = max(i_to_id) + 1

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid', 
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, 
                                          threshold,
                                          criterion='distance')

            clusters = {}

            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            cophenetic_distances = hcluster.cophenet(linkage)

            for cluster_id, items in clusters.iteritems() :
                if len(items) > 1 :
                    score = clusterConfidence(items, cophenetic_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[item] 
                                                    for item in items),
                                              1 - score)

            cluster_id += max(partition) + 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = tuple(ids), score
            cluster_id += 1
            

    return clustering.values()
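In these dedupe-style examples the condensed distances are 1 minus the pairwise match score, so `threshold = 1 - threshold` turns the score cutoff into the distance at which fcluster cuts the tree. A small self-contained sketch of that conversion, assuming fastcluster and scipy are installed (hcluster here stands for scipy.cluster.hierarchy; the scores are made up):

import numpy as np
import fastcluster
from scipy.cluster import hierarchy as hcluster

# Match scores for the pairs (0,1), (0,2), (1,2); higher means more likely a duplicate
scores = np.array([0.95, 0.10, 0.15])
condensed_distances = 1 - scores          # distance = 1 - score

threshold = 0.5                           # score threshold supplied by the caller
linkage = fastcluster.linkage(condensed_distances, method='centroid')
partition = hcluster.fcluster(linkage, 1 - threshold, criterion='distance')
print(partition)                          # records 0 and 1 share a cluster; record 2 is alone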
Example 8
def cluster(dupes, threshold=.5):
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''

    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in dupes)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage,
                                          threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])

            cluster_id += max(partition)
        else:

            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
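The dupes argument in this family of functions is an iterable of (id pair, score) records packed into a structured array like score_dtype above. A toy sketch of building such an array (ids and scores are made up; the 'score' field is written here without the redundant shape-1):

import numpy

score_dtype = [('pairs', 'i4', 2), ('score', 'f4')]

# Hypothetical scored candidate pairs: ((record_id_1, record_id_2), match probability)
scored_pairs = [((0, 1), 0.96), ((1, 2), 0.20), ((3, 4), 0.88)]
dupes = numpy.array(scored_pairs, dtype=score_dtype)

print(dupes['pairs'])   # (3, 2) array of id pairs
print(dupes['score'])   # the three match probabilities as float32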
Example 9
def wavelet_clusters(Y,ct=0.5,weights=False,return_clusters=False,swt=False):
	if weights:
		D = abs(c_dists(Y,level_weights=True,use_swt=False))
		Dr = []
		for i in range(D.shape[0]-1):
			Dr += list(D[i,i+1:])
	else:
		Dr = c_dists(Y,use_swt=swt)
	if return_clusters:
		L = linkage(Dr,method='single',metric='cosine')
		C = fcluster(L,ct,criterion='distance')
		return cluster_sets(C)
	plot_clusters(Dr,ct)
Example 10
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    """
    threshold = 1 - threshold

    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]


    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from(((x[0], x[1], y) for x, y in dupes))
    del dupes

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs :
        if len(sub_graph) > 2 :
            pair_gen = ((x[0:2], x[2]['weight'])
                        for x
                        in dupe_graph.edges_iter(sub_graph, data=True))
                
            pairs = numpy.fromiter(pair_gen, dtype=score_dtype)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold, criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id, []).append(i_to_id[i])
            cluster_id += max(partition)

        else :
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    
    return clusters
Example 11
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid', 
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage, 
                                          threshold,
                                          criterion='distance')

            clusters = {}

            for i, partition_id in enumerate(partition):
                clusters.setdefault(partition_id, []).append(i)

            for items in viewvalues(clusters) :
                if len(items) > 1 :
                    items = tuple(items)
                    scores = confidences(items, condensed_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[i] for i in items),
                                              scores)
                    cluster_id += 1

        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = (tuple(ids), tuple([score]*2))
            cluster_id += 1
            

    return clustering.values()
Example 12
  def cluster_analysis_hcluster(self, vectors):
    from hcluster import linkage, fcluster
    import numpy

    params = self.params.multiple_lattice_search.cluster_analysis.hcluster
    X = numpy.array(vectors)
    linkage_method = params.linkage.method
    linkage_metric = params.linkage.metric
    criterion = params.cutoff_criterion
    Z = linkage(X, method=linkage_method, metric=linkage_metric)
    cutoff = params.cutoff
    i_cluster = fcluster(Z, cutoff, criterion=criterion)
    i_cluster = flex.int(i_cluster.astype(numpy.int32))
    return i_cluster
Example 13
def wavelet_clusters(Y,
                     ct=0.5,
                     weights=False,
                     return_clusters=False,
                     swt=False):
    if weights:
        D = abs(c_dists(Y, level_weights=True, use_swt=False))
        Dr = []
        for i in range(D.shape[0] - 1):
            Dr += list(D[i, i + 1:])
    else:
        Dr = c_dists(Y, use_swt=swt)
    if return_clusters:
        L = linkage(Dr, method='single', metric='cosine')
        C = fcluster(L, ct, criterion='distance')
        return cluster_sets(C)
    plot_clusters(Dr, ct)
Example 14
def cluster_struct_affinities(affinities, 
                              ctype = 'mlpy',
                              k = None):
    ''' Return a list of cluster memberships in the form of an N-array having 
k unique elements.
'''
    if ctype == 'hcluster':
        return hcluster.fcluster(affinities, 1.1, criterion='inconsistent', method='complete')

    elif ctype == 'mlpy':
        import mlpy
        HC = mlpy.HCluster(method='euclidean', link='complete')
        clusts = HC.compute(affinities)
        cut = HC.cut(HC.heights[-k])
        return cut
    else: 
        raise Exception()
Example 15
def cluster(dupes: numpy.ndarray,
            threshold: float = .5,
            max_components: int = 30000) -> Clusters:
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    distance_threshold = 1 - threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage,
                                          distance_threshold,
                                          criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)

            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            squared_distances = condensed_distances**2
            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, squared_distances, N)
                    yield tuple(i_to_id[i] for i in cluster), scores

        else:
            (ids, score), = sub_graph
            if score > threshold:
                yield tuple(ids), (score, ) * 2
Example 16
    def cluster(self):
        """Cluster strokes"""

        # the purpose of this step is to cluster strokes using
        # the previously calculated distance matrix

        matrix = numpy.load(self.DTW_DATA)
        Y = hcluster.squareform(matrix)
        Z = hcluster.linkage(Y, method=self.CLUSTERING_METHOD)
        T = hcluster.fcluster(Z, 1.15)
        clusters = self.get_cluster_dict_from_array(T)

        if self.verbose:
            self.print_clusters(clusters)

        if not os.path.exists(self.CLUSTER_ROOT):
            os.makedirs(self.CLUSTER_ROOT)

        pickle.dump(clusters, open(self.CLUSTER_DATA, "wb"))
Example 17
def hierarchical_cluster(clusters, threshold):
    threshold = 1 - threshold
    score_dtype = [('pairs', 'i4', 2), ('score', 'f4', 1)]

    #    lclassifier.predict_proba(distances)[0][1] > threshold

    dupe_graph = networkx.Graph()
    dupe_graph.add_weighted_edges_from((x[0], x[1], y) for (x, y) in clusters)

    dupe_sub_graphs = connected_components(dupe_graph)

    clustering = {}
    cluster_scores = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 2:
            pair_gen = ((sorted(x[0:2]), x[2]['weight'])
                        for x in dupe_graph.edges_iter(sub_graph, data=True))

            pairs = np.fromiter(pair_gen, dtype=score_dtype)
            pairlist = list(pairs)

            (i_to_id, condensed_distances) = condensedDistance(pairs)
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage,
                                          threshold,
                                          criterion='distance')

            for (i, sub_cluster_id) in enumerate(partition):
                clustering.setdefault(cluster_id + sub_cluster_id,
                                      []).append(i_to_id[i])

            cluster_id += max(partition)
        elif len(sub_graph) == 2:
            clustering[cluster_id] = sub_graph
            cluster_id += 1

    clusters = [set(l) for l in clustering.values() if len(l) >= 2]
    return (clusters, cluster_scores)
Example 18
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    '''
    distance_threshold = 1 - threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            i_to_id, condensed_distances, N = condensedDistance(sub_graph)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)

            partition = hcluster.fcluster(linkage,
                                          distance_threshold,
                                          criterion='distance')

            clusters = defaultdict(list)

            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            for cluster in viewvalues(clusters):
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    yield tuple(i_to_id[i] for i in cluster), scores

        else:
            (ids, score), = sub_graph
            if score > threshold:
                yield tuple(ids), (score,) * 2
Example 20
def cluster(dupes, threshold=.5):
    """
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision, raising it will increase
                 recall
    """
    (i_to_id, condensed_distances) = condensedDistance(dupes)
    linkage = fastcluster.linkage(numpy.array(condensed_distances),
                                  method='centroid')
    partition = hcluster.fcluster(linkage, threshold)

    clustering = {}

    for (i, cluster_id) in enumerate(partition):
        clustering.setdefault(cluster_id, []).append(i_to_id[i])

    clusters = [set(l) for l in clustering.values() if len(l) > 1]

    return clusters
Example 21
import hcluster

# For using with published example:
# url = "http://examples.obspy.org/dissimilarities.pkl"
# dissimilarity = pickle.load(urllib.urlopen(url))
dissimilarity = pickle.load(open("dissimilarities.pkl", "rb"))
# Storing local pickle object:
# dis_temp = pickle.dumps(dissimilarity)
# pickle.dump(dissimilarity, open("dissimilarities.pkl","wb"))

plt.subplot(121)
plt.imshow(dissimilarity, interpolation="nearest")
dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.show()
'''
>> print hcluster.__doc__
Function Reference
------------------

These functions cut hierarchical clusterings into flat clusterings
or find the roots of the forest formed by a cut by providing the flat
cluster ids of each observation.

+------------------+-------------------------------------------------+
|*Function*        | *Description*                                   |
Example 22
def maxclust_dists(dists, k, method = 'complete'):
    d2 = hcluster.squareform(dists)
    Z = hcluster.linkage(d2, method = method)
    fcl = hcluster.fcluster(Z, t = k, criterion = 'maxclust')
    return fcl
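maxclust_dists takes a square distance matrix and caps the flat clustering at k clusters. A self-contained sketch of the same steps, using scipy directly in place of the standalone hcluster package and a made-up distance matrix:

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform

# Made-up square distance matrix for six random 2-D points
pts = np.random.default_rng(1).normal(size=(6, 2))
dists = squareform(pdist(pts))

d2 = squareform(dists)                              # back to condensed form, as in maxclust_dists
Z = hierarchy.linkage(d2, method='complete')
fcl = hierarchy.fcluster(Z, t=3, criterion='maxclust')
print(fcl)                                          # at most 3 cluster labels, one per point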
Example 23
    users = pd.read_csv('data/mindmatch_example.csv').to_dict(orient='records')
    n_users = len(users)
    print('Number of registered users: {}'.format(n_users))

    users_df = pd.DataFrame(users).fillna('')
    users_dict = {r['user_id']: dict(r) for _, r in users_df.iterrows()}  # map of user id to details
    persons_1 = list(map(preprocess, list(users_df['abstracts'])))
    persons_2 = list(map(preprocess, list(users_df['abstracts'])))
    A = affinity_computation(persons_1, persons_2,
                             n_components=30, min_df=2, max_df=0.8,
                             weighting='tfidf', projection='svd')
    cois_list = compute_conflicts(users_df)
    for i, j in cois_list:
        A[i, j] = -1

    A_cluster = - A
    A_cluster[A_cluster == 1000] = 1
    A_rand = np.random.randn(n_users, n_users) * 0.01 * A_cluster.var() # add randomness

    z = linkage(A_cluster + A_rand,
                method='average',
                metric='euclidean',
                optimal_ordering=True)
    cluster = hcluster.fcluster(z, t=0.01,
                                criterion='distance') # distance
    users_group_df['cluster'] = cluster
    users_sorted_df  = users_group_df.sort_values('cluster')
    cluster_numbers = generate_pod_numbers(n_users=len(users_sorted_df), n_per_group=5)
    users_sorted_df['cluster'] = cluster_numbers
    users_sorted_df.to_csv('group_matching_users.csv', index=False)
Example 24
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
dissimilarity = pickle.load(urllib.urlopen(url))

plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, 0.3, criterion="distance")

plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=0.3)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
Example 25
def fclusters(Z, seed_pts, nclusters):
    flats = hcluster.fcluster(Z, t=nclusters, criterion='maxclust')
    clusters = [set() for _ in range(max(flats))]
    for idx, f in enumerate(flats):
        clusters[f-1].add(seed_pts[idx])
    return clusters
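A short usage sketch for fclusters above: build a linkage over the seed points' coordinates and ask for two flat clusters. The points are hypothetical, and hcluster is assumed to resolve to scipy.cluster.hierarchy in the same module that defines fclusters.

import numpy as np
from scipy.cluster import hierarchy as hcluster   # matches the hcluster name used above

# Hypothetical seed points: two tight groups far apart
seed_pts = [(0.0, 0.0), (0.0, 1.0), (10.0, 10.0), (10.0, 11.0)]
Z = hcluster.linkage(np.array(seed_pts), method='single')

print(fclusters(Z, seed_pts, nclusters=2))
# e.g. [{(0.0, 0.0), (0.0, 1.0)}, {(10.0, 10.0), (10.0, 11.0)}]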
Example 26
def cluster(dupes: numpy.ndarray,
            cluster_threshold: float = 0.5,
            max_components: int = 30000,
            id_to_match: str = None) -> Clusters:
    """
    Takes in a list of duplicate pairs and clusters them into a
    list of records that all refer to the same entity based on a given
    threshold

    `https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html`



    Args:
        dupes: (np.array)[tuple(list[str], float)] A list of tuples, where each tuple
            contains an id pair and a probability that they are a match:
                id_pair_tuple: ([record_id_1, record_id_2], prob)
                dtype: np.dtype([('pairs', '<U256', 2),
                                 ('score', 'f4', 1)])
        threshold: (float) number between 0 and 1 (default is .5). Lowering the
            number will increase precision, raising it will increase recall
    """
    distance_threshold = cluster_threshold
    score_threshold = 1 - cluster_threshold
    dupe_sub_graphs = connected_components(dupes, max_components)
    # logger.info(f"Dupes: {dupes}")
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensed_distance(sub_graph)
            logger.debug(f"{condensed_distances}")
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)
            partition = hcluster.fcluster(linkage,
                                          distance_threshold,
                                          criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)
            logger.debug(f"Partition: {partition}")
            logger.debug(f"Linkage: {linkage}")
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            logger.info(f"Clusters: {clusters}")
            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    logger.info(
                        f"Cluster Ids and scores: {tuple(i_to_id[i] for i in cluster)}, {scores}"
                    )
                    ids = [i_to_id[i] for i in cluster]
                    if id_to_match in ids and id_to_match is not None:
                        yield tuple(ids), scores
                    elif id_to_match is None:
                        yield tuple(ids), scores

        else:
            (ids, score), = sub_graph
            if score > score_threshold and id_to_match in ids and id_to_match is not None:
                # logger.info(tuple(ids), ((score,) * 2))
                yield tuple(ids), (score, ) * 2
            elif score > score_threshold and id_to_match is None:
                yield tuple(ids), (score, ) * 2
Example 27
import numpy
from numpy.random import rand
from main.util.common import dataPath
# Assumed imports for the bare dendrogram, fcluster and show calls below
from scipy.cluster.hierarchy import dendrogram, fcluster
from matplotlib.pyplot import show

# load distance matrix

# Z = linkage(distanceMatrix)
# numpy.save("dendrogram.npy", Z)
# dendrogram(Z)
# show()

Z = numpy.load(dataPath("dendrogram.npy"))

dendrogram(Z)
show()

clu = fcluster(Z, 2, depth=5000, criterion='distance')

cluInstances = {}
for i in clu:
    cluInstances[i] = cluInstances.get(i, 0) + 1

# numpy.save(dataPath("clusters.npy"), clu)
# clu = numpy.load(dataPath("clusters.npy"))

"""hist1 = numpy.histogram(list(cluInstances.itervalues()))

print len(cluInstances.keys())
print hist1[0]
print map(lambda x: int(x), hist1[1])"""
Example 28
            row[i] = int( row[i] )
        for i in [6,7,8,9,10]:
            row[i] = float( row[i] )
        water_dict[ (row[12], row[5]) ] = row
        water_list.append( row )


X = [ row[6:9] for row in water_list ]
print X
Y = pdist(X, 'euclidean')

Z = linkage(Y, 'average')
print Z
dendrogram(Z)

fclust = fcluster(Z, 2, criterion='distance')


clust_dict = defaultdict( list )
for i, row in enumerate(water_list):
    #print fclust[i], str(fclust[i])
    clust_dict[ str(fclust[i]) ].append( row )

#print clust_dict

for c in clust_dict:
    print 'select water and (' + ' or '.join( [ '(~' + w[12] + ' and ' + str(w[5]) + ')' for w in clust_dict[c] ] ) + '); isosurface id "foo' + c + '" color lightblue center {selected} SPHERE @{ [ {selected}.x.stddev, {selected}.y.stddev, {selected}.z.stddev, 0.5  ].max *2 } translucent' + ';'


sys.exit(0)