Example no. 1
def robust_single_linkage(metric, *, k_neighbors, alpha=None, force_ultrametric=False):
    """Robust single linkage algorithm of (Chaudhuri and Dasgupta 2010).

    Args
    ----
    metric : ndarray
        The distance array to transform, in condensed or square form.
    k_neighbors : int, keyword-only
        The number of neighbors to use in the transformation. Must be 
        a non-negative integer.
    alpha : float, keyword-only, optional
        The connection parameter used in the transformation. At time :math:`r`,
        all edges within :math:`\\alpha r` are connected. Default:
        :math:`\\sqrt{2}`.
    force_ultrametric : bool, keyword-only, optional
        If `True`, the resulting clustering is forced to be ultrametric (each
        point is admitted at radius zero).

    Returns
    -------
    clustering : Clustering
        The robust single linkage clustering.

    """
    kwargs = dict(metric=metric, k_neighbors=k_neighbors, alpha=alpha)
    robust_metric, levels = robust_single_linkage_metric(**kwargs)

    if force_ultrametric:
        levels = None

    # note: _hier.single requires condensed metric!
    linkage = _hier.single(robust_metric)
    return Clustering.from_linkage(linkage, levels=levels)
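A minimal usage sketch (hypothetical: robust_single_linkage and Clustering come from the unshown module above, not from SciPy itself):

# Hypothetical usage; the condensed input follows SciPy's pdist convention.
import numpy as np
from scipy.spatial.distance import pdist

points = np.random.rand(20, 2)
metric = pdist(points)  # condensed distance matrix, as _hier.single expects
clustering = robust_single_linkage(metric, k_neighbors=5)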
Example no. 2
def find_closest_two_points(points):
    # Single-linkage clustering merges the two closest points first,
    # so the first row of the linkage matrix holds their indices.
    p = pdist(points)
    slc = single(p)
    # Linkage matrix entries are floats; cast to int before indexing.
    i, j = int(slc[0][0]), int(slc[0][1])
    return np.array([points[i], points[j]])
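A quick sanity check, assuming numpy, pdist, and single are imported as in the surrounding snippets:

pts = np.array([[0.0, 0.0], [5.0, 5.0], [5.1, 5.0]])
print(find_closest_two_points(pts))  # prints the two points near (5, 5)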
Example no. 3
def make_tree(X, C, method='single'):
    # 'single' and 'average' operate on the condensed distance matrix C;
    # 'ward' works directly on the raw observations X.
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    else:
        raise ValueError("method must be 'single', 'ward', or 'average'")
    return Tree(root=construct_node(tree))
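A usage sketch, assuming construct_node and Tree as defined in the linkage-tree examples further below:

import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import single, ward, average, to_tree

X = np.random.rand(10, 3)
C = pdist(X)  # condensed distances for 'single' and 'average'
tree = make_tree(X, C, method='average')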
Example no. 4
def group_tuples(items=None, val_ind=None, dist_thresh=0.1, distance_matrix=None,
                 metric='jaccard', linkage='complete', sp_areas=None):
    '''
    items: a dict or list of tuples
    val_ind: the index of the item of interest within each tuple
    '''
    
    if items is not None:
        if isinstance(items, dict):
            keys = list(items.keys())
            values = list(items.values())
        elif isinstance(items, list):
            keys = list(range(len(items)))
            if isinstance(items[0], tuple):
                values = list(map(itemgetter(val_ind), items))
            else:
                values = items
        else:
            raise TypeError('items is not the right type')
    else:
        # Without items, only a precomputed distance matrix can be clustered;
        # note that the key/item grouping below still requires items.
        assert distance_matrix is not None, 'distance_matrix must be provided.'

    if distance_matrix is None:
        distance_matrix = compute_pairwise_distances(values, metric, sp_areas=sp_areas)
        
    if linkage == 'complete':
        lk = complete(squareform(distance_matrix))
    elif linkage == 'average':
        lk = average(squareform(distance_matrix))
    elif linkage == 'single':
        lk = single(squareform(distance_matrix))
    else:
        raise ValueError("linkage must be 'complete', 'average', or 'single'")

    # T = fcluster(lk, 1.15, criterion='inconsistent')
    T = fcluster(lk, dist_thresh, criterion='distance')
    
    n_groups = len(set(T))
    groups = [None] * n_groups

    for group_id in range(n_groups):
        groups[group_id] = np.where(T == group_id+1)[0]

    index_groups = [[keys[i] for i in g] for g in groups if len(g) > 0]
    item_groups = [[items[keys[i]] for i in g] for g in groups if len(g) > 0]
    
    return index_groups, item_groups, distance_matrix
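A hypothetical call with a precomputed square distance matrix (compute_pairwise_distances is not shown, so the matrix is supplied directly):

import numpy as np

D = np.array([[0.00, 0.05, 0.90],
              [0.05, 0.00, 0.85],
              [0.90, 0.85, 0.00]])
idx_groups, item_groups, _ = group_tuples(items=['a', 'b', 'c'], distance_matrix=D,
                                          dist_thresh=0.1, linkage='single')
# item_groups -> [['a', 'b'], ['c']]: only the pair within dist_thresh merges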
Example no. 5
    def cluster_agglomerative(self, n, with_topic):
        #self.n_clusters = n
        if with_topic is True:
            X = self.X_with_topics
        else:
            X = self.X_without_topics

        z = hierarchy.single(pdist(X.toarray(), 'cityblock'))
        self.clusterizer_labels = hierarchy.fcluster(z, n, criterion='maxclust')
        cluster_set = set(self.clusterizer_labels)
        self.n_clusters = len(cluster_set)
        if with_topic is True:
            # Initialize a dict keyed by cluster label, then map each label to
            # the list of actual topics of the documents clustered under it.
            for i in cluster_set:
                self.clusters[i] = []
            for i in range(len(self.topic_per_doc)):
                self.clusters[self.clusterizer_labels[i]].append(self.topic_per_doc[i])
            self.generate_piechart_pdf("hierarchical")
        else:
            self.calculate_and_print_silhouette(self.X_without_topics, self.clusterizer_labels, "hierarchical")
Example no. 6
	def CalculateClusterTree(self):
		fullMatrix = self.GenerateFullMatrix(self.results)
		dissMatrix = []
		labels = list(fullMatrix.keys())
		for i in range(0, len(labels)):
			sampleNameI = labels[i]
			for j in range(i+1, len(labels)):
				sampleNameJ = labels[j]
				dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])
				
		# calculate hierarchical cluster tree
		if self.radioSingleLinkage.GetValue():
			linkageMatrix = single(dissMatrix)
		elif self.radioUPGMA.GetValue():
			linkageMatrix = average(dissMatrix)
		elif self.radioCompleteLinkage.GetValue():
			linkageMatrix = complete(dissMatrix)
		elif self.radioWeighted.GetValue():
			linkageMatrix = weighted(dissMatrix)
			
		root = to_tree(linkageMatrix)
		
		# create Newick string
		return self.CreateNewickString(root, labels) + ';'
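For reference, a standalone sketch of the same pattern (condensed distances -> single linkage -> Newick via to_tree); CreateNewickString above is the class's own helper and is not shown, and the branch lengths below are simplified to the merge height:

import numpy as np
from scipy.cluster.hierarchy import single, to_tree
from scipy.spatial.distance import pdist

def to_newick(node, labels):
    # Recursively render a SciPy ClusterNode as a Newick subtree.
    if node.is_leaf():
        return labels[node.get_id()]
    left = to_newick(node.get_left(), labels)
    right = to_newick(node.get_right(), labels)
    return '(%s:%.3f,%s:%.3f)' % (left, node.dist, right, node.dist)

root = to_tree(single(pdist(np.random.rand(4, 2))))
print(to_newick(root, ['A', 'B', 'C', 'D']) + ';')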
Example no. 7
 def check_fcluster_monocrit(self, t):
     expectedT = hierarchy_test_data.fcluster_distance[t]
     Z = single(hierarchy_test_data.Q_X)
     T = fcluster(Z, t, criterion='monocrit', monocrit=maxdists(Z))
     assert_(is_isomorphic(T, expectedT))
Example no. 8
 def check_fcluster(self, t, criterion):
     # Tests fcluster(Z, criterion=criterion, t=t) on a random 3-cluster data set.
     expectedT = getattr(hierarchy_test_data, 'fcluster_' + criterion)[t]
     Z = single(hierarchy_test_data.Q_X)
     T = fcluster(Z, criterion=criterion, t=t)
     assert_(is_isomorphic(T, expectedT))
Example no. 9
    def filter_corr_hierarchical(cor_matrix,
                                 method='complete',
                                 draw_plot=False):
        """
        Creates a filtered correlation matrix using hierarchical clustering methods from an empirical
        correlation matrix, given that all values are non-negative [0 ~ 1]

        This function allows for three types of hierarchical clustering - complete, single, and average
        linkage clusters. Link to hierarchical clustering methods documentation:
        `<https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html>`_

        It works as follows:

        First, the method creates a hierarchical clustering tree using scipy's hierarchical clustering methods
        from the empirical 2-D correlation matrix.

        Second, it extracts and stores each cluster's filtered value (alpha) and assigns it to its corresponding leaf.

        Finally, we create a new filtered matrix by assigning each of the correlations to their corresponding
        parent node's alpha value.

        :param cor_matrix: (np.array) Numpy array of an empirical correlation matrix.
        :param method: (str) Hierarchical clustering method to use. (``complete`` by default, ``single``, ``average``)
        :param draw_plot: (bool) Plots the hierarchical cluster tree. (False by default)
        :return: (np.array) The filtered correlation matrix.
        """

        # Check if all matrix elements are positive
        if np.any(cor_matrix < 0):
            warnings.warn(
                'Not all elements in matrix are non-negative... Returning unfiltered matrix.',
                UserWarning)
            return cor_matrix

        # Check if matrix is 2-D
        if len(cor_matrix.shape) == 2:
            cor_x, cor_y = cor_matrix.shape
        else:
            warnings.warn(
                'Invalid matrix dimensions, input must be 2-D array... Returning unfiltered matrix.',
                UserWarning)
            return cor_matrix

        # Check if matrix dimensions and diagonal values are valid.
        if cor_x == cor_y and np.allclose(
                np.diag(cor_matrix),
                1):  # using np.allclose as diag values might be 0.99999
            # Creating a new condensed correlation matrix for the upper triangle and dismissing the diagonal.
            new_cor = cor_matrix[np.triu_indices(cor_matrix.shape[0], k=1)]
        else:
            warnings.warn(
                'Invalid matrix, input must be a correlation matrix of size (m x m)... Returning unfiltered matrix.',
                UserWarning)
            return cor_matrix

        # Compute the hierarchical clustering tree
        if method == 'complete':
            z_cluster = complete(new_cor)
        elif method == 'single':
            z_cluster = single(new_cor)
        elif method == 'average':
            z_cluster = average(new_cor)
        else:
            warnings.warn(
                'Invalid method selected, please check docstring... Returning unfiltered matrix.',
                UserWarning)
            return cor_matrix

        # Plot the hierarchical cluster tree
        if draw_plot:
            fig = plt.figure(figsize=(10, 6))
            axis = fig.add_subplot(111)
            dendrogram(z_cluster, ax=axis)
            plt.show()

        # Creates a pd.DataFrame that will act as a dictionary where the index is the leaf node id, and the values are
        # their corresponding cluster's alpha value
        alpha_values = z_cluster[:, 2]
        alphas = z_cluster[:, 0]
        df_alphas = pd.DataFrame(alpha_values, index=alphas)
        df_alphas.loc[z_cluster[0][1]] = alpha_values[0]

        # Creates the filtered correlation matrix
        alphas_sorted = df_alphas.sort_index()
        alphas_x = np.tile(alphas_sorted.values,
                           (1, len(alphas_sorted.values)))
        filt_corr = np.maximum(alphas_x, alphas_x.T)
        np.fill_diagonal(filt_corr, 1)

        return filt_corr
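A quick usage sketch with a small valid input (non-negative values, unit diagonal), assuming the imports the function relies on (numpy, pandas, warnings, matplotlib, and complete/single/average/dendrogram from scipy.cluster.hierarchy) are in scope:

corr = np.array([[1.0, 0.7, 0.2],
                 [0.7, 1.0, 0.3],
                 [0.2, 0.3, 1.0]])
filtered = filter_corr_hierarchical(corr, method='single')
print(filtered)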
Example no. 10
plt.show()
# You can also inspect a cross-tab of frequency counts
print(pd.crosstab(kmeans_3['cluster'], kmeans_3['Cultivar'], margins=True))
from scipy.cluster import hierarchy
# Load the wine dataset again and drop the Cultivar column
wine = pd.read_csv('wine.csv')
wine = wine.drop('Cultivar', axis=1)
# There are many hierarchical clustering algorithms; the results can be plotted with matplotlib
import matplotlib.pyplot as plt

wine_complete = hierarchy.complete(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_complete)
plt.show()

wine_single = hierarchy.single(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_single)
plt.show()

wine_average = hierarchy.average(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_average)
plt.show()

wine_centroid = hierarchy.centroid(wine)
fig = plt.figure()
dn = hierarchy.dendrogram(wine_centroid)
plt.show()

Example no. 11
len(similarity_constructor)

import scipy.spatial.distance as ssd

dist = np.zeros((13, 13))
dist[:, 0] = similarity_constructor.values

dist1 = ssd.cdist(dist, dist, 'euclidean')

dist1

import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# Note: sch.single treats a 2-D array as an observation matrix, not as a
# distance matrix; to cluster on dist1 as distances, pass ssd.squareform(dist1).
clusters_study = sch.fcluster(sch.single(dist1), 0.2, criterion='distance')  # Here I create 4 groups


# Remaining: create the 4 regressions

clusters_study

# Buildings with similar characteristics 
similarity_constructor[clusters_study==4]
similarity_constructor[clusters_study==3]
similarity_constructor[clusters_study==2]
similarity_constructor[clusters_study==1]



df.columns
Example no. 14
def linkage(dmat):
    condensed = squareform(dmat)  # linkage methods expect a condensed distance matrix
    linkmat = sch.single(condensed)
    return sch.fcluster(linkmat, 0.0001)
Example no. 15
def scipyLinkage(dmat):
    # dmat must already be in condensed form here.
    return sch.fcluster(sch.single(dmat), 0.01)
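A small check of the two helpers above, under the assumption that they share these imports:

import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform

pts = np.array([[0.0], [0.005], [1.0]])
cond = pdist(pts)             # condensed form, what scipyLinkage expects
square = squareform(cond)     # square form, what linkage() above expects
print(sch.fcluster(sch.single(cond), 0.01, criterion='distance'))  # [1 1 2]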
Example no. 16
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

# scipy
from scipy.cluster import hierarchy

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot')
PATH = '/home/ec2-user/environment/islr/10_unsupervised_learning/figs/'

X = np.random.standard_normal((50, 2))
X[:25, 0] = X[:25, 0] + 3
X[:25, 1] = X[:25, 1] - 4
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))

for linkage, cluster, ax in zip(
    [hierarchy.complete(X),
     hierarchy.average(X),
     hierarchy.single(X)], ['c1', 'c2', 'c3'], [ax1, ax2, ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')
plt.savefig(PATH + 'hierarchy.png', dpi=300)
plt.close()
Example no. 17
    tree = Phylo.read(StringIO(newick), 'newick')

    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig("%s.png" % name, dpi=200, bbox_inches='tight')


X += np.random.normal(scale=0.01, size=X.shape)

pca = PCA(2)
pca.fit(X)

# X = pca.transform(X)
N, D = X.shape

C = pdist(X)
tree = to_tree(single(C))


def construct_node(snode):
    if snode.left is None and snode.right is None:
        return TreeLeaf(snode.get_id())
    node = TreeNode()
    node.add_child(construct_node(snode.left))
    node.add_child(construct_node(snode.right))
    return node


root = construct_node(tree)
linkage_tree = Tree(root=root)
plot_tree(linkage_tree, 'linkage_induced')
Example no. 18
hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'average')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))

hclust_model = cluster.AgglomerativeClustering(n_clusters = 2, linkage = 'complete')
hclust_model.fit(X)
print('Cluster labels: {}\n'.format(hclust_model.labels_))


print('''
*********************************************************************************************************************
                                 scipy: dendrogram
*********************************************************************************************************************
''')

# from: https://github.com/JWarmenhoven/ISLR-python/blob/master/Notebooks/Chapter%2010.ipynb

fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],
                                [ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage')

plt.show()

Example no. 19
    plt.figure()
    newick = final_tree.to_newick()
    tree = Phylo.read(StringIO(newick), 'newick')

    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig("%s.png" % name, dpi=200, bbox_inches='tight')
X += np.random.normal(scale=0.01, size=X.shape)

pca = PCA(2)
pca.fit(X)

# X = pca.transform(X)
N, D = X.shape

C = pdist(X)
tree = to_tree(single(C))

def construct_node(snode):
    if snode.left is None and snode.right is None:
        return TreeLeaf(snode.get_id())
    node = TreeNode()
    node.add_child(construct_node(snode.left))
    node.add_child(construct_node(snode.right))
    return node

root = construct_node(tree)
linkage_tree = Tree(root=root)
plot_tree(linkage_tree, 'linkage_induced')


if args.tree:
Example no. 20
def condense_clones_file_and_barcode_mapping_file_by_tcrdist(
        old_clones_file,
        new_clones_file,
        tcrdist_threshold,
        organism,
        output_distfile=None
):

    tcrdist_calculator = TcrDistCalculator(organism)

    df = pd.read_csv(old_clones_file, sep='\t')

    # in conga we usually also have cdr3_nucseq, but we don't need it for tcrdist; the J gene isn't needed either, though we keep it
    tcrs = [ ( ( l.va_gene, l.ja_gene, l.cdr3a ), ( l.vb_gene, l.jb_gene, l.cdr3b ) ) for l in df.itertuples() ]
    ids = [ l.clone_id for l in df.itertuples() ]


    print(f'compute tcrdist distance matrix for {len(tcrs)} clonotypes')
    D = np.array( [ tcrdist_calculator(x,y) for x in tcrs for y in tcrs ] ).reshape( (len(tcrs), len(tcrs)) )


    all_barcodes = pd.read_csv(old_clones_file+'.barcode_mapping.tsv', sep='\t')
    all_barcodes.set_index('clone_id', inplace=True)
    all_barcodes = all_barcodes['barcodes']
    assert type(all_barcodes) is pd.Series
    N = D.shape[0]
    assert df.shape[0] == N

    DT = squareform(D, force='tovector')

    # single linkage clustering of the distance matrix: any clonotypes with dist<tcrdist_threshold
    #  should end up in the same cluster
    Z = hierarchy.single(DT)

    clusters = hierarchy.fcluster(Z, t=tcrdist_threshold, criterion='distance')
    clusters_set = sorted(set(clusters))

    new_clones_dfl = []
    new_bcmap_dfl = []

    cluster_centers = []
    for c in clusters_set:
        # choose a representative clone based on distance
        cmask = clusters==c
        members = np.nonzero(cmask)[0]
        assert len(members) == np.sum(cmask)
        if len(members) == 1:
            center = members[0]
        else:
            cdist = D[cmask,:][:,cmask]
            dists = np.sum(cdist,axis=1)/(len(members)-1)
            icenter = np.argmin(dists)
            center = members[icenter]
            print('center_avgdist: {:3d} {:7.2f} avg {:7.2f}'\
                  .format(len(members), dists[icenter], np.mean(dists)))
        cluster_centers.append(center)
        cdf = df[cmask]
        center_df = pd.Series( df.iloc[center] )
        clone_size = sum(x.clone_size for _,x in cdf.iterrows())
        center_df.clone_size = clone_size
        new_clones_dfl.append( center_df)
        cbarcodes = []
        for _,row in cdf.iterrows():
            cbarcodes.extend( all_barcodes[row.clone_id].split(',') )
        assert len(cbarcodes) == clone_size
        new_bcmap_dfl.append( dict(clone_id=center_df.clone_id, barcodes=','.join(cbarcodes)))

    new_clones_df = pd.DataFrame(new_clones_dfl)
    new_bcmap_df = pd.DataFrame(new_bcmap_dfl)['clone_id barcodes'.split()] # ensure order?

    new_clones_df.to_csv(new_clones_file, sep='\t', index=False)
    new_bcmap_df.to_csv(new_clones_file+'.barcode_mapping.tsv', sep='\t', index=False)

    if output_distfile is not None:
        new_D = D[cluster_centers,:][:,cluster_centers]
        np.savetxt( output_distfile, new_D.astype(float), fmt='%.1f')
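The single-linkage property relied on above (any pair of clonotypes closer than the threshold ends up in one cluster) can be checked in isolation; a minimal sketch with a toy distance matrix:

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

D = np.array([[ 0., 10., 40.],
              [10.,  0., 35.],
              [40., 35.,  0.]])
Z = hierarchy.single(squareform(D, force='tovector'))
print(hierarchy.fcluster(Z, t=12, criterion='distance'))  # -> [1 1 2]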
Example no. 21
        for cluster in clusters:
            print(cluster)
            if cluster == current_cluster:
                s2 = time_series[serie_number]
                series_DTW = dtw.distance(s1, s2)
                calculated_sse = calculated_sse + series_DTW ** 2
            serie_number += 1
        print('entered')  # debug
        print(current_cluster)
        print('up to here')  # debug
        current_cluster += 1
    sse.append(calculated_sse)

pl.plot(range(3, 15), sse)
pl.title("Elbow Curve")
pl.xlabel("Number of clusters")
pl.ylabel("SSE")
pl.show()

print(calculated_sse)
print(cluster)

#cophenetic - features

from scipy.cluster.hierarchy import single, cophenet
from scipy.spatial.distance import pdist, squareform

Z = single(pdist(ret_var))
cophenet(Z)
print(squareform(cophenet(Z)))
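Passing the original condensed distances back to cophenet also yields the cophenetic correlation coefficient; a short sketch, assuming ret_var is the observation matrix used above:

# c close to 1 means the dendrogram preserves the pairwise distances well.
d = pdist(ret_var)
c, coph_dists = cophenet(single(d), d)
print(c)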
Example no. 22
 def check_fcluster_maxclust_monocrit(self, t):
     expectedT = hierarchy_test_data.fcluster_maxclust[t]
     Z = single(hierarchy_test_data.Q_X)
     T = fcluster(Z, t, criterion='maxclust_monocrit', monocrit=maxdists(Z))
     assert_(is_isomorphic(T, expectedT))
Example no. 23
ax1.scatter(X[:,0], X[:,1], s=20, c=km1.labels_, cmap=plt.cm.prism) 
ax1.set_title('K-Means Clustering Results with K=2')
ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)

ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism) 
ax2.set_title('K-Means Clustering Results with K=3')
ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2);

''' Hierarchical clustering '''
from scipy.cluster import hierarchy

plt.figure()

fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],[ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');


''' Lab Genes NCI60 '''

'''
Each cell line is labeled with a cancer type. We do not make use of the
cancer types in performing PCA and clustering, as these are unsupervised
techniques. But after performing PCA and clustering, we will check to
see the extent to which these cancer types agree with the results of these
unsupervised techniques.
'''

Example no. 24

np.fill_diagonal(tag_dist_matrix, 0)

from scipy.spatial.distance import squareform
tag_dist = squareform(tag_dist_matrix)

from scipy.cluster.hierarchy import single

tag_linkage = single(tag_dist)

np.save('tag_linkage', tag_linkage)

from scipy.cluster.hierarchy import leaves_list, to_tree, dendrogram

tag_closeness_id = leaves_list(tag_linkage)
Example no. 25
 cS = u.select_atoms('name C312')  # select the atoms
 r1 = []  # get positions
 for ts in u.trajectory:
     r1.append((cS.positions))
     #print('loading positions frame %i of %i'%(u.trajectory.frame+1, len(u.trajectory)))
 r1 = np.array(r1)
 dist = []  # get all pair distances formatted as flat upper triangles
 for i in range(len(r1)):
     dist.append(
         MDAnalysis.analysis.distances.self_distance_array(r1[i], box=box))
     #print('loading distances frame %i of %i'%(i+1, len(u.trajectory)))
 dist = np.array(dist)
 #print('clustering.')
 z = []  # perform hierarchical single-linkage clustering
 for i in range(len(dist)):
     z.append(single(dist[i]))
 z = np.array(z)
 #print('clustering..')
 hierarchy = []  # get clusters using cutoff (in angstroms)
 for i in range(len(z)):
     hierarchy.append(fcluster(z[i], cutoff, criterion='distance'))
 hierarchy = np.array(hierarchy)
 #
 # select the indices of the atoms in each cluster
 #
 DPC = u.select_atoms('resname FOS12')
 #
 clusters = []
 for j in range(-frame, -1):
     clusters1 = []
     for i in range(1, np.amax(hierarchy[j]) + 1):