def _get_colexifications(wordlist, entry='ipa', concept='concept',
                         family='family'):
    """
    Collect all colexifications found in the languages of a wordlist.

    Parameters
    ----------
    wordlist : object
        Wordlist-like object offering ``header``, ``cols``, ``get_list`` and
        item access via ``wordlist[idx, column]``.
    entry : str (default="ipa")
        Column whose values are compared in order to detect identical forms.
    concept : str (default="concept")
        Column holding the concept of each word.
    family : str (default="family")
        Column holding the language family. If the column is missing from
        ``wordlist.header``, "doculect" is used instead.

    Returns
    -------
    colexifications : list
        One tuple ``(concept_a, concept_b, taxon, family, form)`` for every
        pair of different concepts which share the same form in one taxon.
    """
    if family not in wordlist.header:
        family = 'doculect'

    found = []
    for taxon in wordlist.cols:
        log.info('Analyzing taxon {0}...'.format(taxon))

        word_ids = wordlist.get_list(taxon=taxon, flat=True)
        # the family is read from the first word of the taxon
        taxon_family = wordlist[word_ids[0], family]
        concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        forms = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # compare every pair of words of this taxon
        found.extend(
            (c1, c2, taxon, taxon_family, forms[i])
            for (i, c1), (j, c2) in combinations2(enumerate(concepts))
            if forms[i] == forms[j] and c1 != c2)
    return found
def add_cognate_ids(self, source, target, idtype='strict', override=False):
    """
    Compute normal cognate identifiers from partial cognate sets.

    Parameters
    ----------
    source: str
        Name of the source column in your wordlist file.
    target : str
        Name of the target column in your wordlist file.
    idtype : str (default="strict")
        Select between "strict" and "loose".
    override: bool (default=False)
        Specify whether you want to override existing columns.

    Raises
    ------
    ValueError
        If `idtype` is neither "strict" nor "loose".

    Notes
    -----
    Strict cognate IDs judge those words as cognate whose partial cognate
    IDs are identical in all their parts. Loose cognate IDs are computed per
    concept: a network is built in which two words are linked as soon as
    they share one partial cognate ID, and every connected component of this
    network is judged as one cognate set.
    """
    if idtype not in ('strict', 'loose'):
        raise ValueError("The value you selected is not available.")

    cogids = {}
    if idtype == 'strict':
        # group the words by the full tuple of their partial cognate ids
        groups = defaultdict(list)
        for key in self._data:
            groups[tuple(self[key, source])].append(key)
        for cogid, members in enumerate(groups.values(), start=1):
            for key in members:
                cogids[key] = cogid
    else:
        next_id = 1
        for concept in self.rows:
            word_ids = self.get_list(row=concept, flat=True)
            partials = [self[key, source] for key in word_ids]

            # link all words which share at least one partial cognate id
            graph = nx.Graph()
            graph.add_nodes_from(word_ids)
            pairs = util.combinations2(zip(word_ids, partials))
            for (word_a, cogs_a), (word_b, cogs_b) in pairs:
                if any(cog in cogs_b for cog in cogs_a):
                    graph.add_edge(word_a, word_b)

            # every connected component receives its own identifier
            for i, component in enumerate(nx.connected_components(graph)):
                for member in component:
                    cogids[member] = next_id + i
            next_id += i + 1

    self.add_entries(target, cogids, lambda x: x, override=override)
def add_cognate_ids(self, source, target, idtype='strict', override=False):
    """
    Derive full cognate identifiers from partial cognate sets.

    Parameters
    ----------
    source: str
        Name of the source column in your wordlist file.
    target : str
        Name of the target column in your wordlist file.
    idtype : str (default="strict")
        Select between "strict" and "loose".
    override: bool (default=False)
        Specify whether you want to override existing columns.

    Raises
    ------
    ValueError
        If `idtype` is neither "strict" nor "loose".

    Notes
    -----
    In "strict" mode, words are cognate if they are identical in all their
    partial cognate IDs. In "loose" mode, a network between all words of a
    concept is constructed, lines are drawn between all words that share a
    common morpheme, and all connected components of this network are judged
    as cognate.
    """
    if idtype == 'strict':
        by_partials = defaultdict(list)
        for word in self._data:
            by_partials[tuple(self[word, source])].append(word)
        # consecutive ids, starting from 1, in order of first occurrence
        new_ids = {
            word: cogid
            for cogid, words in enumerate(by_partials.values(), 1)
            for word in words}
        self.add_entries(target, new_ids, lambda x: x, override=override)

    elif idtype == 'loose':
        new_ids = {}
        offset = 1
        for row in self.rows:
            words = self.get_list(row=row, flat=True)
            morphemes = [self[word, source] for word in words]

            # get connected components
            network = nx.Graph()
            network.add_nodes_from(words)
            for (w_a, m_a), (w_b, m_b) in combinations2(zip(words, morphemes)):
                shares_morpheme = any(m in m_b for m in m_a)
                if shares_morpheme:
                    network.add_edge(w_a, w_b)

            for i, component in enumerate(nx.connected_components(network)):
                for word in component:
                    new_ids[word] = offset + i
            offset += (i + 1)
        self.add_entries(target, new_ids, lambda x: x, override=override)

    else:
        raise ValueError("The value you selected is not available.")
def test_combinations2():
    """Check both pair helpers against straightforward reference loops."""

    def strict_pairs(seq):
        # all pairs whose first member precedes the second one
        for pos, first in enumerate(seq):
            for second in seq[pos + 1:]:
                yield first, second

    def pairs_with_self(seq):
        # same as above, but every item is also paired with itself
        for pos, first in enumerate(seq):
            for second in seq[pos:]:
                yield first, second

    for sample in (list(range(5)), 'abcdefg'):
        assert list(util.combinations2(sample)) == list(strict_pairs(sample))
        assert list(util.multicombinations2(sample)) == \
            list(pairs_with_self(sample))
def test_combinations2(self):
    """Compare both pair helpers with naive nested-loop references."""

    def expected_pairs(items):
        # pairs of items at positions i < j
        return (
            (a, b)
            for i, a in enumerate(items)
            for j, b in enumerate(items)
            if i < j)

    def expected_multipairs(items):
        # pairs of items at positions i <= j
        return (
            (a, b)
            for i, a in enumerate(items)
            for j, b in enumerate(items)
            if i <= j)

    for sample in [list(range(5)), 'abcdefg']:
        self.assertEqual(
            list(util.combinations2(sample)),
            list(expected_pairs(sample)))
        self.assertEqual(
            list(util.multicombinations2(sample)),
            list(expected_multipairs(sample)))
def _make_matrix(taxa, colex):
    """
    Take colexification data and use it to create a distance matrix.

    Parameters
    ----------
    taxa : iterable
        The taxon names; their order determines rows and columns.
    colex : dict
        Taxon names as keys and the set of colexifications attested for the
        taxon as values.

    Returns
    -------
    matrix : list
        A symmetric two-dimensional list with ``len(colex)`` rows. Each cell
        holds one minus the proportion of shared colexifications among all
        colexifications of the two taxa.

    Notes
    -----
    The values of "colex" must be sets (``intersection`` and ``union`` are
    called on them), containing colexification data in form of tuples of
    concepts, not necessarily ordered, in both directions.
    """
    size = len(colex)
    matrix = [[0 for _ in range(size)] for _ in range(size)]

    for (i, t1), (j, t2) in combinations2(enumerate(taxa)):
        shared = colex[t1].intersection(colex[t2])
        combined = colex[t1].union(colex[t2])
        # Two taxa without any colexification have an empty union; they share
        # nothing, so they are treated as maximally distant instead of
        # raising a ZeroDivisionError.
        if combined:
            distance = 1 - len(shared) / len(combined)
        else:
            distance = 1
        matrix[i][j] = matrix[j][i] = distance
    return matrix
def _make_matrix(taxa, colex):
    """
    Take colexification data and use it to create a distance matrix.

    Parameters
    ----------
    taxa : iterable
        The taxon names; their order determines rows and columns.
    colex : dict
        Taxon names as keys and the set of colexifications attested for the
        taxon as values.

    Returns
    -------
    matrix : list
        A symmetric two-dimensional list with ``len(colex)`` rows, filled
        with one minus the ratio between the size of the intersection and
        the size of the union of the colexifications of two taxa.

    Note
    ----
    The values of "colex" must be sets (``intersection`` and ``union`` are
    called on them), containing colexification data in form of tuples of
    concepts, not necessarily ordered, in both directions.
    """
    # calculate the matrix
    matrix = [[0 for i in range(len(colex))] for j in range(len(colex))]
    for (i, t1), (j, t2) in combinations2(enumerate(taxa)):
        intersection = colex[t1].intersection(colex[t2])
        union = colex[t1].union(colex[t2])
        try:
            score = 1 - len(intersection) / len(union)
        except ZeroDivisionError:
            # neither taxon has any colexification: nothing is shared, so
            # the pair receives the maximal distance
            score = 1
        matrix[i][j] = matrix[j][i] = score
    return matrix
def _get_wad(matrix, threshold, use_log=False):
    """
    Get weighted average degree.

    Parameters
    ----------
    matrix : list
        A two-dimensional matrix of scores.
    threshold : float
        Only cells with a score below this value count as links.
    use_log : bool (default=False)
        If True, a link with score x is weighted by ``-log(1 - x)``,
        otherwise by x itself.

    Returns
    -------
    wad : float or None
        The summed link weights of all nodes divided by the number of nodes
        having at least one link, or None if there is no link at all.
    """
    def weight_of(value):
        if use_log:
            return -np.log(1 - value)
        return value

    # link weights collected per node
    incident = defaultdict(list)
    for a, b in util.combinations2(range(len(matrix))):
        score = matrix[a][b]
        if not score < threshold:
            continue
        weight = weight_of(score)
        incident[a].append(weight)
        incident[b].append(weight)

    if not incident:
        return None

    total = sum(sum(weights) for weights in incident.values())
    return total / len(incident)
def _get_colexifications(wordlist, entry='ipa', concept='concept',
                         family='family'):
    """
    Compute the colexifications for the languages of a wordlist.

    Parameters
    ----------
    wordlist : object
        Wordlist-like object offering ``header``, ``cols``, ``get_list`` and
        item access via ``wordlist[idx, column]``.
    entry : str (default="ipa")
        Column with the forms that are compared.
    concept : str (default="concept")
        Column with the concepts.
    family : str (default="family")
        Column with the language family; replaced by "doculect" if the
        column is not part of ``wordlist.header``.

    Returns
    -------
    colexifications : list
        Tuples of the form ``(concept_a, concept_b, taxon, family, form)``,
        one for each pair of distinct concepts expressed by the same form
        within a taxon.
    """
    family_column = family if family in wordlist.header else 'doculect'

    results = []
    for taxon in wordlist.cols:
        log.info('Analyzing taxon {0}...'.format(taxon))
        indices = wordlist.get_list(taxon=taxon, flat=True)
        family_name = wordlist[indices[0], family_column]
        concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        forms = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all pairs of words of the taxon
        for (i, concept_a), (j, concept_b) in combinations2(
                enumerate(concepts)):
            if forms[i] != forms[j]:
                continue
            if concept_a == concept_b:
                continue
            results.append(
                (concept_a, concept_b, taxon, family_name, forms[i]))
    return results
def partial_cluster(
        self, method='sca', threshold=0.45, scale=0.5, factor=0.3,
        restricted_chars='_T', mode='overlap', cluster_method='infomap',
        gop=-1, restriction='', ref='', external_function=None,
        split_on_tones=True, **keywords):
    """
    Cluster the words into partial cognate sets.

    Function for flat clustering of words into cognate sets.

    Parameters
    ----------
    method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
        Select the method that shall be used for the calculation.
    threshold : float (default=0.45)
        Select the threshold for the cluster approach.
    scale : float (default=0.5)
        Select the scale for the gap extension penalty.
    factor : float (default=0.3)
        Select the factor for extra scores for identical prosodic
        segments.
    restricted_chars : str (default="_T")
        Select the restricted chars (boundary markers) in the prosodic
        strings in order to enable secondary alignment.
    mode : {'global','local','overlap','dialign'} (default='overlap')
        Select the mode for the alignment analysis.
    cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
        Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
        average linkage clustering, 'mcl' refers to the "Markov Clustering
        Algorithm" (:evobib:`Dongen2000`).
    gop : int (default=-1)
        If 'sca' is selected as a method, define the gap opening penalty.
    restriction : {'cv'} (default="")
        Specify the restriction for calculations using the edit-distance.
        Currently, only "cv" is supported. If *edit-dist* is selected as
        *method* and *restriction* is set to *cv*, consonant-vowel matches
        will be prohibited in the calculations and the edit distance will
        be normalized by the length of the alignment rather than the length
        of the longest sequence, as described in :evobib:`Heeringa2006`.
        NOTE(review): the value is not referenced in the body of this
        method; confirm where it is supposed to take effect.
    ref : str (default="")
        Name of the column in which the partial cognate IDs are stored. If
        empty, ``self._partials`` is used.
    external_function : callable (default=None)
        If given, it replaces the built-in cluster methods and is called as
        ``external_function(threshold, matrix, taxa=..., revert=True)``.
    split_on_tones : bool (default=True)
        Passed on to ``self._get_partial_matrices``.
    post_processing : bool (default=False)
        If set to True, a graph of the clustered morphemes is built for
        each concept, morphemes of one and the same word that fell into the
        same cluster are separated, and the partial cognate IDs are
        recomputed from the connected components of the graph. The graphs
        are stored in ``self.graphs``.
    imap_mode : bool (default=True)
        Passed on to ``self._get_partial_matrices``.
    inflation : {int, float} (default=2)
        Specify the inflation parameter for the use of the MCL algorithm.
    expansion : int (default=2)
        Specify the expansion parameter for the use of the MCL algorithm.
    max_steps : int (default=1000)
        Maximal number of iterations of the MCL algorithm.
    add_self_loops : bool (default=True)
        Self-loop setting handed to the MCL algorithm.
    mcl_logs : function
        Function for the calculation of the edge weights handed to the MCL
        algorithm.

    Raises
    ------
    ValueError
        If no `external_function` is given and `cluster_method` is not one
        of the supported methods.
    """
    kw = dict(
        imap_mode=True,
        post_processing=False,
        inflation=2,
        expansion=2,
        max_steps=1000,
        add_self_loops=True,
        sep=lingpy.settings.rcParams['morpheme_separator'],
        word_sep=lingpy.settings.rcParams['word_separator'],
        word_seps=lingpy.settings.rcParams['word_separators'],
        seps=lingpy.settings.rcParams['morpheme_separators'],
        mcl_logs=lambda x: -np.log2((1 - x) ** 2)
    )
    kw.update(keywords)

    # check for parameters and add clustering, in order to make sure that
    # analyses are not repeated
    if not hasattr(self, 'params'):
        self.params = {}
    self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
        method, cluster_method, threshold)
    self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

    matrices = self._get_partial_matrices(
        method=method, scale=scale, factor=factor,
        restricted_chars=restricted_chars, mode=mode, gop=gop,
        imap_mode=kw['imap_mode'], split_on_tones=split_on_tones)

    k = 0  # offset that keeps the ids of different concepts apart
    C = defaultdict(list)  # stores the pcogids
    G = {}  # stores the graphs
    with pb(desc='PARTIAL SEQUENCE CLUSTERING',
            total=len(self.rows)) as progress:
        for concept, trace, matrix in matrices:
            progress.update(1)
            lingpy.log.info('Analyzing concept {0}...'.format(concept))
            if external_function:
                c = external_function(
                    threshold, matrix,
                    taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'infomap':
                c = extra.infomap_clustering(
                    threshold, matrix,
                    taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'mcl':
                c = clustering.mcl(
                    threshold, matrix,
                    taxa=list(range(len(matrix))),
                    max_steps=kw['max_steps'],
                    inflation=kw['inflation'],
                    expansion=kw['expansion'],
                    add_self_loops=kw['add_self_loops'],
                    logs=kw['mcl_logs'],
                    revert=True)
            elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                c = clustering.flat_cluster(
                    cluster_method, threshold, matrix, revert=True)
            else:
                raise ValueError("No suitable cluster method specified.")

            for i, (idx, pos, slc) in enumerate(trace):
                C[idx] += [c[i] + k]

            # highest id (relative to the offset) handed out for this concept
            highest = max(c.values())

            if kw['post_processing']:
                _g = nx.Graph()
                for i, (idx, pos, slc) in enumerate(trace):
                    _g.add_node((i, idx, pos))
                remove_edges = []
                for (i, n1), (j, n2) in combinations2(
                        enumerate(_g.nodes())):
                    if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                        _g.add_edge(n1, n2)
                        if n1[1] == n2[1]:
                            # two morphemes of the same word ended up in one
                            # cluster: get scores for n1 and n2 with all the
                            # rest in the matrix to decide for one
                            sn1, sn2 = 0, 0
                            for row in matrix:
                                sn1 += row[n1[0]]
                                sn2 += row[n2[0]]
                            sn1 = sn1 / len(matrix)
                            sn2 = sn2 / len(matrix)
                            if sn1 <= sn2:
                                remove_edges += [n2]
                            else:
                                remove_edges += [n1]
                # isolate the morphemes which lost the comparison
                for node in remove_edges:
                    for edge in sorted(_g[node]):
                        _g.remove_edge(node, edge)

                # the connected components replace the original clusters
                n_components = 0
                for n_components, coms in enumerate(
                        nx.connected_components(_g), start=1):
                    cogid = n_components + k
                    for j, idx, pos in coms:
                        C[idx][pos] = cogid

                G[concept] = _g

                # Isolating morphemes can yield more components than there
                # were clusters; the offset must cover all of them, otherwise
                # the ids of the next concept would overlap with these.
                highest = max(highest, n_components)

            k += highest
    self.add_entries(ref or self._partials, C, lambda x: x)
    self.graphs = G
def partial_cluster(self, method='sca', threshold=0.45, scale=0.5, factor=0.3, restricted_chars='_T', mode='overlap', cluster_method='infomap', gop=-1, restriction='', ref='', external_function=None, split_on_tones=False, **keywords):
    """
    Cluster the words into partial cognate sets.

    Function for flat clustering of words into cognate sets.

    Parameters
    ----------
    method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
        Select the method that shall be used for the calculation.
    threshold : float (default=0.45)
        Select the threshold for the cluster approach.
    scale : float (default=0.5)
        Select the scale for the gap extension penalty.
    factor : float (default=0.3)
        Select the factor for extra scores for identical prosodic
        segments.
    restricted_chars : str (default="_T")
        Select the restricted chars (boundary markers) in the prosodic
        strings in order to enable secondary alignment.
    mode : {'global','local','overlap','dialign'} (default='overlap')
        Select the mode for the alignment analysis.
    cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
        Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
        average linkage clustering, 'mcl' refers to the "Markov Clustering
        Algorithm" (:evobib:`Dongen2000`).
    gop : int (default=-1)
        If 'sca' is selected as a method, define the gap opening penalty.
    restriction : {'cv'} (default="")
        Specify the restriction for calculations using the edit-distance.
        Currently, only "cv" is supported. If *edit-dist* is selected as
        *method* and *restriction* is set to *cv*, consonant-vowel matches
        will be prohibited in the calculations and the edit distance will
        be normalized by the length of the alignment rather than the length
        of the longest sequence, as described in :evobib:`Heeringa2006`.
        NOTE(review): the value is not referenced in the body of this
        method; confirm where it is supposed to take effect.
    ref : str (default="")
        Name of the column in which the partial cognate IDs are stored. If
        empty, ``self._partials`` is used.
    external_function : callable (default=None)
        If given, it replaces the built-in cluster methods and is called as
        ``external_function(threshold, matrix, taxa=..., revert=True)``.
    split_on_tones : bool (default=False)
        Passed on to ``self._get_partial_matrices``.
    post_processing : bool (default=True)
        If set to True, a graph of the clustered morphemes is built for
        each concept, morphemes of one and the same word that fell into the
        same cluster are separated, and the partial cognate IDs are
        recomputed from the connected components of the graph. The graphs
        are stored in ``self.graphs``.
    imap_mode : bool (default=True)
        Passed on to ``self._get_partial_matrices``.
    inflation : {int, float} (default=2)
        Specify the inflation parameter for the use of the MCL algorithm.
    expansion : int (default=2)
        Specify the expansion parameter for the use of the MCL algorithm.
    max_steps : int (default=1000)
        Maximal number of iterations of the MCL algorithm.
    add_self_loops : bool (default=True)
        Self-loop setting handed to the MCL algorithm.
    mcl_logs : function
        Function for the calculation of the edge weights handed to the MCL
        algorithm.

    Raises
    ------
    ValueError
        If no `external_function` is given and `cluster_method` is not one
        of the supported methods.
    """
    # defaults for the optional keywords; values passed by the caller win
    kw = dict(imap_mode=True, post_processing=True, inflation=2, expansion=2, max_steps=1000, add_self_loops=True, sep=lingpy.settings.rcParams['morpheme_separator'], word_sep=lingpy.settings.rcParams['word_separator'], word_seps=lingpy.settings.rcParams['word_separators'], seps=lingpy.settings.rcParams['morpheme_separators'], mcl_logs=lambda x: -np.log2((1 - x)**2))
    kw.update(keywords)

    # check for parameters and add clustering, in order to make sure that
    # analyses are not repeated
    if not hasattr(self, 'params'):
        self.params = {}
    self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
        method, cluster_method, threshold)
    self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

    matrices = self._get_partial_matrices(
        method=method, scale=scale, factor=factor, restricted_chars=restricted_chars, mode=mode, gop=gop, imap_mode=kw['imap_mode'], split_on_tones=split_on_tones)
    k = 0  # offset added to the cluster ids of each concept
    C = defaultdict(list)  # stores the pcogids
    G = {}  # stores the graphs
    with util.pb(desc='PARTIAL SEQUENCE CLUSTERING', total=len(self.rows)) as progress:
        for concept, trace, matrix in matrices:
            progress.update(1)
            lingpy.log.info('Analyzing concept {0}...'.format(concept))
            # an external function takes precedence over `cluster_method`;
            # all variants are asked for a reverted result (revert=True),
            # which is indexed below by the position in `trace`
            if external_function:
                c = external_function(threshold, matrix, taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'infomap':
                c = extra.infomap_clustering(threshold, matrix, taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'mcl':
                c = clustering.mcl(threshold, matrix, taxa=list(range(len(matrix))), max_steps=kw['max_steps'], inflation=kw['inflation'], expansion=kw['expansion'], add_self_loops=kw['add_self_loops'], logs=kw['mcl_logs'], revert=True)
            elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                c = clustering.flat_cluster(cluster_method, threshold, matrix, revert=True)
            else:
                raise ValueError("No suitable cluster method specified.")

            # append the shifted cluster id of every traced segment to the
            # list of its word (`idx`)
            for i, (idx, pos, slc) in enumerate(trace):
                C[idx] += [c[i] + k]

            if kw['post_processing']:
                # one node per traced segment: (matrix index, word, position)
                _g = nx.Graph()
                for i, (idx, pos, slc) in enumerate(trace):
                    _g.add_node((i, idx, pos))
                remove_edges = []
                for (i, n1), (j, n2) in util.combinations2(
                        enumerate(_g.nodes())):
                    # connect segments which received the same cluster id
                    if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                        _g.add_edge(n1, n2)
                        if n1[1] == n2[1]:
                            # get scores for n1 and n2 with all the rest in
                            # the matrix to decide for one
                            sn1, sn2 = 0, 0
                            for i, row in enumerate(matrix):
                                sn1 += matrix[i][n1[0]]
                                sn2 += matrix[i][n2[0]]
                            sn1 = sn1 / len(matrix)
                            sn2 = sn2 / len(matrix)
                            # the segment with the higher average score is
                            # marked for isolation (n2 on ties)
                            if sn1 <= sn2:
                                remove_edges += [n2]
                            else:
                                remove_edges += [n1]
                # cut all edges of the marked nodes
                for node in remove_edges:
                    for edge in sorted(_g[node]):
                        _g.remove_edge(node, edge)

                # overwrite the ids with the connected components
                for i, coms in enumerate(nx.connected_components(_g)):
                    cogid = i + 1 + k
                    for j, idx, pos in coms:
                        C[idx][pos] = cogid

                G[concept] = _g

            # advance the offset for the next concept; presumably
            # len(matrix) bounds the number of ids used for this concept
            # -- TODO confirm
            k += len(matrix) + 1
    self.add_entries(ref or self._partials, C, lambda x: x)
    self.graphs = G
def partition_density(matrix, t):
    """
    Calculate partition density for a given threshold on a distance matrix.

    Parameters
    ----------
    matrix : list
        A two-dimensional matrix of distances.
    t : float
        The threshold: cells with a distance below `t` count as links.

    Returns
    -------
    density, components : tuple
        The partition density and the number of connected components. If no
        two nodes are connected, the density is 0.0.

    Notes
    -----
    See :evobib:`Ahn2012` for details on the calculation of partition density
    in a given network.
    """
    size = len(matrix)

    # binary adjacency for the cutoff at t
    adjacency = np.zeros((size, size))
    for a, b in util.combinations2(range(size)):
        if matrix[a][b] < t:
            adjacency[a][b] = adjacency[b][a] = 1

    # total number of links (every link occurs twice in the matrix)
    total_links = sum(adjacency.flatten()) / 2

    # label the connected components, 0 meaning "not labelled yet"
    labels = [0] * size
    next_label = 1
    for a, b in util.combinations2(range(size)):
        if adjacency[a][b] != 1:
            continue
        label_a, label_b = labels[a], labels[b]
        if label_a and label_a == label_b:
            continue
        if label_a and label_b:
            # merge two components, keeping the smaller label
            keep, drop = min(label_a, label_b), max(label_a, label_b)
            labels = [keep if lab == drop else lab for lab in labels]
        elif label_a:
            labels[b] = label_a
        elif label_b:
            labels[a] = label_b
        else:
            labels[a] = labels[b] = next_label
            next_label += 1

    # every unconnected node forms a component of its own
    for pos, lab in enumerate(labels):
        if lab == 0:
            labels[pos] = max(labels) + 1

    components = sorted(set(labels))

    # return zero, if all components are different
    if len(components) == size:
        return 0.0, len(components)

    density = 0
    for component in components:
        members = [n for n, lab in enumerate(labels) if lab == component]
        n_nodes = len(members)
        n_links = sum(
            1 for u, v in util.combinations2(members)
            if adjacency[u][v] == 1)
        denominator = n_nodes * (n_nodes - 1)
        if denominator == 0:
            # components with a single node do not contribute
            continue
        density += n_links * (n_links - (n_nodes - 1)) / denominator

    return 2 / total_links * density, len(components)
def mcl(threshold, matrix, taxa, max_steps=1000, inflation=2, expansion=2,
        add_self_loops=True, revert=False, logs=True,
        matrix_type="distances"):
    """
    Carry out a clustering using the MCL algorithm (:evobib:`Dongen2000`).

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the
        matrix will be used directly.
    matrix : list
        A two-dimensional list containing the distances.
    taxa : list
        An list containing the names of all taxa corresponding to the
        distances in the matrix.
    max_steps : int (default=1000)
        Maximal number of iterations.
    inflation : int (default=2)
        Inflation parameter for the MCL algorithm.
    expansion : int (default=2)
        Expansion parameter of the MCL algorithm.
    add_self_loops : {True, False, builtins.function} (default=True)
        Determine whether self-loops should be added, and if so, how they
        should be weighted. If a function for the calculation of self-loops
        is given, it will take the whole column of the matrix for each taxon
        as input.
    revert : bool (default=False)
        If set to c{True}, return a dictionary with the index of each taxon
        as key and its cluster as value, instead of a dictionary with the
        clusters as keys.
    logs : { bool, function } (default=True)
        If set to c{True}, the logarithm of the score beyond the threshold
        will be assigned as weight to the graph. If set to c{False} all
        scores beyond the threshold are used as weights without
        modification. Use a custom function to define individual ways to
        calculate the weights; it is applied for both matrix types.
    matrix_type : { "distances", "similarities" }
        Specify the type of the matrix. For "distances", scores below the
        threshold are treated as links, for "similarities" scores above the
        threshold.

    Returns
    -------
    clusters : dict
        A dictionary with cluster-IDs as keys and a list of the taxa in the
        cluster as values, or, if `revert` is set, a dictionary with taxon
        indices as keys and cluster-IDs as values.

    Raises
    ------
    ValueError
        If `matrix_type` is neither "distances" nor "similarities".

    Examples
    --------
    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the MCL analysis.

    >>> mcl(0.5,matrix,taxa)
    {1: ['German', 'English', 'Dutch'], 2: ['Swedish', 'Icelandic']}

    """
    # Work on a private float copy: the input is never modified, and the
    # log-weights written below are not truncated for integer input.
    imatrix = np.array(matrix, dtype=float)

    # check for matrix type and decide how to handle logs
    if matrix_type == 'distances':
        def evaluate(x):
            return x < threshold

        def default_weight(x):
            return -np.log2((1 - x) ** 2)
    elif matrix_type == 'similarities':
        def evaluate(x):
            return x > threshold

        def default_weight(x):
            return -np.log(x ** 2)
    else:
        raise ValueError(matrix_type)

    # A custom function is honoured for both matrix types; otherwise `logs`
    # switches between the default log-weight and the unmodified score.
    if callable(logs):
        weight = logs
    elif logs:
        weight = default_weight
    else:
        def weight(x):
            return x

    # check for threshold
    if threshold:
        for i, j in util.combinations2(range(len(imatrix))):
            score = imatrix[i][j]
            evaluation = weight(score) if evaluate(score) else 0
            imatrix[i][j] = evaluation
            imatrix[j][i] = evaluation

    # check for self_loops
    if callable(add_self_loops):
        for i in range(len(imatrix)):
            imatrix[i][i] = add_self_loops(imatrix[:, i])
    elif add_self_loops:
        for i in range(len(imatrix)):
            imatrix[i][i] = 1

    # normalize the matrix
    imatrix = _normalize_matrix(imatrix)

    # start looping and the like
    steps = 0
    while True:
        # expansion
        imatrix = np.linalg.matrix_power(imatrix, expansion)

        # inflation
        imatrix = imatrix ** inflation

        # normalization
        imatrix = _normalize_matrix(imatrix)

        # increase steps
        steps += 1

        # check for matrix convergence
        if steps >= max_steps or _is_idempotent(imatrix):
            log.debug("Number of steps {0}.".format(steps))
            break

    # retrieve the clusters
    clusters = _interprete_matrix(imatrix)

    # modify clusters
    if revert:
        return dict(zip(range(len(taxa)), clusters))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[clusters[i]].append(t)
    return clr
def link_clustering(threshold, matrix, taxa, link_threshold=False, revert=False, matrix_type="distances", fuzzy=True):
    """
    Carry out a link clustering analysis using the method by :evobib:`Ahn2010`.

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data.
    matrix : list
        A two-dimensional list containing the distances.
    taxa : list
        An list containing the names of all taxa corresponding to the
        distances in the matrix.
    link_threshold : float (default=False)
        The threshold that shall be used for the internal clustering of the
        data. The value is handed to the single-linkage step as it is.
    revert : bool (default=False)
        If set to c{True}, the result is keyed by taxon instead of by
        cluster-ID (see "Returns").
    matrix_type : {"distances","similarities","weights"} (default="distances")
        Specify the type of the matrix. For "distances", cells below the
        threshold are treated as links, for "similarities" cells above the
        threshold. If it contains "weights", a weighted version of link
        clustering (see the supplementary in :evobib:`Ahn2010` for details)
        will be carried out.
    fuzzy : bool (default=True)
        If set to c{True}, a taxon may belong to several clusters. If set
        to c{False}, each taxon is assigned to the one cluster in which
        most of its links are found.

    Returns
    -------
    cluster : dict
        A dictionary with cluster-IDs as keys and a list as value,
        containing the taxa that are assigned to a given cluster-ID. With
        `revert`, the dictionary maps each taxon to the list of its
        cluster-IDs if `fuzzy` is set, and the index of each taxon to a
        single cluster-ID otherwise. If no links are found at all, every
        taxon is paired with its own index.

    Raises
    ------
    ValueError
        If `matrix_type` is not one of the supported values.

    Examples
    --------
    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the link-clustering analysis.

    >>> link_clustering(0.5,matrix,taxa)
    {1: ['Dutch', 'English', 'German'], 2: ['Icelandic', 'Swedish']}

    See also
    --------
    fuzzy

    """
    # check for matrix type
    if matrix_type == 'distances':
        evaluate = lambda x: x < threshold
    elif matrix_type == 'similarities':
        evaluate = lambda x: x > threshold
    elif matrix_type == 'weights':
        # weighted links are selected in the loop below
        evaluate = lambda x: False
    else:
        raise ValueError(matrix_type)

    # get the edges and the adjacency from the thresholds
    edges = set()
    adjacency = dict([(t, set()) for t in taxa])
    weights = {}

    for i, j in util.combinations2(range(len(taxa))):
        taxA, taxB = taxa[i], taxa[j]
        if evaluate(matrix[i][j]):
            edges.add((taxA, taxB))
            adjacency[taxA].add(taxB)
            adjacency[taxB].add(taxA)
        elif matrix_type == 'weights':
            if matrix[i][j] < threshold:
                # weighted mode stores the edge and its weight in both
                # directions
                edges.add((taxA, taxB))
                adjacency[taxA].add(taxB)
                adjacency[taxB].add(taxA)
                edges.add((taxB, taxA))
                weights[taxA, taxB] = -np.log2((1 - matrix[i][j])**2)
                weights[taxB, taxA] = -np.log2((1 - matrix[i][j])**2)

    # an empty weight dictionary is passed on as None
    weights = weights or None

    if edges:
        # initialize the HLC object
        hlc = lc.HLC(adjacency, edges)
    else:
        # check for null edges: if they occur, return the clusters directly
        if revert:
            if fuzzy:
                return {a: [b] for a, b in zip(taxa, range(len(taxa)))}
            else:
                return {a: b for a, b in zip(taxa, range(len(taxa)))}
        else:
            if fuzzy:
                return {a: [b] for a, b in zip(range(len(taxa)), taxa)}
            else:
                return {a: b for a, b in zip(range(len(taxa)), taxa)}

    # carry out the analyses using defaults for the clustering
    edge2cid = hlc.single_linkage(threshold=link_threshold, w=weights)[0]

    # retrieve all clusterings for the nodes
    # retrieve the data
    clr2nodes = defaultdict(list)
    clr2edges = defaultdict(list)

    # collect the edges and the nodes of every cluster
    for edge, idx in edge2cid.items():
        nodeA, nodeB = edge[0], edge[1]
        clr2edges[idx].append(edge)
        clr2nodes[idx].extend([nodeA, nodeB])

    # reduce the node lists to sorted lists of unique nodes
    for idx in clr2nodes:
        clr2nodes[idx] = sorted(set(clr2nodes[idx]))

    # delete all clusters that appear as subsets of larger clusters
    # NOTE(review): if util.product2 yields every ordered pair, two clusters
    # with identical node sets are visited as (A, B) and as (B, A), and both
    # keys end up in `delis` -- confirm that this is intended.
    delis = set()
    for keyA, keyB in util.product2(sorted(clr2nodes)):
        if keyA != keyB:
            valsA = set(clr2nodes[keyA])
            valsB = set(clr2nodes[keyB])
            if valsA != valsB:
                if valsA.issubset(valsB):
                    delis.add(keyA)
                elif valsB.issubset(valsA):
                    delis.add(keyB)
            elif valsA == valsB:
                delis.add(keyB)
    for k in delis:
        del clr2nodes[k]

    # renumber the data
    mapper = dict(zip(clr2nodes.keys(), range(1, len(clr2nodes) + 1)))

    out = {}
    found = []
    for idx in clr2nodes:
        out[mapper[idx]] = clr2nodes[idx]
        found += clr2nodes[idx]
    # taxa which are not part of any cluster become singletons
    missing = [f for f in taxa if f not in found]
    # NOTE(review): max() raises a ValueError if `out` is empty at this point
    idx = max(out.keys()) + 1
    for m in missing:
        out[idx] = [m]
        idx += 1

    # count, for every node, its links in each of the remaining clusters
    node_weights = dict([(t, defaultdict(int)) for t in taxa])
    for c, e in clr2edges.items():
        for nA, nB in e:
            if c in mapper:
                this_c = mapper[c]
                node_weights[nA][this_c] += 1
                node_weights[nB][this_c] += 1

    # revert stuff first
    cluster = dict([(t, []) for t in taxa])
    for idx in out:
        for t in out[idx]:
            cluster[t] += [idx]

    # weight membership of nodes and assign to most prominent community
    if not fuzzy:
        new_cluster = {}
        for t, clr in cluster.items():
            weighted = sorted(clr, key=lambda x: node_weights[t][x] if x in node_weights[t] else 0, reverse=True)
            new_cluster[t] = weighted[0]
        if revert:
            # keys are the positions of the taxa in `taxa`
            return {taxa.index(t): c for t, c in new_cluster.items()}
        out = {c: [] for c in set(new_cluster.values())}
        for t, c in new_cluster.items():
            out[c].append(t)
        return out

    if not revert:
        return out
    # fuzzy and reverted: taxon -> list of cluster ids
    cluster = {t: [] for t in taxa}
    for idx in out:
        for t in out[idx]:
            cluster[t].append(idx)
    return cluster
def fuzzy(threshold, matrix, taxa, method='upgma', revert=False):
    """
    Create fuzzy cluster of a given distance matrix.

    Parameters
    ----------
    threshold : float
        The threshold that shall be used for the basic clustering of the data.
    matrix : list
        A two-dimensional list containing the distances.
    taxa : list
        An list containing the names of all taxa corresponding to the distances
        in the matrix.
    method : { "upgma", "single", "complete" } (default="upgma")
        Select the method for the flat cluster analysis.
    revert : bool (default=False)
        Specify whether a reverted dictionary should be returned.

    Returns
    -------
    cluster : dict
        A dictionary with cluster-IDs as keys and a list as value, containing
        the taxa that are assigned to a given cluster-ID. If `revert` is set,
        the dictionary has the taxa as keys and the list of their cluster-IDs
        as values.

    Examples
    --------
    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the fuzzy flat cluster analysis.

    >>> fuzzy(0.5,matrix,taxa)
    {1: ['Swedish', 'Icelandic'], 2: ['Dutch', 'German'], 3: ['Dutch', 'English']}

    Notes
    -----
    This is a very simple fuzzy clustering algorithm. It basically does nothing
    else than removing taxa successively from the matrix, flat-clustering the
    remaining taxa with the corresponding threshold, and then returning a
    combined "consensus" cluster in which taxa may be assigned to multiple
    clusters.

    See also
    --------
    link_clustering

    """
    g = nx.Graph()
    g.add_nodes_from(taxa)

    for idx, taxon in enumerate(taxa):
        # upper triangle of the matrix without the row and the column of the
        # taxon that is left out in this round
        new_matrix = [
            cell
            for i, line in enumerate(matrix)
            for j, cell in enumerate(line)
            if i < j and i != idx and j != idx]
        new_matrix = misc.squareform(new_matrix)

        clusters = cluster.flat_cluster(
            method, threshold, new_matrix, [t for t in taxa if t != taxon])

        # count how often two taxa end up in the same cluster
        for clr in clusters:
            for tA, tB in util.combinations2(clusters[clr]):
                if not g.has_edge(tA, tB):
                    g.add_edge(tA, tB, weight=1)
                else:
                    # subscript access works with networkx 1.x and 2.x,
                    # whereas the `Graph.edge` attribute was removed in 2.0
                    g[tA][tB]['weight'] += 1

    # the maximal cliques of the graph are the fuzzy clusters
    out = {i + 1: c for i, c in enumerate(nx.find_cliques(g))}

    if revert:
        new_out = defaultdict(list)
        for key, val in out.items():
            for v in val:
                new_out[v].append(key)
        return new_out

    return out