def clustering(matrix, threshold, cluster_method='infomap'):
    """
    Cluster the items described by a matrix with the selected method.

    Parameters
    ----------
    matrix : sequence of sequences
        The matrix handed to the clustering backend. Only ``len(matrix)``
        and row iteration are used here; presumably a square matrix of
        pairwise distances (to be confirmed against the callers).
    threshold : float
        The threshold that is passed on to the clustering backend.
    cluster_method : str or callable (default='infomap')
        Either the name of a clustering method or a callable. A callable is
        invoked as ``cluster_method(threshold, matrix)`` and its return
        value is returned unchanged. Recognized names are 'infomap',
        'labelprop', 'ebet', 'multilevel', 'spinglass' (dispatched to
        ``igraph_clustering``), 'mcl' (dispatched to ``mcl``), and 'upgma',
        'single', 'complete', 'ward' (dispatched to ``flat_cluster``).

    Returns
    -------
    The result of the selected clustering backend.

    Raises
    ------
    ValueError
        If ``cluster_method`` is neither callable nor a recognized name.
    """
    if callable(cluster_method):
        # Call the function that was actually passed in; the former code
        # checked ``cluster_method`` but then called an undefined name.
        return cluster_method(threshold, matrix)

    if cluster_method in [
            'infomap', 'labelprop', 'ebet', 'multilevel', 'spinglass']:
        return igraph_clustering(matrix, threshold, method=cluster_method)

    if cluster_method == 'mcl':
        return mcl(
            threshold,
            matrix,
            taxa=list(range(len(matrix))),
            revert=True,
            # Fixed MCL parameters used by this helper.
            max_steps=1000,
            inflation=2,
            expansion=2,
            add_self_loops=True,
            logs=lambda x: -np.log2((1 - x)**2))

    if cluster_method in ['upgma', 'single', 'complete', 'ward']:
        # Hand over a fresh list-of-lists copy of the matrix.
        return flat_cluster(
            cluster_method,
            threshold,
            [list(row) for row in matrix],
            revert=True)

    raise ValueError(
        "No clustering method named {:}".format(cluster_method))
def partial_cluster(
        self,
        method='sca',
        threshold=0.45,
        scale=0.5,
        factor=0.3,
        restricted_chars='_T',
        mode='overlap',
        cluster_method='infomap',
        gop=-1,
        restriction='',
        ref='',
        external_function=None,
        split_on_tones=True,
        **keywords):
    """
    Cluster the words into partial cognate sets.

    For every concept, the partial (morpheme-level) matrix delivered by
    ``self._get_partial_matrices`` is clustered, the cluster labels are
    offset so that they do not overlap between concepts, and the resulting
    lists of partial cognate IDs are stored with ``self.add_entries``.

    Parameters
    ----------
    method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
        Select the method that shall be used for the calculation.
    threshold : float (default=0.45)
        Select the threshold for the cluster approach.
    scale : float (default=0.5)
        Select the scale for the gap extension penalty.
    factor : float (default=0.3)
        Select the factor for extra scores for identical prosodic
        segments.
    restricted_chars : str (default="_T")
        Select the restricted chars (boundary markers) in the prosodic
        strings in order to enable secondary alignment.
    mode : {'global','local','overlap','dialign'} (default='overlap')
        Select the mode for the alignment analysis.
    cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
        Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
        average linkage clustering, 'mcl' refers to the "Markov Clustering
        Algorithm" (:evobib:`Dongen2000`). Ignored if *external_function*
        is given.
    gop : int (default=-1)
        If 'sca' is selected as a method, define the gap opening penalty.
    restriction : {'cv'} (default="")
        Specify the restriction for calculations using the edit-distance.
        NOTE(review): this argument is accepted but not forwarded to the
        matrix computation by this method.
    ref : str (default="")
        Name of the column in which the partial cognate IDs are stored;
        if empty, ``self._partials`` is used.
    external_function : callable (default=None)
        If given, it is called instead of the built-in cluster methods as
        ``external_function(threshold, matrix, taxa=..., revert=True)``.
    split_on_tones : bool (default=True)
        Passed on to ``self._get_partial_matrices``.
    **keywords
        Optional settings: *imap_mode* (default=True), *post_processing*
        (default=False), and the MCL parameters *inflation* (default=2),
        *expansion* (default=2), *max_steps* (default=1000),
        *add_self_loops* (default=True) and *mcl_logs*.

    Raises
    ------
    ValueError
        If no *external_function* is given and *cluster_method* is not one
        of the supported names.

    Notes
    -----
    Besides adding the entries, the method updates
    ``self.params['partial_cluster']`` and ``self._stamp`` and stores the
    post-processing graphs (one per concept, only filled if
    *post_processing* is set) in ``self.graphs``.
    """
    kw = dict(
        imap_mode=True,
        post_processing=False,
        inflation=2,
        expansion=2,
        max_steps=1000,
        add_self_loops=True,
        sep=lingpy.settings.rcParams['morpheme_separator'],
        word_sep=lingpy.settings.rcParams['word_separator'],
        word_seps=lingpy.settings.rcParams['word_separators'],
        seps=lingpy.settings.rcParams['morpheme_separators'],
        mcl_logs=lambda x: -np.log2((1 - x) ** 2)
    )
    kw.update(keywords)

    # check for parameters and add clustering, in order to make sure that
    # analyses are not repeated
    if not hasattr(self, 'params'):
        self.params = {}
    self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
        method, cluster_method, threshold)
    self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

    matrices = self._get_partial_matrices(
        method=method,
        scale=scale,
        factor=factor,
        restricted_chars=restricted_chars,
        mode=mode,
        gop=gop,
        imap_mode=kw['imap_mode'],
        split_on_tones=split_on_tones)

    k = 0  # offset that keeps the IDs of different concepts apart
    C = defaultdict(list)  # stores the pcogids
    G = {}  # stores the graphs
    with pb(desc='PARTIAL SEQUENCE CLUSTERING',
            total=len(self.rows)) as progress:
        for concept, trace, matrix in matrices:
            progress.update(1)
            lingpy.log.info('Analyzing concept {0}...'.format(concept))
            if external_function:
                c = external_function(
                    threshold, matrix,
                    taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'infomap':
                c = extra.infomap_clustering(
                    threshold, matrix,
                    taxa=list(range(len(matrix))), revert=True)
            elif cluster_method == 'mcl':
                c = clustering.mcl(
                    threshold, matrix,
                    taxa=list(range(len(matrix))),
                    max_steps=kw['max_steps'],
                    inflation=kw['inflation'],
                    expansion=kw['expansion'],
                    add_self_loops=kw['add_self_loops'],
                    logs=kw['mcl_logs'],
                    revert=True)
            elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                c = clustering.flat_cluster(
                    cluster_method, threshold, matrix, revert=True)
            else:
                raise ValueError("No suitable cluster method specified.")

            # one ID per morpheme, appended in the order given by trace
            for i, (idx, pos, slc) in enumerate(trace):
                C[idx] += [c[i] + k]

            # number of IDs handed out by the post-processing (if any)
            n_components = 0
            if kw['post_processing']:
                _g = nx.Graph()
                for i, (idx, pos, slc) in enumerate(trace):
                    _g.add_node((i, idx, pos))

                remove_edges = []
                for (_, n1), (_, n2) in combinations2(
                        enumerate(_g.nodes())):
                    if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                        _g.add_edge(n1, n2)
                        if n1[1] == n2[1]:
                            # two morphemes of the same word ended up in
                            # one cluster: get scores for n1 and n2 with
                            # all the rest in the matrix to decide for
                            # one; the node with the higher mean value
                            # loses all its edges
                            sn1 = sum(
                                row[n1[0]] for row in matrix) / len(matrix)
                            sn2 = sum(
                                row[n2[0]] for row in matrix) / len(matrix)
                            if sn1 <= sn2:
                                remove_edges += [n2]
                            else:
                                remove_edges += [n1]

                for node in remove_edges:
                    # sorted() copies the neighbours, so that the graph
                    # can be modified while looping
                    for edge in sorted(_g[node]):
                        _g.remove_edge(node, edge)

                for n_components, coms in enumerate(
                        nx.connected_components(_g), start=1):
                    cogid = n_components + k
                    for _, idx, pos in coms:
                        C[idx][pos] = cogid

                G[concept] = _g

            # Post-processing can split clusters and thereby hand out more
            # IDs than the clustering itself produced; advance the offset
            # past the highest ID in use so that the IDs of the next
            # concept cannot collide with those of this one.
            k += max(max(c.values()), n_components)

    self.add_entries(ref or self._partials, C, lambda x: x)
    self.graphs = G
def partial_cluster(self,
                    method='sca',
                    threshold=0.45,
                    scale=0.5,
                    factor=0.3,
                    restricted_chars='_T',
                    mode='overlap',
                    cluster_method='infomap',
                    gop=-1,
                    restriction='',
                    ref='',
                    external_function=None,
                    split_on_tones=False,
                    **keywords):
    """
    Cluster the words into partial cognate sets.

    For every concept, the partial matrix delivered by
    ``self._get_partial_matrices`` is clustered, the cluster labels are
    shifted by a running offset, optionally post-processed, and the
    resulting lists of partial cognate IDs are stored with
    ``self.add_entries``.

    Parameters
    ----------
    method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
        Select the method that shall be used for the calculation.
    threshold : float (default=0.45)
        Select the threshold for the cluster approach.
    scale : float (default=0.5)
        Select the scale for the gap extension penalty.
    factor : float (default=0.3)
        Select the factor for extra scores for identical prosodic
        segments.
    restricted_chars : str (default="_T")
        Select the restricted chars (boundary markers) in the prosodic
        strings in order to enable secondary alignment.
    mode : {'global','local','overlap','dialign'} (default='overlap')
        Select the mode for the alignment analysis.
    cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
        Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
        average linkage clustering, 'mcl' refers to the "Markov Clustering
        Algorithm" (:evobib:`Dongen2000`). Ignored if *external_function*
        is given.
    gop : int (default=-1)
        If 'sca' is selected as a method, define the gap opening penalty.
    restriction : {'cv'} (default="")
        Specify the restriction for calculations using the edit-distance.
        NOTE(review): this argument is accepted but not forwarded to the
        matrix computation by this method.
    ref : str (default="")
        Name of the column in which the partial cognate IDs are stored;
        if empty, ``self._partials`` is used.
    external_function : callable (default=None)
        If given, it is called instead of the built-in cluster methods as
        ``external_function(threshold, matrix, taxa=..., revert=True)``.
    split_on_tones : bool (default=False)
        Passed on to ``self._get_partial_matrices``.
    **keywords
        Optional settings: *imap_mode* (default=True), *post_processing*
        (default=True), and the MCL parameters *inflation* (default=2),
        *expansion* (default=2), *max_steps* (default=1000),
        *add_self_loops* (default=True) and *mcl_logs*.

    Raises
    ------
    ValueError
        If no *external_function* is given and *cluster_method* is not one
        of the supported names.

    Notes
    -----
    Besides adding the entries, the method updates
    ``self.params['partial_cluster']`` and ``self._stamp`` and stores the
    post-processing graphs (one per concept) in ``self.graphs``.
    """
    kw = dict(imap_mode=True,
              post_processing=True,
              inflation=2,
              expansion=2,
              max_steps=1000,
              add_self_loops=True,
              sep=lingpy.settings.rcParams['morpheme_separator'],
              word_sep=lingpy.settings.rcParams['word_separator'],
              word_seps=lingpy.settings.rcParams['word_separators'],
              seps=lingpy.settings.rcParams['morpheme_separators'],
              mcl_logs=lambda x: -np.log2((1 - x)**2))
    kw.update(keywords)

    # check for parameters and add clustering, in order to make sure that
    # analyses are not repeated
    if not hasattr(self, 'params'):
        self.params = {}
    self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
        method, cluster_method, threshold)
    self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

    matrices = self._get_partial_matrices(
        method=method,
        scale=scale,
        factor=factor,
        restricted_chars=restricted_chars,
        mode=mode,
        gop=gop,
        imap_mode=kw['imap_mode'],
        split_on_tones=split_on_tones)
    k = 0  # running offset added to the cluster labels of each concept
    C = defaultdict(list)  # stores the pcogids
    G = {}  # stores the graphs
    with util.pb(desc='PARTIAL SEQUENCE CLUSTERING',
                 total=len(self.rows)) as progress:
        for concept, trace, matrix in matrices:
            progress.update(1)
            lingpy.log.info('Analyzing concept {0}...'.format(concept))
            # an external function takes precedence over cluster_method
            if external_function:
                c = external_function(threshold,
                                      matrix,
                                      taxa=list(range(len(matrix))),
                                      revert=True)
            elif cluster_method == 'infomap':
                c = extra.infomap_clustering(threshold,
                                             matrix,
                                             taxa=list(range(len(matrix))),
                                             revert=True)
            elif cluster_method == 'mcl':
                c = clustering.mcl(threshold,
                                   matrix,
                                   taxa=list(range(len(matrix))),
                                   max_steps=kw['max_steps'],
                                   inflation=kw['inflation'],
                                   expansion=kw['expansion'],
                                   add_self_loops=kw['add_self_loops'],
                                   logs=kw['mcl_logs'],
                                   revert=True)
            elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                c = clustering.flat_cluster(cluster_method,
                                            threshold,
                                            matrix,
                                            revert=True)
            else:
                raise ValueError("No suitable cluster method specified.")

            # append one ID per trace entry to the list of its word (idx)
            for i, (idx, pos, slc) in enumerate(trace):
                C[idx] += [c[i] + k]

            if kw['post_processing']:
                # one node per trace entry: (matrix index, word, position)
                _g = nx.Graph()
                for i, (idx, pos, slc) in enumerate(trace):
                    _g.add_node((i, idx, pos))

                # nodes that shall lose all their edges
                remove_edges = []
                for (i, n1), (j, n2) in util.combinations2(
                        enumerate(_g.nodes())):
                    # link all nodes that received the same ID
                    if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                        _g.add_edge(n1, n2)
                        # both nodes belong to the same word
                        if n1[1] == n2[1]:
                            # get scores for n1 and n2 with all the rest in
                            # the matrix to decide for one
                            sn1, sn2 = 0, 0
                            for i, row in enumerate(matrix):
                                sn1 += matrix[i][n1[0]]
                                sn2 += matrix[i][n2[0]]
                            sn1 = sn1 / len(matrix)
                            sn2 = sn2 / len(matrix)
                            # the node with the higher mean value is
                            # scheduled for isolation
                            if sn1 <= sn2:
                                remove_edges += [n2]
                            else:
                                remove_edges += [n1]

                for node in remove_edges:
                    # sorted() yields a copy of the neighbours, so the
                    # graph can be modified inside the loop
                    for edge in sorted(_g[node]):
                        _g.remove_edge(node, edge)

                # every connected component becomes one cognate set
                for i, coms in enumerate(nx.connected_components(_g)):
                    cogid = i + 1 + k
                    for j, idx, pos in coms:
                        C[idx][pos] = cogid

                G[concept] = _g

            # there are at most len(matrix) components per concept, so this
            # step keeps the IDs of different concepts apart
            k += len(matrix) + 1
    self.add_entries(ref or self._partials, C, lambda x: x)
    self.graphs = G
# Pairwise distances between all embeddings.
dists = np.zeros((len(embeddings), len(embeddings)))
for u, emb_u in enumerate(embeddings):
    print(u, "/", len(embeddings))
    for v, emb_v in enumerate(embeddings):
        print(u, "/", len(embeddings), " - ", v, "/", len(embeddings))
        # Compare the embedding vectors themselves; passing the loop
        # indices would only yield the difference of the positions.
        dists[u, v] = euclidean(emb_u, emb_v)

from pairwise_evaluation import PairwiseEvaluation

# These values do not depend on the threshold.
n_cognate_classes = len(set(cognate_classes))
n_concepts = len(set(global_ids))
y_true = cognate_classes

# Evaluate flat UPGMA clusterings of the distance matrix for a range of
# thresholds.
for threshold in [
        0.9, 0.8500000000000001, 0.8, 0.75, 0.7000000000000001, 0.65,
        0.6000000000000001, 0.55, 0.5, 0.45, 0.4, 0.35000000000000003,
        0.30000000000000004, 0.25, 0.2, 0.15000000000000002, 0.1, 0.05
]:
    cluster2ids = flat_cluster("upgma", threshold=threshold, matrix=dists)

    # Turn the mapping {cluster: [item indices]} into one label per item.
    y_pred = np.zeros(len(X))
    for cluster in cluster2ids:
        y_pred[cluster2ids[cluster]] = cluster
    #y_pred = ap.fit_predict(embeddings)

    # Random labels; not used further in this fragment, but the call is
    # kept because it draws from NumPy's global random state.
    y_random = np.random.randint(0, int(n_cognate_classes / n_concepts),
                                 y_pred.shape)

    # NOTE(review): precision, recall and f1 are computed but not reported.
    pe = PairwiseEvaluation(X, y_true, y_pred)
    precision, recall, f1 = pe.getPrecisionRecallF1()

    print("--------------------------")
    print(threshold)
    print(metrics.adjusted_rand_score(y_true, y_pred))
def test_flat_cluster(self):
    """
    Smoke test for ``flat_cluster``.

    Every linkage method is run on ``self.matrix`` with a threshold of 0.5:
    with ``self.taxa`` and both ``revert`` settings, and once with
    ``False`` in place of the taxa. The test only checks that none of the
    calls raises.
    """
    for linkage in ('upgma', 'single', 'complete', 'ward'):
        with_taxa = (linkage, 0.5, self.matrix, self.taxa)
        flat_cluster(*with_taxa, revert=True)
        flat_cluster(*with_taxa, revert=False)
        # same call with False instead of the taxa
        flat_cluster(linkage, 0.5, self.matrix, False, revert=False)
# Group the embeddings and the cognate classes by the concept they belong
# to (global_ids[i] is the concept of item i).
concepts2embeddings = {
    concept: [emb for i, emb in enumerate(embeddings)
              if global_ids[i] == concept]
    for concept in set(sorted(global_ids))
}
concepts2cognate_classes = {
    concept: [cog for i, cog in enumerate(cognate_classes)
              if global_ids[i] == concept]
    for concept in set(sorted(global_ids))
}

#for damping_factor in np.arange(0.5,1,0.05):
from lingpy.algorithm.clustering import flat_cluster

# Pairwise distances between all embeddings.
dists = np.zeros((len(embeddings), len(embeddings)))
for u, emb_u in enumerate(embeddings):
    print(u, "/", len(embeddings))
    for v, emb_v in enumerate(embeddings):
        print(u, "/", len(embeddings), " - ", v, "/", len(embeddings))
        # Compare the embedding vectors themselves; passing the loop
        # indices would only yield the difference of the positions.
        dists[u, v] = euclidean(emb_u, emb_v)

from pairwise_evaluation import PairwiseEvaluation

# These values do not depend on the threshold.
n_cognate_classes = len(set(cognate_classes))
n_concepts = len(set(global_ids))
y_true = cognate_classes

# Evaluate flat UPGMA clusterings of the distance matrix for a range of
# thresholds.
for threshold in [
        0.9, 0.8500000000000001, 0.8, 0.75, 0.7000000000000001, 0.65,
        0.6000000000000001, 0.55, 0.5, 0.45, 0.4, 0.35000000000000003,
        0.30000000000000004, 0.25, 0.2, 0.15000000000000002, 0.1, 0.05
]:
    cluster2ids = flat_cluster("upgma", threshold=threshold, matrix=dists)

    # Turn the mapping {cluster: [item indices]} into one label per item.
    y_pred = np.zeros(len(X))
    for cluster in cluster2ids:
        y_pred[cluster2ids[cluster]] = cluster
    #y_pred = ap.fit_predict(embeddings)

    # Random labels; not used further in this fragment, but the call is
    # kept because it draws from NumPy's global random state.
    y_random = np.random.randint(0, int(n_cognate_classes / n_concepts),
                                 y_pred.shape)

    # NOTE(review): precision, recall and f1 are computed but not reported.
    pe = PairwiseEvaluation(X, y_true, y_pred)
    precision, recall, f1 = pe.getPrecisionRecallF1()

    print("--------------------------")
    print(threshold)
    print(metrics.adjusted_rand_score(y_true, y_pred))
    print(metrics.adjusted_mutual_info_score(y_true, y_pred))
def test_flat_cluster(self):
    """
    Smoke test for ``lingpy.algorithm.clustering.flat_cluster``.

    Runs every linkage method on ``self.matrix`` with a threshold of 0.5
    for three argument combinations; the test passes if no call raises.
    """
    from lingpy.algorithm.clustering import flat_cluster

    linkages = ['upgma', 'single', 'complete', 'ward']
    for linkage in linkages:
        # (fourth positional argument, revert flag) for the three calls
        variants = (
            (self.taxa, True),
            (self.taxa, False),
            (False, False),
        )
        for fourth_arg, revert_flag in variants:
            flat_cluster(
                linkage, 0.5, self.matrix, fourth_arg, revert=revert_flag)