Example 1
def _get_colexifications(wordlist,
                         entry='ipa',
                         concept='concept',
                         family='family'):
    """
    Helper function that computes colexifications for a given set of languages
    in a wordlist.
    """
    if family not in wordlist.header:
        family = 'doculect'

    taxa = wordlist.cols
    colexifications = []
    for taxon in taxa:
        log.info('Analyzing taxon {0}...'.format(taxon))

        tmp_idxs = wordlist.get_list(taxon=taxon, flat=True)
        tmp_family = wordlist[tmp_idxs[0], family]
        tmp_concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        tmp_entries = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all concepts and add them to the graph
        for (i, c1), (j, c2) in combinations2(enumerate(tmp_concepts)):
            if tmp_entries[i] == tmp_entries[j] and c1 != c2:
                colexifications += [(c1, c2, taxon, tmp_family, tmp_entries[i])]

    return colexifications
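
The snippet above relies on LingPy's Wordlist API, but the underlying idea is simple: two different concepts are colexified in a language when they are expressed by the same form. A self-contained sketch of that check on a hypothetical concept-to-form mapping:

from itertools import combinations

# hypothetical data: one language, concepts mapped to their forms
forms = {'arm': 'ruka', 'hand': 'ruka', 'leg': 'noga', 'foot': 'stopa'}

colexifications = []
for (c1, f1), (c2, f2) in combinations(sorted(forms.items()), 2):
    if f1 == f2 and c1 != c2:
        colexifications.append((c1, c2, f1))

print(colexifications)  # [('arm', 'hand', 'ruka')]
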
Example 2
    def add_cognate_ids(self, source, target, idtype='strict', override=False):
        """
        Compute normal cognate identifiers from partial cognate sets.

        Parameters
        ----------
        source: str
            Name of the source column in your wordlist file.
        target : str
            Name of the target column in your wordlist file.
        idtype : str (default="strict")
            Select between "strict" and "loose".
        override: bool (default=False)
            Specify whether you want to override existing columns.
        
        Notes
        -----
        While the computation of strict cognate IDs from partial cognate IDs is
        straightforward and just judges those words as cognate which are
        identical in all their parts, the computation of loose cognate IDs
        constructs a network between all words, draws lines between all words
        that share a common morpheme, and judges all connected components in this
        network as cognate.
        """
        if idtype == 'strict':

            tmp = defaultdict(list)
            for k in self._data:
                tmp[tuple(self[k, source])] += [k]
            idx = 1
            D = {}
            for vals in tmp.values():
                for k in vals:
                    D[k] = idx
                idx += 1
            self.add_entries(target, D, lambda x: x, override=override)
        elif idtype == 'loose':

            D = {}
            idx = 1
            for c in self.rows:
                idxs = self.get_list(row=c, flat=True)
                srcs = [self[k, source] for k in idxs]

                # get connected components
                g = nx.Graph()
                g.add_nodes_from(idxs)
                for (i, cogsA), (j,
                                 cogsB) in util.combinations2(zip(idxs, srcs)):
                    if [x for x in cogsA if x in cogsB]:
                        g.add_edge(i, j)
                for i, comps in enumerate(nx.connected_components(g)):
                    for comp in comps:
                        D[comp] = idx + i
                idx += (i + 1)
            self.add_entries(target, D, lambda x: x, override=override)
        else:
            raise ValueError("The value you selected is not available.")
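
The Notes above can be made concrete with a toy example. Given partial cognate IDs A = (1, 2), B = (1, 3), C = (3, 4) and D = (5,), the strict variant assigns four different cognate IDs, because no two tuples are identical, while the loose variant links A with B (shared morpheme 1) and B with C (shared morpheme 3) and therefore places A, B and C in one cognate set. A minimal sketch of the loose logic with networkx (toy data, not the library call):

import networkx as nx
from itertools import combinations

# hypothetical partial cognate sets per word
partial = {'A': (1, 2), 'B': (1, 3), 'C': (3, 4), 'D': (5,)}

g = nx.Graph()
g.add_nodes_from(partial)
for (w1, p1), (w2, p2) in combinations(partial.items(), 2):
    if set(p1) & set(p2):  # at least one shared morpheme
        g.add_edge(w1, w2)

loose = {}
for idx, component in enumerate(nx.connected_components(g), 1):
    for word in component:
        loose[word] = idx

print(loose)  # e.g. {'A': 1, 'B': 1, 'C': 1, 'D': 2}
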
Example 3
    def add_cognate_ids(self, source, target, idtype='strict', override=False):
        """
        Compute normal cognate identifiers from partial cognate sets.

        Parameters
        ----------
        source: str
            Name of the source column in your wordlist file.
        target : str
            Name of the target column in your wordlist file.
        idtype : str (default="strict")
            Select between "strict" and "loose".
        override: bool (default=False)
            Specify whether you want to override existing columns.
        
        Notes
        -----
        While the computation of strict cognate IDs from partial cognate IDs is
        straightforward and just judges those words as cognate which are
        identical in all their parts, the computation of loose cognate IDs
        constructs a network between all words, draws lines between all words
        that share a common morpheme, and judges all connected components in this
        network as cognate.
        """
        if idtype == 'strict':
            
            tmp = defaultdict(list)
            for k in self._data:
                tmp[tuple(self[k, source])] += [k]
            idx = 1
            D = {}
            for vals in tmp.values():
                for k in vals:
                    D[k] = idx
                idx += 1
            self.add_entries(target, D, lambda x: x, override=override)
        elif idtype == 'loose':

            D = {}
            idx = 1
            for c in self.rows:
                idxs = self.get_list(row=c, flat=True)
                srcs = [self[k, source] for k in idxs]

                # get connected components
                g = nx.Graph()
                g.add_nodes_from(idxs)
                for (i, cogsA), (j, cogsB) in combinations2(zip(idxs, srcs)):
                    if [x for x in cogsA if x in cogsB]:
                        g.add_edge(i, j)
                for i, comps in enumerate(nx.connected_components(g)):
                    for comp in comps:
                        D[comp] = idx + i
                idx += (i + 1)
            self.add_entries(target, D, lambda x: x, override=override)
        else:
            raise ValueError("The value you selected is not available.")
Example 4
def test_combinations2():
    def f(l):
        for i, a1 in enumerate(l):
            for j, a2 in enumerate(l):
                if i < j:
                    yield a1, a2

    def fm(l):
        for i, a1 in enumerate(l):
            for j, a2 in enumerate(l):
                if i <= j:
                    yield a1, a2

    for ch in [list(range(5)), 'abcdefg']:
        assert list(util.combinations2(ch)) == list(f(ch))
        assert list(util.multicombinations2(ch)) == list(fm(ch))
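
The two reference implementations in this test spell out the semantics of the helpers: util.combinations2 yields all unordered pairs of distinct elements (i < j), and util.multicombinations2 additionally pairs each element with itself (i <= j). These correspond to itertools.combinations and itertools.combinations_with_replacement with r=2, so an equivalent sanity check could read (a sketch, not part of the test suite):

from itertools import combinations, combinations_with_replacement

data = 'abcd'
assert list(combinations(data, 2)) == [
    ('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
assert list(combinations_with_replacement(data, 2))[0] == ('a', 'a')
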
Example 5
    def test_combinations2(self):
        def f(l):
            for i, a1 in enumerate(l):
                for j, a2 in enumerate(l):
                    if i < j:
                        yield a1, a2

        def fm(l):
            for i, a1 in enumerate(l):
                for j, a2 in enumerate(l):
                    if i <= j:
                        yield a1, a2

        for l in [list(range(5)), 'abcdefg']:
            self.assertEqual(list(util.combinations2(l)), list(f(l)))
            self.assertEqual(list(util.multicombinations2(l)), list(fm(l)))
Example 6
    def test_combinations2(self):
        def f(l):
            for i, a1 in enumerate(l):
                for j, a2 in enumerate(l):
                    if i < j:
                        yield a1, a2

        def fm(l):
            for i, a1 in enumerate(l):
                for j, a2 in enumerate(l):
                    if i <= j:
                        yield a1, a2

        for l in [list(range(5)), 'abcdefg']:
            self.assertEqual(list(util.combinations2(l)), list(f(l)))
            self.assertEqual(list(util.multicombinations2(l)), list(fm(l)))
Example 7
def _make_matrix(taxa, colex):
    """
    Take colexification data and use it to create a distance matrix.

    Notes
    -----
    "colex" is a dictionary with taxon names as keys and colexification data in
    form of tuples of concepts, not necessarily ordered, in both directions, as
    values.
    """
    # calculate the matrix
    matrix = [[0 for i in range(len(colex))] for j in range(len(colex))]
    for (i, t1), (j, t2) in combinations2(enumerate(taxa)):
        intersection = colex[t1].intersection(colex[t2])
        union = colex[t1].union(colex[t2])
        matrix[i][j] = matrix[j][i] = 1 - len(intersection) / len(union)
    return matrix
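
The value stored in each cell is the Jaccard distance between the colexification sets of two taxa: one minus the size of the intersection divided by the size of the union. A quick worked example with two toy sets:

colex_a = {('arm', 'hand'), ('sky', 'cloud')}
colex_b = {('arm', 'hand'), ('fire', 'wood')}

intersection = colex_a & colex_b   # {('arm', 'hand')}
union = colex_a | colex_b          # three distinct pairs
distance = 1 - len(intersection) / len(union)
print(distance)  # 1 - 1/3 = 0.666...
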
Example 8
def _make_matrix(taxa, colex):
    """
    Take colexification data and use it to create a distance matrix.

    Notes
    -----
    "colex" is a dictionary with taxon names as keys and colexification data in
    form of tuples of concepts, not necessarily ordered, in both directions, as
    values.
    """
    # calculate the matrix
    matrix = [[0 for i in range(len(colex))] for j in range(len(colex))]
    for (i, t1), (j, t2) in combinations2(enumerate(taxa)):
        intersection = colex[t1].intersection(colex[t2])
        union = colex[t1].union(colex[t2])
        matrix[i][j] = matrix[j][i] = 1 - len(intersection) / len(union)
    return matrix
Example 9
def _get_wad(matrix, threshold, use_log=False):
    """
    Get weighted average degree.
    """
    def log_f(x):
        return -np.log(1 - x) if use_log else x

    degreeDict = defaultdict(list)

    for i, j in util.combinations2(range(len(matrix))):
        score = matrix[i][j]
        if score < threshold:
            deg = log_f(score)
            degreeDict[i].append(deg)
            degreeDict[j].append(deg)

    deg_sum = 0
    for weights in degreeDict.values():
        deg = sum(weights)
        deg_sum += deg

    if degreeDict:
        return deg_sum / len(degreeDict)
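
In other words, every pairwise distance below the threshold contributes one edge, the edge weight (the raw or log-transformed score) is credited to both endpoints, and the function returns the average of the per-node weight sums. A worked toy example with hypothetical values:

matrix = [[0.0, 0.2, 0.9],
          [0.2, 0.0, 0.4],
          [0.9, 0.4, 0.0]]

# with threshold=0.5 and use_log=False, the edges kept are (0, 1) with
# weight 0.2 and (1, 2) with weight 0.4; the per-node sums are
# 0 -> 0.2, 1 -> 0.6, 2 -> 0.4, so
# _get_wad(matrix, 0.5) returns (0.2 + 0.6 + 0.4) / 3, i.e. roughly 0.4
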
Example 10
def _get_colexifications(wordlist, entry='ipa', concept='concept', family='family'):
    """
    Helper function that computes colexifications for a given set of languages
    in a wordlist.
    """
    if family not in wordlist.header:
        family = 'doculect'

    taxa = wordlist.cols
    colexifications = []
    for taxon in taxa:
        log.info('Analyzing taxon {0}...'.format(taxon))

        tmp_idxs = wordlist.get_list(taxon=taxon, flat=True)
        tmp_family = wordlist[tmp_idxs[0], family]
        tmp_concepts = wordlist.get_list(taxon=taxon, flat=True, entry=concept)
        tmp_entries = wordlist.get_list(taxon=taxon, flat=True, entry=entry)

        # iterate over all concepts and add them to the graph
        for (i, c1), (j, c2) in combinations2(enumerate(tmp_concepts)):
            if tmp_entries[i] == tmp_entries[j] and c1 != c2:
                colexifications += [(c1, c2, taxon, tmp_family, tmp_entries[i])]

    return colexifications
Example 11
    def partial_cluster(
            self,
            method='sca',
            threshold=0.45,
            scale=0.5,
            factor=0.3,
            restricted_chars='_T',
            mode='overlap',
            cluster_method='infomap',
            gop=-1,
            restriction='',
            ref='',
            external_function=None,
            split_on_tones=True,
            **keywords):
        """
        Cluster the words into partial cognate sets.

        Function for flat clustering of words into cognate sets.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        cluster_method : {'upgma', 'single', 'complete', 'mcl', 'infomap'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' refers to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`).
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. If set to c{False},
            an automatic threshold will be calculated by calculating the
            average distance of unrelated sequences (use with care).
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="T_")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        verbose : bool (default=False)
            Define whether verbose output should be used or not.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : {'cv'} (default="")
            Specify the restriction for calculations using the edit-distance.
            Currently, only "cv" is supported. If *edit-dist* is selected as
            *method* and *restriction* is set to *cv*, consonant-vowel matches
            will be prohibited in the calculations and the edit distance will
            be normalized by the length of the alignment rather than the length
            of the longest sequence, as described in :evobib:`Heeringa2006`.
        inflation : {int, float} (default=2)
            Specify the inflation parameter for the use of the MCL algorithm.
        expansion : int (default=2)
            Specify the expansion parameter for the use of the MCL algorithm.
        
        """
        kw = dict(
                imap_mode=True,
                post_processing=False,
                inflation=2,
                expansion=2,
                max_steps=1000,
                add_self_loops=True,
                sep=lingpy.settings.rcParams['morpheme_separator'],
                word_sep=lingpy.settings.rcParams['word_separator'],
                word_seps=lingpy.settings.rcParams['word_separators'],
                seps=lingpy.settings.rcParams['morpheme_separators'],
                mcl_logs=lambda x: -np.log2((1 - x) ** 2)
                )
        kw.update(keywords)        

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(method=method, scale=scale,
                factor=factor, restricted_chars=restricted_chars, mode=mode,
                gop=gop, imap_mode=kw['imap_mode'],
                split_on_tones=split_on_tones)
        k = 0
        C = defaultdict(list) # stores the pcogids
        G = {} # stores the graphs
        with pb(desc='PARTIAL SEQUENCE CLUSTERING', total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(threshold, matrix,
                            taxa=list(range(len(matrix))), revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                            matrix, taxa=list(range(len(matrix))), 
                            revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold, matrix, 
                            taxa = list(range(len(matrix))),
                            max_steps=kw['max_steps'],
                            inflation=kw['inflation'],
                            expansion=kw['expansion'],
                            add_self_loops=kw['add_self_loops'],
                            logs=kw['mcl_logs'],
                            revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(cluster_method,
                            threshold, matrix,
                            revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")
                
                for i, (idx, pos, slc) in enumerate(trace):
                    C[idx] += [c[i] + k]
                if kw['post_processing']:
                    _g = nx.Graph()
                    for i, (idx, pos, slc) in enumerate(trace):
                        _g.add_node((i, idx, pos))
                    remove_edges = []
                    for (i, n1), (j, n2) in combinations2(enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1, sn2 = 0, 0
                                for i, row in enumerate(matrix):
                                    sn1 += matrix[i][n1[0]]
                                    sn2 += matrix[i][n2[0]]
                                sn1 = sn1 / len(matrix)
                                sn2 = sn2 / len(matrix)
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    for i, coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j, idx, pos in coms:
                            C[idx][pos] = cogid
                    
                    G[concept] = _g

                k += max(c.values())
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
Example 12
    def partial_cluster(self,
                        method='sca',
                        threshold=0.45,
                        scale=0.5,
                        factor=0.3,
                        restricted_chars='_T',
                        mode='overlap',
                        cluster_method='infomap',
                        gop=-1,
                        restriction='',
                        ref='',
                        external_function=None,
                        split_on_tones=False,
                        **keywords):
        """
        Cluster the words into partial cognate sets.

        Function for flat clustering of words into cognate sets.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        cluster_method : {'upgma', 'single', 'complete', 'mcl', 'infomap'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' refers to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`).
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. If set to c{False},
            an automatic threshold will be calculated by calculating the
            average distance of unrelated sequences (use with care).
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="T_")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        verbose : bool (default=False)
            Define whether verbose output should be used or not.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : {'cv'} (default="")
            Specify the restriction for calculations using the edit-distance.
            Currently, only "cv" is supported. If *edit-dist* is selected as
            *method* and *restriction* is set to *cv*, consonant-vowel matches
            will be prohibited in the calculations and the edit distance will
            be normalized by the length of the alignment rather than the length
            of the longest sequence, as described in :evobib:`Heeringa2006`.
        inflation : {int, float} (default=2)
            Specify the inflation parameter for the use of the MCL algorithm.
        expansion : int (default=2)
            Specify the expansion parameter for the use of the MCL algorithm.
        
        """
        kw = dict(imap_mode=True,
                  post_processing=True,
                  inflation=2,
                  expansion=2,
                  max_steps=1000,
                  add_self_loops=True,
                  sep=lingpy.settings.rcParams['morpheme_separator'],
                  word_sep=lingpy.settings.rcParams['word_separator'],
                  word_seps=lingpy.settings.rcParams['word_separators'],
                  seps=lingpy.settings.rcParams['morpheme_separators'],
                  mcl_logs=lambda x: -np.log2((1 - x)**2))
        kw.update(keywords)

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(
            method=method,
            scale=scale,
            factor=factor,
            restricted_chars=restricted_chars,
            mode=mode,
            gop=gop,
            imap_mode=kw['imap_mode'],
            split_on_tones=split_on_tones)
        k = 0
        C = defaultdict(list)  # stores the pcogids
        G = {}  # stores the graphs
        with util.pb(desc='PARTIAL SEQUENCE CLUSTERING',
                     total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(threshold,
                                          matrix,
                                          taxa=list(range(len(matrix))),
                                          revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                                                 matrix,
                                                 taxa=list(range(len(matrix))),
                                                 revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold,
                                       matrix,
                                       taxa=list(range(len(matrix))),
                                       max_steps=kw['max_steps'],
                                       inflation=kw['inflation'],
                                       expansion=kw['expansion'],
                                       add_self_loops=kw['add_self_loops'],
                                       logs=kw['mcl_logs'],
                                       revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(cluster_method,
                                                threshold,
                                                matrix,
                                                revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")

                for i, (idx, pos, slc) in enumerate(trace):
                    C[idx] += [c[i] + k]
                if kw['post_processing']:
                    _g = nx.Graph()
                    for i, (idx, pos, slc) in enumerate(trace):
                        _g.add_node((i, idx, pos))
                    remove_edges = []
                    for (i, n1), (j, n2) in util.combinations2(
                            enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1, sn2 = 0, 0
                                for i, row in enumerate(matrix):
                                    sn1 += matrix[i][n1[0]]
                                    sn2 += matrix[i][n2[0]]
                                sn1 = sn1 / len(matrix)
                                sn2 = sn2 / len(matrix)
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    for i, coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j, idx, pos in coms:
                            C[idx][pos] = cogid

                    G[concept] = _g

                k += len(matrix) + 1
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
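
A typical workflow combines partial_cluster with the add_cognate_ids method shown in the earlier examples: first cluster the morphemes into partial cognate sets, then derive word-level cognate IDs from them. A minimal sketch, assuming a wordlist file with segmented, morpheme-separated forms ('wordlist.tsv' is a placeholder name):

from lingpy.compare.partial import Partial

part = Partial('wordlist.tsv')
part.partial_cluster(method='sca', threshold=0.45,
                     cluster_method='infomap',  # requires python-igraph; 'upgma' works without it
                     ref='cogids')
part.add_cognate_ids('cogids', 'cogid', idtype='loose')
part.output('tsv', filename='wordlist-cognates')
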
Example 13
def partition_density(matrix, t):
    """
    Calculate partition density for a given threshold on a distance matrix.

    Notes
    -----
    See :evobib:`Ahn2012` for details on the calculation of partition density
    in a given network.
    """

    # compute cutoff for matrix at t
    m = np.zeros((len(matrix), len(matrix)))

    for i, j in util.combinations2(range(len(matrix))):
        if matrix[i][j] < t:
            m[j][i] = m[i][j] = 1

    # get the total number of links
    T = sum(m.flatten()) / 2

    # get connected components
    nodes = list(range(len(m)))
    idx = 1
    parts = [0 for i in range(len(m))]

    for i, j in util.combinations2(range(len(m))):
        if m[i][j] == 1:
            if parts[i] == parts[j] and parts[i] != 0:
                pass
            else:
                # most complicated, update all the stuff
                if parts[i] > 0 and parts[j] > 0:

                    # determine best idx
                    if parts[i] > parts[j]:
                        this = parts[j]
                        other = parts[i]
                    else:
                        this = parts[i]
                        other = parts[j]

                    # merge the components: relabel all nodes carrying the other index
                    idxs = [n for n in nodes if parts[n] == other]
                    for n in idxs:
                        parts[n] = this
                elif parts[i] == 0 and parts[j] == 0:
                    parts[i] = idx
                    parts[j] = idx
                    idx += 1
                elif parts[i] > 0:
                    parts[j] = parts[i]
                elif parts[j] > 0:
                    parts[i] = parts[j]

    # finish unconnected components
    for i, p in enumerate(parts):
        if p == 0:
            parts[i] = max(parts) + 1

    # convert to dictionary
    components = sorted(set(parts))

    # return zero, if all components are different
    if len(components) == len(m):
        return 0.0, len(components)

    # count density
    D = 0

    for part in components:
        # get nodes
        nodes = [n for n in range(len(parts)) if parts[n] == part]

        # get edges
        edges = 0
        for i, j in util.combinations2(range(len(nodes))):
            if m[nodes[i]][nodes[j]] == 1:
                edges += 1

        N = len(nodes)
        M = edges

        # calculate the community's contribution to the partition density
        x = 1
        try:
            d = M * (M - (N - x)) / ((N - 1 + x) * (N - x))
            D += d
        except ZeroDivisionError:
            pass

    return 2 / T * D, len(components)
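
In practice partition_density is evaluated over a range of thresholds in order to pick the cutoff at which the network decomposes into the densest communities. A hedged sketch of such a scan, assuming partition_density and a distance matrix are already available:

# scan candidate thresholds and keep the one with the highest partition density
best_t, best_d = 0.0, -1.0
for t in [i / 20 for i in range(1, 20)]:   # 0.05, 0.10, ..., 0.95
    density, n_components = partition_density(matrix, t)
    if density > best_d:
        best_t, best_d = t, density
print(best_t, best_d)
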
Example 14
def mcl(threshold,
        matrix,
        taxa,
        max_steps=1000,
        inflation=2,
        expansion=2,
        add_self_loops=True,
        revert=False,
        logs=True,
        matrix_type="distances"):
    """
    Carry out a clustering using the MCL algorithm (:evobib:`Dongen2000`).

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the matrix
        will be used directly.

    matrix : list
        A two-dimensional list containing the distances.

    taxa : list
        A list containing the names of all taxa corresponding to the distances
        in the matrix.

    max_steps : int (default=1000)
        Maximal number of iterations.

    inflation : int (default=2)
        Inflation parameter for the MCL algorithm.

    expansion : int (default=2)
        Expansion parameter of the MCL algorithm.

    add_self_loops : {True, False, builtins.function} (default=True)
        Determine whether self-loops should be added, and if so, how they
        should be weighted. If a function for the calculation of self-loops is
        given, it will take the whole column of the matrix for each taxon as
        input.

    logs : { bool, function } (default=True)
        If set to c{True}, the logarithm of the score beyond the threshold will
        be assigned as weight to the graph. If set to c{False} all weights will
        be set to 1. Use a custom function to define individual ways to
        calculate the weights.

    matrix_type : { "distances", "similarities" }
        Specify the type of the matrix. If the matrix contains distance data,
        it will be adapted to similarity data. If it contains "similarities",
        no adaptation is needed.

    Examples
    --------

    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the link-clustering analysis.

    >>> mcl(0.5,matrix,taxa)
    {1: ['German', 'English', 'Dutch'], 2: ['Swedish', 'Icelandic']}

    """
    # check for type of matrix
    if type(matrix) != np.ndarray:
        imatrix = np.array(matrix)
    else:
        imatrix = matrix.copy()

    # check for matrix type and decide how to handle logs
    if matrix_type == 'distances':
        evaluate = lambda x: True if x < threshold else False
        if logs == True:
            logs = lambda x: -np.log2((1 - x)**2)
        elif logs == False:
            logs = lambda x: x
    elif matrix_type == 'similarities':
        evaluate = lambda x: True if x > threshold else False
        if logs == True:
            logs = lambda x: -np.log(x**2)
        else:
            logs = lambda x: x
    else:
        raise ValueError(matrix_type)

    # check for threshold
    if threshold:
        for i, j in util.combinations2(range(len(imatrix))):
            score = imatrix[i][j]
            evaluation = logs(score) if evaluate(score) else 0
            imatrix[i][j] = evaluation
            imatrix[j][i] = evaluation

    # check for self_loops
    if add_self_loops == True:
        for i in range(len(imatrix)):
            imatrix[i][i] = 1
    elif add_self_loops == False:
        pass
    else:
        for i in range(len(imatrix)):
            imatrix[i][i] = add_self_loops(imatrix[:, i])

    # normalize the matrix
    imatrix = _normalize_matrix(imatrix)

    # start looping and the like
    steps = 0
    while True:
        # expansion
        imatrix = np.linalg.matrix_power(imatrix, expansion)

        # inflation
        imatrix = imatrix**inflation

        # normalization
        imatrix = _normalize_matrix(imatrix)

        # increase steps
        steps += 1

        # check for matrix convergence
        if steps >= max_steps or _is_idempotent(imatrix):
            log.debug("Number of steps {0}.".format(steps))
            break

    # retrieve the clusters
    clusters = _interprete_matrix(imatrix)

    # modify clusters
    if revert:
        return dict(zip(range(len(taxa)), clusters))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[clusters[i]].append(t)

    return clr
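
The loop at the end of the function is the classical MCL iteration: expansion takes a matrix power, inflation raises every cell to a power, and the result is re-normalized until the matrix no longer changes. One iteration written out with plain numpy (column normalization is one common choice; lingpy's internal _normalize_matrix may differ in detail):

import numpy as np

m = np.array([[1.0, 0.8, 0.0],
              [0.8, 1.0, 0.1],
              [0.0, 0.1, 1.0]])

expanded = np.linalg.matrix_power(m, 2)        # expansion: matrix power
inflated = expanded ** 2                       # inflation: element-wise power
normalized = inflated / inflated.sum(axis=0)   # re-normalize each column
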
Example 15
def link_clustering(threshold,
                    matrix,
                    taxa,
                    link_threshold=False,
                    revert=False,
                    matrix_type="distances",
                    fuzzy=True):
    """
    Carry out a link clustering analysis using the method by :evobib:`Ahn2010`.

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the matrix
        will be used directly.

    matrix : list
        A two-dimensional list containing the distances.

    taxa : list
        A list containing the names of all taxa corresponding to the distances
        in the matrix.

    link_threshold : float (default=False)
        The threshold that shall be used for the internal clustering of the
        data.

    matrix_type : {"distances","similarities","weights"} (default="distances")
        Specify the type of the matrix. If the matrix contains distance data,
        it will be adapted to similarity data. If it contains "similarities",
        no adaptation is needed. If it contains "weights", a weighted version
        of link clustering (see the supplementary in :evobib:`Ahn2010` for
        details) will be carried out.

    Returns
    -------
    cluster : dict
        A dictionary with cluster-IDs as keys and a list as value, containing
        the taxa that are assigned to a given cluster-ID.

    Examples
    --------

    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the link-clustering analysis.

    >>> link_clustering(0.5,matrix,taxa)
    {1: ['Dutch', 'English', 'German'], 2: ['Icelandic', 'Swedish']}

    See also
    --------
    fuzzy

    """
    # check for matrix type
    if matrix_type == 'distances':
        evaluate = lambda x: x < threshold
    elif matrix_type == 'similarities':
        evaluate = lambda x: x > threshold
    elif matrix_type == 'weights':
        evaluate = lambda x: False
    else:
        raise ValueError(matrix_type)

    # get the edges and the adjacency from the thresholds
    edges = set()
    adjacency = dict([(t, set()) for t in taxa])
    weights = {}

    for i, j in util.combinations2(range(len(taxa))):
        taxA, taxB = taxa[i], taxa[j]
        if evaluate(matrix[i][j]):
            edges.add((taxA, taxB))
            adjacency[taxA].add(taxB)
            adjacency[taxB].add(taxA)
        elif matrix_type == 'weights':
            if matrix[i][j] < threshold:
                edges.add((taxA, taxB))
                adjacency[taxA].add(taxB)
                adjacency[taxB].add(taxA)
                edges.add((taxB, taxA))
                weights[taxA, taxB] = -np.log2((1 - matrix[i][j])**2)
                weights[taxB, taxA] = -np.log2((1 - matrix[i][j])**2)
    weights = weights or None

    if edges:
        # initialize the HLC object
        hlc = lc.HLC(adjacency, edges)
    else:
        # check for null edges: if they occur, return the clusters directly
        if revert:
            if fuzzy:
                return {a: [b] for a, b in zip(taxa, range(len(taxa)))}
            else:
                return {a: b for a, b in zip(taxa, range(len(taxa)))}
        else:
            if fuzzy:
                return {a: [b] for a, b in zip(range(len(taxa)), taxa)}
            else:
                return {a: b for a, b in zip(range(len(taxa)), taxa)}

    # carry out the analyses using defaults for the clustering
    edge2cid = hlc.single_linkage(threshold=link_threshold, w=weights)[0]

    # retrieve all clusterings for the nodes
    # retrieve the data
    clr2nodes = defaultdict(list)
    clr2edges = defaultdict(list)

    # collect the edges and nodes belonging to each cluster
    for edge, idx in edge2cid.items():
        nodeA, nodeB = edge[0], edge[1]
        clr2edges[idx].append(edge)
        clr2nodes[idx].extend([nodeA, nodeB])

    for idx in clr2nodes:
        clr2nodes[idx] = sorted(set(clr2nodes[idx]))

    # delete all clusters that appear as subsets of larger clusters
    delis = set()
    for keyA, keyB in util.product2(sorted(clr2nodes)):
        if keyA != keyB:
            valsA = set(clr2nodes[keyA])
            valsB = set(clr2nodes[keyB])

            if valsA != valsB:
                if valsA.issubset(valsB):
                    delis.add(keyA)
                elif valsB.issubset(valsA):
                    delis.add(keyB)
            elif valsA == valsB:
                delis.add(keyB)
    for k in delis:
        del clr2nodes[k]

    # renumber the data
    mapper = dict(zip(clr2nodes.keys(), range(1, len(clr2nodes) + 1)))

    out = {}
    found = []
    for idx in clr2nodes:
        out[mapper[idx]] = clr2nodes[idx]
        found += clr2nodes[idx]
    missing = [f for f in taxa if f not in found]
    idx = max(out.keys()) + 1
    for m in missing:
        out[idx] = [m]
        idx += 1

    # determine weights for communities to edges
    node_weights = dict([(t, defaultdict(int)) for t in taxa])
    for c, e in clr2edges.items():
        for nA, nB in e:
            if c in mapper:
                this_c = mapper[c]
                node_weights[nA][this_c] += 1
                node_weights[nB][this_c] += 1

    # revert stuff first
    cluster = dict([(t, []) for t in taxa])
    for idx in out:
        for t in out[idx]:
            cluster[t] += [idx]

    # weight membership of nodes and assign to most prominent community
    if not fuzzy:
        new_cluster = {}
        for t, clr in cluster.items():
            weighted = sorted(clr,
                              key=lambda x: node_weights[t][x]
                              if x in node_weights[t] else 0,
                              reverse=True)
            new_cluster[t] = weighted[0]
        if revert:
            return {taxa.index(t): c for t, c in new_cluster.items()}

        out = {c: [] for c in set(new_cluster.values())}
        for t, c in new_cluster.items():
            out[c].append(t)
        return out

    if not revert:
        return out

    cluster = {t: [] for t in taxa}
    for idx in out:
        for t in out[idx]:
            cluster[t].append(idx)

    return cluster
Example 16
def fuzzy(threshold, matrix, taxa, method='upgma', revert=False):
    """
    Create fuzzy cluster of a given distance matrix.

    Parameters
    ----------
    threshold : float
        The threshold that shall be used for the basic clustering of the data.

    matrix : list
        A two-dimensional list containing the distances.

    taxa : list
        A list containing the names of all taxa corresponding to the distances
        in the matrix.

    method : { "upgma", "single", "complete" } (default="upgma")
        Select the method for the flat cluster analysis.

    revert : bool (default=False)
        Specify whether a reverted dictionary should be returned.

    Returns
    -------
    cluster : dict
        A dictionary with cluster-IDs as keys and a list as value, containing
        the taxa that are assigned to a given cluster-ID.

    Examples
    --------
    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the fuzzy flat cluster analysis.

    >>> fuzzy(0.5,matrix,taxa)
    {1: ['Swedish', 'Icelandic'], 2: ['Dutch', 'German'], 3: ['Dutch', 'English']}

    Notes
    -----
    This is a very simple fuzzy clustering algorithm. It basically does nothing
    else than removing taxa successively from the matrix, flat-clustering the
    remaining taxa with the corresponding threshold, and then returning a
    combined "consensus" cluster in which taxa may be assigned to multiple
    clusters.

    See also
    --------
    link_clustering

    """
    g = nx.Graph()

    for taxon in taxa:
        g.add_node(taxon)

    for idx, taxon in enumerate(taxa):
        new_matrix = []
        for i, line in enumerate(matrix):
            for j, cell in enumerate(line):
                if i < j and i != idx and j != idx:
                    new_matrix += [cell]
        new_matrix = misc.squareform(new_matrix)

        clusters = cluster.flat_cluster(method, threshold, new_matrix,
                                        [t for t in taxa if t != taxon])

        for clr in clusters:
            for tA, tB in util.combinations2(clusters[clr]):
                if not g.has_edge(tA, tB):
                    g.add_edge(tA, tB, weight=1)
                else:
                    g[tA][tB]['weight'] += 1  # Graph.edge was removed in networkx 2.x
    out = {i + 1: c for i, c in enumerate(nx.find_cliques(g))}

    if revert:
        new_out = defaultdict(list)
        for key, val in out.items():
            for v in val:
                new_out[v].append(key)
        return new_out

    return out
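
As the Notes explain, the fuzzy clustering is a leave-one-out consensus: each taxon is removed in turn, the remaining taxa are flat-clustered, agreements are collected as edge weights in a graph, and the maximal cliques of that graph become the (possibly overlapping) output clusters. Reusing the data from the doctest, a short usage sketch:

from lingpy import *
from lingpy.algorithm import squareform

taxa = ['German', 'Swedish', 'Icelandic', 'English', 'Dutch']
matrix = squareform([0.5, 0.67, 0.8, 0.2, 0.4, 0.7, 0.6, 0.8, 0.8, 0.3])

# revert=True maps each taxon to the list of clusters it belongs to
print(fuzzy(0.5, matrix, taxa, revert=True))
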