Esempio n. 1
0
def test_nwk2tree_matrix():
    """Taxa returned by ``nwk2tree_matrix`` must equal those of the parsed tree."""
    nwk_string = '(((a,b),(c,d)),e);'
    # the function returns a (matrix, taxa) pair; only the taxa are checked
    _, leaf_names = tree.nwk2tree_matrix(nwk_string)
    expected = Tree(nwk_string).taxa
    assert leaf_names == expected
Esempio n. 2
0
    def test_nwk2tree_matrix(self):
        """Check that ``nwk2tree_matrix`` reports the same taxa as ``self.tree``."""
        # only the taxa (second item of the returned pair) are compared
        _, found = tree.nwk2tree_matrix(self.newick)
        wanted = self.tree.taxa
        assert found == wanted
Esempio n. 3
0
def _shared_cognates(wordlist, taxonA, taxonB, ref, normalized):
    """
    Compute the amount of shared cognates between two taxa.

    Parameters
    ----------
    wordlist : object
        Object offering ``get_list`` and ``get_dict`` as used below.
    taxonA, taxonB : str
        The two taxa that are compared.
    ref : str
        The name of the column that contains the cognate identifiers.
    normalized : {False, "jaccard", "swadesh"}
        ``False`` yields the raw number of shared cognate IDs, "jaccard"
        divides that number by the size of the union of both ID sets, and
        "swadesh" yields the proportion of meaning slots present for both
        taxa in which at least one cognate ID is shared.

    Returns
    -------
    shared : int or float
        The raw count (not normalized) or the normalized score. If the
        denominator of a normalized score is zero, a warning is logged and
        0.0 is returned.
    """
    if normalized in (False, "jaccard"):
        cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=ref))
        cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=ref))
        shared = len(cogsA & cogsB)
        if not normalized:
            return shared

        total = len(cogsA | cogsB)
        if not total:
            # both taxa have no cognate IDs at all: avoid the division by
            # zero and behave like the "swadesh" branch below
            log.warning(
                str([shared, total, len(cogsA), len(cogsB), taxonA, taxonB]))
            return 0.0
        return shared / total

    cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
    cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)

    shared = 0
    slots = 0

    # iterate over cognate sets in meaning slots; we follow the STARLING
    # procedure in ignoring missing data (slots absent from one taxon)
    for key in cogsA:
        if key in cogsB:
            if any(k in cogsB[key] for k in cogsA[key]):
                shared += 1
            slots += 1

    if not slots:
        log.warning(
            str([shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
        return 0.0
    return shared / slots


def plot_heatmap(wordlist,
                 filename="heatmap",
                 fileformat="pdf",
                 ref='cogid',
                 normalized=False,
                 refB='',
                 **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool str} (default=False)
        If set to ``False``, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        If given, the name of a second column with cognate identifiers. The
        cells with ``i < j`` are then computed from ``ref`` and the cells
        with ``i > j`` from ``refB``; otherwise the matrix is symmetric.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    show_tree : bool (default=True)
        Specify, whether the reference tree is drawn left of the heatmap.
    matrix : list or array (default=False)
        A precomputed matrix that is used instead of computing the shared
        cognates from the wordlist.
    labels : dict (default={})
        Replacement labels for taxon names written to the axes.
    distances : bool (default=False)
        If True, every cell ``x`` of the matrix is replaced by ``1 - x``.

    Raises
    ------
    ValueError
        If no tree is passed and none can be retrieved from the wordlist, or
        if ``normalized`` is neither false nor "jaccard" or "swadesh".

    Notes
    -----
    This function plots shared cognate percentages. Apart from the figure
    (``filename + '.' + fileformat``), the matrix is written as plain text to
    ``filename + '.matrix'``.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist that is used to order the
    # taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            # any failure to retrieve the tree is reported uniformly, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ("jaccard", "swadesh"):
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False."
        )

    # create an empty matrix: raw counts are integers, scores are floats
    matrix = np.zeros((wordlist.width, wordlist.width),
                      dtype=float if normalized else int)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'], keywords['bottom'], 0.25 * keywords['width'],
            keywords['height']
        ])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # use the leaf order of the dendrogram (reversed) for the heatmap
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        # a Newick string has no ``taxa`` attribute, in this case the taxa
        # are taken from nwk2tree_matrix
        if hasattr(tree, 'taxa'):
            taxa = tree.taxa
        else:
            taxa = nwk2tree_matrix(tree)[1]

    # decide whether a precomputed matrix was passed; numpy arrays cannot be
    # tested for truth directly, so they are checked by their size
    user_matrix = keywords['matrix']
    if isinstance(user_matrix, np.ndarray):
        has_user_matrix = user_matrix.size > 0
    else:
        has_user_matrix = bool(user_matrix)

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if has_user_matrix:
        matrix = user_matrix
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared_cognates(
                        wordlist, taxonA, taxonB, ref, normalized)
                    matrix[i][j] = shared

                    # without refB the matrix is symmetric, otherwise the
                    # other triangle is filled from refB below
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    matrix[i][j] = _shared_cognates(
                        wordlist, taxonA, taxonB, refB, normalized)

                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxonA, flat=True, entry=ref)
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes([
        left,
        keywords['bottom'],
        keywords['width'],
        keywords['height']
    ])

    # convert similarities to distances cell by cell
    if keywords['distances']:
        for i, row in enumerate(matrix):
            for j in range(len(row)):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [[keywords['vmax'], keywords['vmin']],
               [keywords['vmin'], keywords['vmax']]]

    # NOTE(review): the axes show the 2x2 vmin/vmax matrix, not ``matrix``
    # itself -- confirm that this is intended
    im = ax2.matshow(nmatrix,
                     aspect='auto',
                     origin='lower',
                     interpolation='nearest',
                     cmap=keywords['cmap'],
                     vmax=keywords['vmax'],
                     vmin=keywords['vmin'])

    # set the xticks; the step width must be at least 1, since range()
    # refuses a step of 0 (which occurred for small numbers of taxa)
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(idxs,
               selected_taxa,
               size=keywords['textsize'],
               rotation=keywords['xrotation'],
               rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        plt.imshow(matrix,
                   cmap=keywords['cmap'],
                   visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # write the matrix as text; the context manager closes the file even if
    # formatting a cell fails
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for c in matrix[i]:
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
Esempio n. 4
0
def _shared_cognates(wordlist, taxonA, taxonB, ref, normalized):
    """
    Compute the amount of shared cognates between two taxa.

    Parameters
    ----------
    wordlist : object
        Object offering ``get_list`` and ``get_dict`` as used below.
    taxonA, taxonB : str
        The two taxa that are compared.
    ref : str
        The name of the column that contains the cognate identifiers.
    normalized : {False, "jaccard", "swadesh"}
        ``False`` yields the raw number of shared cognate IDs, "jaccard"
        divides that number by the size of the union of both ID sets, and
        "swadesh" yields the proportion of meaning slots present for both
        taxa in which at least one cognate ID is shared.

    Returns
    -------
    shared : int or float
        The raw count (not normalized) or the normalized score. If the
        denominator of a normalized score is zero, a warning is logged and
        0.0 is returned.
    """
    if normalized in (False, "jaccard"):
        cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=ref))
        cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=ref))
        shared = len(cogsA & cogsB)
        if not normalized:
            return shared

        total = len(cogsA | cogsB)
        if not total:
            # both taxa have no cognate IDs at all: avoid the division by
            # zero and behave like the "swadesh" branch below
            log.warning(
                str([shared, total, len(cogsA), len(cogsB), taxonA, taxonB]))
            return 0.0
        return shared / total

    cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
    cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)

    shared = 0
    slots = 0

    # iterate over cognate sets in meaning slots; we follow the STARLING
    # procedure in ignoring missing data (slots absent from one taxon)
    for key in cogsA:
        if key in cogsB:
            if any(k in cogsB[key] for k in cogsA[key]):
                shared += 1
            slots += 1

    if not slots:
        log.warning(
            str([shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
        return 0.0
    return shared / slots


def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool str} (default=False)
        If set to ``False``, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        If given, the name of a second column with cognate identifiers. The
        cells with ``i < j`` are then computed from ``ref`` and the cells
        with ``i > j`` from ``refB``; otherwise the matrix is symmetric.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    show_tree : bool (default=True)
        Specify, whether the reference tree is drawn left of the heatmap.
    matrix : list or array (default=False)
        A precomputed matrix that is used instead of computing the shared
        cognates from the wordlist.
    labels : dict (default={})
        Replacement labels for taxon names written to the axes.
    distances : bool (default=False)
        If True, every cell ``x`` of the matrix is replaced by ``1 - x``.

    Raises
    ------
    ValueError
        If no tree is passed and none can be retrieved from the wordlist, or
        if ``normalized`` is neither false nor "jaccard" or "swadesh".

    Notes
    -----
    This function plots shared cognate percentages. Apart from the figure
    (``filename + '.' + fileformat``), the matrix is written as plain text to
    ``filename + '.matrix'``.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist that is used to order the
    # taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            # any failure to retrieve the tree is reported uniformly, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ("jaccard", "swadesh"):
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix: raw counts are integers, scores are floats
    matrix = np.zeros(
        (wordlist.width, wordlist.width),
        dtype=float if normalized else int
    )

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # use the leaf order of the dendrogram (reversed) for the heatmap
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        # a Newick string has no ``taxa`` attribute, in this case the taxa
        # are taken from nwk2tree_matrix
        if hasattr(tree, 'taxa'):
            taxa = tree.taxa
        else:
            taxa = nwk2tree_matrix(tree)[1]

    # decide whether a precomputed matrix was passed; numpy arrays cannot be
    # tested for truth directly, so they are checked by their size
    user_matrix = keywords['matrix']
    if isinstance(user_matrix, np.ndarray):
        has_user_matrix = user_matrix.size > 0
    else:
        has_user_matrix = bool(user_matrix)

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if has_user_matrix:
        matrix = user_matrix
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared_cognates(
                        wordlist, taxonA, taxonB, ref, normalized)
                    matrix[i][j] = shared

                    # without refB the matrix is symmetric, otherwise the
                    # other triangle is filled from refB below
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    matrix[i][j] = _shared_cognates(
                        wordlist, taxonA, taxonB, refB, normalized)

                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes(
        [
            left,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )

    # convert similarities to distances cell by cell
    if keywords['distances']:
        for i, row in enumerate(matrix):
            for j in range(len(row)):
                matrix[i][j] = 1 - matrix[i][j]
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']]
    ]

    # NOTE(review): the axes show the 2x2 vmin/vmax matrix, not ``matrix``
    # itself -- confirm that this is intended
    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks; the step width must be at least 1, since range()
    # refuses a step of 0 (which occurred for small numbers of taxa)
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        plt.imshow(matrix, cmap=keywords['cmap'], visible=False, vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    # write the matrix as text; the context manager closes the file even if
    # formatting a cell fails
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for c in matrix[i]:
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
Esempio n. 5
0
 def test_nwk2tree_matrix(self):
     """Verify that the taxa from ``nwk2tree_matrix`` match ``self.tree.taxa``."""
     pair = tree.nwk2tree_matrix(self.newick)
     # the result must unpack into exactly (matrix, taxa)
     _unused_matrix, leaf_taxa = pair
     assert leaf_taxa == self.tree.taxa