Ejemplo n.º 1
0
 def test_convenience():
     """Smoke-test: call every logging convenience helper once."""
     # The level-based helpers all take a single message argument.
     for emit in (info, warning, debug, error):
         emit('m')
     # The remaining helpers have their own argument lists.
     deprecated('o', 'n')
     missing_module('m')
     file_written('f')
Ejemplo n.º 2
0
 def test_convenience():
     """Exercise each logging shortcut with a dummy argument."""
     single_message_helpers = (info, warning, debug, error)
     for helper in single_message_helpers:
         helper('m')
     # helpers that do not take a plain message
     deprecated('o', 'n')
     missing_module('m')
     file_written('f')
Ejemplo n.º 3
0
def colexification_network(
        wordlist,
        entry='ipa',
        concept='concept',
        output='',
        filename='network',
        bipartite=False,
        **keywords):
    """
    Build a colexification network from the data of a wordlist.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        The wordlist object containing the data.
    entry : str (default="ipa")
        The reference point for the language entry.
    concept : str (default="concept")
        The reference point for the name of the row containing the concepts.
    output : str (default='')
        If empty, the graph is returned. If set to "gml", the graph is written
        to a text file in GML format instead.
    filename : str (default='network')
        Base name of the output file; ".gml" is appended when writing.
    bipartite : bool (default=False)
        Handed on unchanged to the graph construction helper.
    **keywords
        Accepted for compatibility; not used inside this function.

    Returns
    -------
    G : networkx.Graph
        The graph, if no output format was requested. When `output` is set,
        nothing is returned (the result is ``None``).

    """
    def _lists_to_strings(attributes):
        # Replace every list-valued attribute, in place, by the result of
        # ``join('//', ...)`` over its items.
        for key, value in attributes.items():
            if isinstance(value, list):
                attributes[key] = join('//', *value)

    colexifications = _get_colexifications(wordlist, entry, concept)
    stats = _get_statistics(wordlist, entry, concept)
    graph = _make_graph(colexifications, bipartite=bipartite)

    # enrich the concept nodes with the per-concept statistics
    for name, attributes in graph.nodes(data=True):
        if attributes['ntype'] != 'concept':
            continue
        attributes.update(stats[name])

    if not output:
        return graph

    if output == 'gml':
        target = filename + '.gml'
        for _, attributes in graph.nodes(data=True):
            _lists_to_strings(attributes)
        for _, _, attributes in graph.edges(data=True):
            _lists_to_strings(attributes)
        nx.write_gml(graph, target)
        log.file_written(target)
Ejemplo n.º 4
0
def colexification_network(wordlist,
                           entry='ipa',
                           concept='concept',
                           output='',
                           filename='network',
                           bipartite=False,
                           **keywords):
    """
    Compute the colexification network of a wordlist.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        The wordlist object containing the data.
    entry : str (default="ipa")
        The reference point for the language entry.
    concept : str (default="concept")
        The reference point for the name of the row containing the concepts.
    output : str (default='')
        Leave empty to get the graph back. With "gml" the graph is written to
        a text file in GML format.
    filename : str (default='network')
        Output file name without extension; ".gml" is added on writing.
    bipartite : bool (default=False)
        Forwarded as-is to the helper that assembles the graph.
    **keywords
        Accepted but not consulted by this function.

    Returns
    -------
    G : networkx.Graph
        Only returned when `output` is empty; otherwise the function returns
        ``None``.

    """
    colexifications = _get_colexifications(wordlist, entry, concept)
    stats = _get_statistics(wordlist, entry, concept)
    network = _make_graph(colexifications, bipartite=bipartite)

    # attach the statistics as meta-data to all concept nodes
    for node, meta in network.nodes(data=True):
        if meta['ntype'] == 'concept':
            meta.update(stats[node])

    if not output:
        return network

    def flatten(meta):
        # list values are replaced in place by ``join('//', *items)``
        for key in meta:
            value = meta[key]
            if isinstance(value, list):
                meta[key] = join('//', *value)

    if output == 'gml':
        gml_path = filename + '.gml'
        for _node, meta in network.nodes(data=True):
            flatten(meta)
        for _source, _target, meta in network.edges(data=True):
            flatten(meta)
        nx.write_gml(network, gml_path)
        log.file_written(gml_path)
Ejemplo n.º 5
0
    def test_convenience(self):
        """Call each convenience function of ``lingpy.log`` once."""
        from lingpy.log import (
            info, warn, debug, error, deprecated, missing_module, file_written)

        # helpers that take a single message
        for emit in (info, warn, debug, error):
            emit('m')
        # helpers with their own argument lists
        deprecated('o', 'n')
        missing_module('m')
        file_written('f')
Ejemplo n.º 6
0
    def test_convenience(self):
        """Smoke-test the shortcut functions exported by ``lingpy.log``."""
        from lingpy.log import (
            info,
            warn,
            debug,
            error,
            deprecated,
            missing_module,
            file_written,
        )

        message_helpers = (info, warn, debug, error)
        for helper in message_helpers:
            helper('m')
        deprecated('o', 'n')
        missing_module('m')
        file_written('f')
Ejemplo n.º 7
0
def write_text_file(path, content, normalize=None, log=True):
    """Write text to a file, encoded in utf-8.

    :param path: File-system path of the file.
    :param content: The text content to be written. Anything that is not \
    already text is first converted with `lines_to_text`.
    :param normalize: If set, the unicode normalization mode applied to the \
    text before it is written.
    :param log: If true, the written path is reported via `file_written`.
    """
    text = content if isinstance(content, text_type) else lines_to_text(content)
    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as handle:
        # normalization happens after the file has been opened, as before
        if normalize:
            text = unicodedata.normalize(normalize, text)
        handle.write(text)
    if log:
        file_written(_str_path(path))
Ejemplo n.º 8
0
    def diff(self, **keywords):
        """
        Write all differences between two sets to a file.

        Parameters
        ----------

        filename : str (default=self.gold.infile)
            Name of the output file. The extension ".diff" is appended unless
            the name already ends with it.

        """
        setdefaults(keywords, filename=self.gold.infile)
        if not keywords['filename'].endswith('.diff'):
            keywords['filename'] += '.diff'

        template = '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'
        out = []
        pairs = zip(self.gold.alignments, self.test.alignments)
        for idx, (gold_alm, test_alm) in enumerate(pairs):
            # each alignment is a triple; only the two sequences are compared
            gold_a, gold_b, _ = gold_alm
            test_a, test_b, _ = test_alm
            width = max(len(gold_a), len(test_a))
            if not (gold_a != test_a or gold_b != test_b):
                continue

            name_a, name_b = self.gold.taxa[idx]
            padding = max(len(name_a), len(name_b)) * ' '
            seq_id = self.gold.seq_ids[idx]

            gold_row_a = '\t'.join(gold_a)
            gold_row_b = '\t'.join(gold_b)
            # separator line between the gold and the test alignment
            ruler = '{0}\t{1}'.format(padding, '\t'.join(['=='] * width))
            test_row_a = '\t'.join(test_a)
            test_row_b = '\t'.join(test_b)

            out.append(template.format(
                seq_id,
                name_a,
                gold_row_a,
                name_b,
                gold_row_b,
                ruler,
                test_row_a,
                test_row_b,
            ))
        # NOTE(review): the file is reported as written before it actually is;
        # confirm whether write_text_file logs as well.
        log.file_written(keywords['filename'])
        write_text_file(keywords['filename'], out)
Ejemplo n.º 9
0
def write_text_file(path, content, normalize=None, log=True):
    """Write text to a utf-8 encoded file.

    Parameters
    ----------
    path : str
        File-system path of the file.
    content : str
        The text content to be written. Content that is not already text is
        converted with `lines_to_text` first.
    normalize : { None, "NFC", "NFD" } (default=None)
        If set, the unicode normalization mode applied to the text before
        writing.
    log : bool (default=True)
        Indicate whether the written file should be reported via
        `file_written`.

    """
    if isinstance(content, text_type):
        text = content
    else:
        text = lines_to_text(content)

    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as stream:
        if normalize:
            stream.write(unicodedata.normalize(normalize, text))
        else:
            stream.write(text)

    if log:
        file_written(_str_path(path))
Ejemplo n.º 10
0
    def diff(self, **keywords):
        """
        Write all differences between two sets to a file.

        Parameters
        ----------

        filename : str (default=self.gold.infile)
            Name of the output file; ".diff" is appended if it is not already
            the ending of the name.

        """
        setdefaults(keywords, filename=self.gold.infile)
        if not keywords['filename'].endswith('.diff'):
            keywords['filename'] = keywords['filename'] + '.diff'

        out = []
        for i, (gold, test) in enumerate(
                zip(self.gold.alignments, self.test.alignments)):
            # alignments are triples; the third member is not used here
            gold_first, gold_second, _ = gold
            test_first, test_second, _ = test
            columns = max(len(gold_first), len(test_first))
            if not (gold_first != test_first or gold_second != test_second):
                continue

            taxon_a, taxon_b = self.gold.taxa[i]
            indent = max(len(taxon_a), len(taxon_b)) * ' '
            seq_id = self.gold.seq_ids[i]

            fields = [seq_id, taxon_a, '\t'.join(gold_first)]
            fields.append(taxon_b)
            fields.append('\t'.join(gold_second))
            # line of "==" markers separating gold and test alignment
            fields.append(
                '{0}\t{1}'.format(indent, '\t'.join(['=='] * columns)))
            fields.append('\t'.join(test_first))
            fields.append('\t'.join(test_second))
            out.append(
                '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.format(
                    *fields))
        # NOTE(review): logging precedes the actual write; confirm whether
        # write_text_file reports the file too.
        log.file_written(keywords['filename'])
        write_text_file(keywords['filename'], out)
Ejemplo n.º 11
0
def write_text_file(path, content, normalize=None, log=True):
    """Store text in a file using utf-8 encoding.

    Parameters
    ----------
    path : str
        File-system path of the file.
    content : str
        The text content to be written. If it is not text already, it is
        turned into text with `lines_to_text`.
    normalize : { None, "NFC", "NFD" } (default=None)
        Unicode normalization mode to apply before writing; skipped when the
        value is empty.
    log : bool (default=True)
        Whether to report the written file through `file_written`.

    """
    if not isinstance(content, text_type):
        content = lines_to_text(content)

    destination = _str_path(path, mkdir=True)
    with io.open(destination, 'w', encoding='utf8') as out:
        payload = content
        if normalize:
            payload = unicodedata.normalize(normalize, content)
        out.write(payload)

    if not log:
        return
    file_written(_str_path(path))
Ejemplo n.º 12
0
def plot_gls(gls, treestring, degree=90, fileformat='pdf', **keywords):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls
        The gain-loss scenario; handed on unchanged to ``gls2gml``.
    treestring : str
        The reference tree. It is used for the radial layout and, if it can
        be loaded with ``cg.LoadTree``, as tree object for ``gls2gml``
        (otherwise the value itself is passed on).
    degree : int (default=90)
        Handed on to ``radial_layout``.
    fileformat : str (default='pdf')
        File extension appended to the filename when saving the figure.
    **keywords
        Plot settings. Missing entries are filled with these defaults:
        figsize=(15, 15), left=0.05, top=0.95, bottom=0.05, right=0.95,
        radius=0.5, textsize=8, edgewidth=5, linewidth=2, scale_radius=1.2,
        ylim=1, xlim=1, text=True, gain_color='white', loss_color='black',
        gain_linestyle='dotted', loss_linestyle='solid', ax_linewidth=0,
        filename=rcParams['filename'].

    Notes
    -----
    The figure is saved to ``filename + '.' + fileformat`` and the file is
    reported via ``log.file_written``.
    """

    # get keywords, filling in the defaults for everything not passed
    defaults = dict(figsize=(15, 15),
                    left=0.05,
                    top=0.95,
                    bottom=0.05,
                    right=0.95,
                    radius=0.5,
                    textsize=8,
                    edgewidth=5,
                    linewidth=2,
                    scale_radius=1.2,
                    ylim=1,
                    xlim=1,
                    text=True,
                    gain_color='white',
                    loss_color='black',
                    gain_linestyle='dotted',
                    loss_linestyle='solid',
                    ax_linewidth=0,
                    filename=rcParams['filename'])
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # Best-effort loading of the tree: try the keyword form first, then the
    # positional form, and finally fall back to the raw value. Only regular
    # exceptions are caught so that KeyboardInterrupt/SystemExit propagate.
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)

    graph = gls2gml(gls, tgraph, tree)

    # Collect the node coordinates and states in one pass. The position map
    # replaces lookups through ``graph.node``, an attribute that is not
    # available in newer networkx releases.
    positions = {}
    nodes = []
    for n, d in graph.nodes(data=True):
        g = d['graphics']
        x = g['x']
        y = g['y']
        s = d['state']

        positions[n] = (x, y)
        nodes.append((x, y, s))

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # edges are drawn first (zorder=1)
    for nA, nB in graph.edges():
        xA, yA = positions[nA]
        xB, yB = positions[nB]

        plt.plot([xA, xB], [yA, yB],
                 '-',
                 color='black',
                 linewidth=keywords['edgewidth'],
                 zorder=1)

    # now, iterate over nodes
    for x, y, s in nodes:
        # 'O' and 'L' are drawn at full radius with their own linestyle; 'o'
        # and every other state are drawn scaled down without a linestyle.
        if s == 'O':
            radius = keywords['radius']
            style = dict(facecolor=keywords['gain_color'],
                         linestyle=keywords['gain_linestyle'])
        elif s == 'o':
            radius = keywords['radius'] / keywords['scale_radius']
            style = dict(facecolor=keywords['gain_color'])
        elif s == 'L':
            radius = keywords['radius']
            style = dict(facecolor=keywords['loss_color'],
                         linestyle=keywords['loss_linestyle'])
        else:
            radius = keywords['radius'] / keywords['scale_radius']
            style = dict(facecolor=keywords['loss_color'])

        w = mpl.patches.Wedge((x, y),
                              radius,
                              0,
                              360,
                              linewidth=keywords['linewidth'],
                              **style)
        figsp.add_artist(w)

        # if text is chosen as argument
        if keywords['text']:
            # Exact comparison instead of a substring test, so that the label
            # always matches the colour chosen for the wedge above.
            if s in ('O', 'o'):
                t = '1'
                c = 'black'
            else:
                t = '0'
                c = 'white'

            plt.text(x,
                     y,
                     t,
                     size=keywords['textsize'],
                     color=c,
                     va="center",
                     ha="center",
                     fontweight='bold')

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 13
0
def plot_heatmap(wordlist,
                 filename="heatmap",
                 fileformat="pdf",
                 ref='cogid',
                 normalized=False,
                 refB='',
                 **keywords):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written. The matrix
        itself is additionally written to ``filename + '.matrix'``.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to c{False}, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Name of a second column with cognate identifiers. If given, the cells
        below the diagonal are computed from this column, while the cells
        above the diagonal are computed from `ref`.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    matrix : (default=False)
        A precomputed matrix. If given, it is used instead of the values
        computed from the wordlist.
    distances : bool (default=False)
        If true, every cell value ``v`` is replaced by ``1 - v`` before
        plotting.
    labels : dict (default={})
        Replacement labels for taxon names on the axes.

    Raises
    ------
    ValueError
        If no tree is available, or if `normalized` is neither empty nor one
        of "jaccard" and "swadesh".

    Notes
    -----
    This function plots shared cognate percentages.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False)
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ["jaccard", "swadesh"]:
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False."
        )

    # create an empty matrix: counts are integers, normalized scores floats
    matrix = np.zeros((wordlist.width, wordlist.width),
                      dtype=float if normalized else int)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes([
            keywords['left'], keywords['bottom'], 0.25 * keywords['width'],
            keywords['height']
        ])
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        # the taxa are taken in reversed leaf order of the dendrogram
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        # NOTE(review): this requires ``tree`` to expose ``.taxa``; a plain
        # Newick string passed via the ``tree`` keyword would fail here.
        taxa = tree.taxa

    # A user-supplied matrix replaces the computed one. Sized objects are
    # tested by their length, because the truth value of a multi-element
    # numpy array is ambiguous and raises an error.
    supplied = keywords['matrix']
    try:
        has_matrix = len(supplied) > 0
    except TypeError:
        has_matrix = bool(supplied)

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if has_matrix:
        matrix = supplied
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = _shared_cognates(
                        wordlist, taxonA, taxonB, ref, normalized)
                    matrix[i][j] = shared

                    # without refB the matrix is symmetric
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    matrix[i][j] = _shared_cognates(
                        wordlist, taxonA, taxonB, refB, normalized)

                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxonA, flat=True, entry=ref)
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes([
        left,  # keywords['left']+0.25 * keywords['width']+0.05,
        keywords['bottom'],
        keywords['width'],
        keywords['height']
    ])

    # [0.15,0.1,0.7,0.7])
    if keywords['distances']:
        # turn the values into their complement (modifies the matrix in place)
        for i, line in enumerate(matrix):
            for j, cell in enumerate(line):
                matrix[i][j] = 1 - cell

    # dummy matrix that only spans the value range from vmin to vmax
    nmatrix = [[keywords['vmax'], keywords['vmin']],
               [keywords['vmin'], keywords['vmax']]]

    # the visible heatmap shows the actual data
    im = ax2.matshow(matrix,
                     aspect='auto',
                     origin='lower',
                     interpolation='nearest',
                     cmap=keywords['cmap'],
                     vmax=keywords['vmax'],
                     vmin=keywords['vmin'])

    # set the xticks; the step width must be at least 1, otherwise range()
    # fails for small numbers of taxa
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(idxs,
               selected_taxa,
               size=keywords['textsize'],
               rotation=keywords['xrotation'],
               rotation_mode="default")
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        # the invisible dummy image only carries the value range
        plt.imshow(nmatrix,
                   cmap=keywords['cmap'],
                   visible=False,
                   vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"],
                    size=keywords['colorbar_textsize'])

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)

    # write the matrix as tab-separated text; the context manager makes sure
    # the file is closed even if formatting a cell fails
    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for c in matrix[i]:
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)


def _shared_cognates(wordlist, taxonA, taxonB, ref, normalized):
    """Return the shared-cognate count or score of two taxa for column `ref`.

    Without normalization the number of shared cognate identifiers is
    returned, with "jaccard" this number divided by the size of the union of
    identifiers, and with "swadesh" the proportion of meaning slots (present
    for both taxa) in which the taxa share at least one identifier.
    """
    if not normalized or normalized == "jaccard":
        cogsA = set(wordlist.get_list(taxa=taxonA, flat=True, entry=ref))
        cogsB = set(wordlist.get_list(taxa=taxonB, flat=True, entry=ref))

        shared = len(cogsA.intersection(cogsB))
        if not normalized:
            return shared

        total = len(cogsA.union(cogsB))
        if not total:
            # no data for either taxon: treated like the empty case of the
            # "swadesh" calculation below
            log.warning(
                str([shared, total, len(cogsA), len(cogsB), taxonA, taxonB]))
            return 0.0
        return shared / total

    cogsA = wordlist.get_dict(taxa=taxonA, entry=ref)
    cogsB = wordlist.get_dict(taxa=taxonB, entry=ref)

    shared = 0
    slots = 0

    # iterate over cognate sets in meaning slots; we follow the STARLING
    # procedure in ignoring missing data
    for key, idsA in cogsA.items():
        if key not in cogsB:
            continue
        idsB = cogsB[key]
        # check for shared items
        if any(k in idsB for k in idsA):
            shared += 1
        slots += 1

    if not slots:
        log.warning(
            str([shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
        return 0.0
    return shared / slots
Ejemplo n.º 14
0
def plot_tree(
    treestring,
    degree=90,
    fileformat='pdf',
    root="root",
    **keywords
):
    """
    Plot a Newick tree to PDF or other graphical formats.

    Parameters
    ----------
    treestring : str
        A string in Newick format.
    degree : int
        Determine the degree of the tree (this determines how "circular" the
        tree will be).
    fileformat : str (default="pdf")
        Select the fileformat to which the tree shall be written.
    root : str (default="root")
        Handed on to ``radial_layout``. Nodes whose label starts with this
        string are drawn as plain markers instead of text labels.
    filename : str
        Determine the name of the file to which the data shall be written.
        Defaults to ``rcParams['filename']``.
    figsize : tuple (default=(10,10))
        Determine the size of the figure.
    node_dict : dict (default={})
        Per-node settings that take precedence over the general keywords
        for the node they are keyed by.
    labels : (default=[])
        Mapping from node labels to the text that is displayed instead.
    edge_list : list (default=[])
        Additional ``(nodeA, nodeB, data)`` edges drawn along with the edges
        of the tree.
    no_labels : bool (default=False)
        If true, all nodes are drawn as markers without text.
    usetex : bool (default=False)
        If true, the "pgf" backend is used, otherwise "TkAgg".
    xlim, ylim : number (default=5)
        Margin added around the node coordinates, unless side-specific
        margins (``xliml``, ``xlimr``, ``ylimb``, ``ylimt``) are given.

    Notes
    -----
    The figure is saved to ``filename + '.' + fileformat`` and the file is
    reported via ``log.file_written``.
    """

    default = dict(
        ax_linewidth=0,
        bg='black',
        bottom=0.05,
        change=lambda x: x ** 1.75,
        edge_list=[],
        figsize=(10, 10),
        filename=rcParams['filename'],
        fontweight='bold',
        frameon=False,
        ha='center',
        labels=[],
        left=0.05,
        linecolor='black',
        linewidth=5,
        no_labels=False,
        node_dict={},
        nodecolor='black',
        nodesize=10,
        right=0.95,
        start=0,
        textcolor='white',
        textsize='10',
        top=0.95,
        usetex=False,
        va='center',
        xlim=5,
        xliml=False,
        xlimr=False,
        ylim=5,
        ylimb=False,
        ylimt=False,
        rotation_mode='anchor',
        latex_preamble=False,
    )
    for key, value in default.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
        # this setting does not exist in every matplotlib version, so it is
        # only touched where it is known
        if 'text.latex.unicode' in mpl.rcParams:
            mpl.rcParams['text.latex.unicode'] = True
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tree-graph
    graph = radial_layout(
        treestring,
        degree=degree,
        change=keywords['change'],
        start=keywords['start'],
        root=root
    )

    # Map every node to its coordinates once. This replaces lookups through
    # ``graph.node``, an attribute that is not available in newer networkx
    # releases.
    positions = {}
    for n, d in graph.nodes(data=True):
        positions[n] = (d['graphics']['x'], d['graphics']['y'])

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    # plt.axes(frameon=keywords['frameon'])
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])

    # get xlim and ylim
    xvals, yvals = [], []
    # start iterating over edges
    for nA, nB, d in list(graph.edges(data=True)) + keywords['edge_list']:

        # get the coordinates
        xA, yA = positions[nA]
        xB, yB = positions[nB]

        # edges that carry their own colour are drawn with their own
        # attributes, all others with the general line settings
        if 'color' in d:
            plt.plot(
                [xA, xB],
                [yA, yB],
                '-',
                **d
            )
        else:
            plt.plot(
                [xA, xB],
                [yA, yB],
                '-',
                color=keywords['linecolor'],
                linewidth=keywords['linewidth'],
            )

    # get the nodes
    for n, d in graph.nodes(data=True):

        g = d['graphics']
        x, y = g['x'], g['y']

        xvals.append(x)
        yvals.append(y)

        # try to get information from the node-dict; nodes without usable
        # entry simply start without specific settings
        settings = {}
        try:
            settings.update(keywords['node_dict'][n])
        except Exception:
            settings = {}

        # fill up with the general keywords where nothing specific is given
        for k in keywords:
            if k not in settings:
                settings[k] = keywords[k]

        if d['label'].startswith('edge') \
                or d['label'].startswith(root) or keywords['no_labels']:
            plt.plot(
                x,
                y,
                'o',
                markersize=settings['nodesize'],
                color=settings['nodecolor'],
                markeredgewidth=settings['linewidth']
            )
        else:
            # use the replacement label if one can be looked up, otherwise
            # the label of the node itself
            try:
                label = keywords['labels'][d['label']]
            except Exception:
                label = d['label']
            if 'rotation' in settings:
                r = settings['rotation']
            else:
                r = g['angle']
            plt.text(
                x,
                y,
                label,
                # d['label'],
                color=settings['textcolor'],
                fontweight=settings['fontweight'],
                va=settings['va'],
                ha=g['s'],
                bbox=dict(
                    facecolor=settings['bg'],
                    boxstyle='square,pad=0.2',
                    ec="none",
                ),
                size=settings['textsize'],
                rotation=r,  # g['angle'],
                rotation_mode=settings['rotation_mode']
            )

    # set up the xlimits: the general margin applies unless at least one
    # side-specific margin is given
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits in the same way
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 15
0
def diff(
        lex,
        gold='cogid',
        test='lexstatid',
        loans=False,
        pprint=True,
        filename='',
        tofile=True,
        fuzzy=False):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    lex : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    loans : bool (default=False)
        If set to ``False``, loans (indicated by negative IDs in the gold
        standard) will be treated as separate cognates, otherwise the
        absolute value of every cognate ID is used, so that loans will be
        treated as cognates.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file, but instead, the
        data will be returned.
    fuzzy : bool (default=False)
        If set to ``True``, every cognate entry is indexed and only its first
        element is used as the cognate ID.

    Returns
    -------
    t : tuple or None
        Only returned if **tofile** is ``False``: a nested tuple consisting of
        two further tuples. The first containing precision, recall, and
        harmonic mean (F-scores) of the B-Cubed scores, the second containing
        the same values for the pair-scores. An F-score is reported as ``0.0``
        when precision and recall are both zero.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    .. This function also calculates the "transformation" score. This score is
    .. based on the calculation of steps that are needed to transform one cluster
    .. for one set of meanings into the other. Ideally, if there are *n* different
    .. cognate sets covering one gloss in the gold standard, the minimal length of
    .. a mapping to convert the *m* cognate sets of the test set into the gold standard
    .. is *n*. In this case, both gold standard and test set are identical.
    .. However, if gold standard and test set differ, the number of mappings
    .. necessarily exceeds *m* and *n*. Based on this, the transformation
    .. precision is defined as :math:`\frac{m}{M}`, where *m* is the number of
    .. distinct clusters in the test set and *M* is the length of the mapping.
    .. Accordingly, the recall is defined as :math:`\frac{n}{M}`, where *n* is the
    .. number of clusters in the gold standard.

    .. Note that if precision is lower than 1.0, this means there are false
    .. positive decisions in the test set. Accordingly, a recall lower than 1.0
    .. indicates that there are false negative decisions in the test set.
    .. The drawback of this score is that it is not sensitive regarding the
    .. distinct number of decisions in which gold standard and test set differ, so
    .. the recall can be very low although most of the words have been grouped
    .. accurately. The advantage is that it can be directly interpreted in terms
    .. of 'false positive/false negative' decisions.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or lex.filename
    # ``identity`` is only looked up when loans are not merged
    loan = abs if loans else identity

    def get_cogs(concept, ref, bidx):
        """Relabel the cognate IDs of one concept by the position (taken
        from *bidx*) of the first word carrying each ID."""
        cogs = lex.get_list(row=concept, entry=ref, flat=True)
        if fuzzy:
            cogs = [i[0] for i in cogs]

        first_seen = {}
        for cog, pos in zip(cogs, bidx):
            first_seen.setdefault(loan(cog), pos)
        return [first_seen[loan(cog)] for cog in cogs]

    def get_pairs(cogs, idxs):
        """Yield every sorted pair of word indices sharing a cluster label."""
        clusters = defaultdict(list)
        for cog, idx in zip(cogs, idxs):
            clusters[cog].append(idx)
        for members in clusters.values():
            for idx_a, idx_b in combinations(members, r=2):
                yield tuple(sorted([idx_a, idx_b]))

    def get_bcubed_score(one, other):
        """Average, over all items, the share of items in the same cluster
        of *one* that also carry the same label in *other*."""
        clusters = defaultdict(list)
        for x, y in zip(one, other):
            clusters[x].append(y)
        bcp = 0.0
        for members in clusters.values():
            for y in members:
                bcp += members.count(y) / len(members)
        return bcp / len(one)

    def f_score(precision, recall):
        """Harmonic mean of precision and recall; 0.0 if both are zero."""
        if not precision + recall:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    # open file
    f = codecs.open(filename + '.diff', 'w', 'utf-8') if tofile else None

    # the handle is closed in ``finally`` so that it does not leak when any
    # of the computations below raises
    try:
        # get a formatter for language names
        lform = '{0:' + str(max([len(l) for l in lex.cols])) + '}'

        preB, recB = [], []
        preP, recP = [], []

        for concept in lex.concepts:
            idxs = lex.get_list(row=concept, flat=True)
            # get the basic index for all seqs
            bidx = list(range(1, len(idxs) + 1))

            cogsG = get_cogs(concept, gold, bidx)
            cogsT = get_cogs(concept, test, bidx)

            if cogsG == cogsT:
                # identical partitions: perfect scores, nothing to report
                for scores in (preB, recB, preP, recP):
                    scores.append(1.0)
                continue

            # calculate the bcubed precision and recall for the sets
            preB.append(get_bcubed_score(cogsT, cogsG))
            recB.append(get_bcubed_score(cogsG, cogsT))

            # calculate pair precision and recall
            pairsG = set(get_pairs(cogsG, idxs))
            pairsT = set(get_pairs(cogsT, idxs))
            shared = len(pairsT.intersection(pairsG))

            preP.append(shared / len(pairsT) if pairsT else 1.0)
            recP.append(shared / len(pairsG) if pairsG else 1.0)

            if not tofile:
                continue

            fp = "no" if preP[-1] == 1.0 else "yes"
            fn = "no" if recP[-1] == 1.0 else "yes"
            f.write(
                "Concept: {0}, False Positives: {1}, False Negatives: {2}\n".format(
                    concept, fp, fn))

            # get the words
            words = [lex[i, 'ipa'] for i in idxs]
            langs = [lex[i, 'taxa'] for i in idxs]

            # get a word-formater
            wform = '{0:' + str(max([len(w) for w in words])) + '}'

            # write differences to file
            for word, lang, cG, cT in sorted(
                    zip(words, langs, cogsG, cogsT),
                    key=lambda x: (x[2], x[3])):
                f.write('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                    lform.format(lang), wform.format(word), cG, cT))
            f.write('#\n')

        bp = sum(preB) / len(preB)
        br = sum(recB) / len(recB)
        bf = f_score(bp, br)
        pp = sum(preP) / len(preP)
        pr = sum(recP) / len(recP)
        pf = f_score(pp, pr)

        if pprint:
            print('**************************')
            print('* B-Cubed-Scores         *')
            print('* ---------------------- *')
            print('* B-C.-Precision: {0:.4f} *'.format(bp))
            print('* B-C.-Recall:    {0:.4f} *'.format(br))
            print('* B-C.-F-Scores:  {0:.4f} *'.format(bf))
            print('**************************')
            print('')
            print('**************************')
            print('* Pair-Scores            *')
            print('* ---------------------- *')
            print('* Pair-Precision: {0:.4f} *'.format(pp))
            print('* Pair-Recall:    {0:.4f} *'.format(pr))
            print('* Pair-F-Scores:  {0:.4f} *'.format(pf))
            print('**************************')

        if tofile:
            f.write('B-Cubed Scores:\n')
            f.write('Precision: {0:.4f}\n'.format(bp))
            f.write('Recall:    {0:.4f}\n'.format(br))
            f.write('F-Score:   {0:.4f}\n'.format(bf))
            f.write('#\n')
            f.write('Pair Scores:\n')
            f.write('Precision: {0:.4f}\n'.format(pp))
            f.write('Recall:    {0:.4f}\n'.format(pr))
            f.write('F-Score:   {0:.4f}\n'.format(pf))
    finally:
        if f is not None:
            f.close()

    if tofile:
        log.file_written(filename + '.diff')
    else:
        return (bp, br, bf), (pp, pr, pf)
Ejemplo n.º 16
0
 def __exit__(self, type, value, traceback):
     """
     Close ``self.fp`` and, when ``self.log`` is truthy, report ``self.path``
     through ``file_written``.

     Nothing is returned, so an exception raised inside the ``with`` body is
     not suppressed.
     """
     self.fp.close()
     if not self.log:
         return
     written = _str_path(self.path)
     file_written(written)
Ejemplo n.º 17
0
def plot_gls(
    gls,
    treestring,
    degree=90,
    fileformat='pdf',
    **keywords
):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls :
        The gain-loss scenario; it is handed unchanged to ``gls2gml``.
    treestring :
        The reference tree. It is first passed to ``cg.LoadTree``; if loading
        fails it is used as the tree object as it is. It is also handed to
        ``radial_layout`` to compute the node coordinates.
    degree : int (default=90)
        Passed to ``radial_layout``.
    fileformat : str (default="pdf")
        Extension appended to the filename passed to ``plt.savefig``.
    **keywords
        Optional settings, each falling back to the value in ``defaults``
        below: ``figsize``, ``left``, ``right``, ``top``, ``bottom``,
        ``radius``, ``scale_radius``, ``textsize``, ``edgewidth``,
        ``linewidth``, ``xlim``, ``ylim``, ``text``, ``gain_color``,
        ``loss_color``, ``gain_linestyle``, ``loss_linestyle``,
        ``ax_linewidth`` and ``filename``.

    Notes
    -----
    Node states are read from the ``state`` attribute of the graph returned
    by ``gls2gml``: ``"O"`` and ``"o"`` are drawn in the gain color (label
    ``1``), every other state in the loss color (label ``0``). ``"O"`` and
    ``"L"`` get the full radius and the configured linestyle, all other
    states a radius reduced by ``scale_radius``.
    """

    # get keywords
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        radius=0.5,
        textsize=8,
        edgewidth=5,
        linewidth=2,
        scale_radius=1.2,
        ylim=1,
        xlim=1,
        text=True,
        gain_color='white',
        loss_color='black',
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        filename=rcParams['filename']
    )

    # only fill in what the caller did not specify
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # best-effort loading: fall back to the raw argument if it cannot be
    # parsed; ``Exception`` keeps KeyboardInterrupt/SystemExit propagating
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)

    graph = gls2gml(
        gls,
        tgraph,
        tree
    )

    # collect coordinates and state of all nodes
    nodes = [
        (data['graphics']['x'], data['graphics']['y'], data['state'])
        for _, data in graph.nodes(data=True)
    ]

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # draw the edges of the tree below the nodes
    for nA, nB in graph.edges():
        gA = graph.node[nA]['graphics']
        gB = graph.node[nB]['graphics']

        plt.plot(
            [gA['x'], gB['x']],
            [gA['y'], gB['y']],
            '-',
            color='black',
            linewidth=keywords['edgewidth'],
            zorder=1
        )

    # now, iterate over nodes
    for x, y, s in nodes:
        # exact comparison: a substring test (``s in 'Oo'``) would also
        # accept the empty string and label a loss wedge as a gain
        is_gain = s in ('O', 'o')
        # "O" and "L" are the states drawn large and with a linestyle
        is_event = s in ('O', 'L')

        if is_gain:
            color = keywords['gain_color']
            linestyle = keywords['gain_linestyle']
        else:
            color = keywords['loss_color']
            linestyle = keywords['loss_linestyle']

        wedge_kw = dict(facecolor=color, linewidth=keywords['linewidth'])
        if is_event:
            radius = keywords['radius']
            wedge_kw['linestyle'] = linestyle
        else:
            radius = keywords['radius'] / keywords['scale_radius']

        figsp.add_artist(
            mpl.patches.Wedge((x, y), radius, 0, 360, **wedge_kw))

        # if text is chosen as argument
        if keywords['text']:
            if is_gain:
                t, c = '1', 'black'
            else:
                t, c = '0', 'white'

            plt.text(
                x,
                y,
                t,
                size=keywords['textsize'],
                color=c,
                va="center",
                ha="center",
                fontweight='bold'
            )

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(
        filename + '.' + fileformat
    )
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 18
0
 def __exit__(self, type, value, traceback):
     """
     Close ``self.fp`` and, when ``self.log`` is truthy, report ``self.path``
     through ``file_written``.

     Nothing is returned, so an exception raised inside the ``with`` body is
     not suppressed.
     """
     self.fp.close()
     if not self.log:
         return
     written = _str_path(self.path)
     file_written(written)
Ejemplo n.º 19
0
def plot_tree(treestring,
              degree=90,
              fileformat='pdf',
              root="root",
              **keywords):
    """
    Plot a Newick tree to PDF or other graphical formats.

    Parameters
    ----------
    treestring : str
        A string in Newick format.
    degree : int
        Determine the degree of the tree (this determines how "circular" the
        tree will be).
    fileformat : str (default="pdf")
        Select the fileformat to which the tree shall be written.
    root : str (default="root")
        Passed to ``radial_layout``. Nodes whose label starts with this
        string (or with ``"edge"``) are drawn as plain markers instead of
        text labels.
    filename : str
        Determine the name of the file to which the data shall be written.
        Defaults to ``rcParams['filename']``.
    figsize : tuple (default=(10,10))
        Determine the size of the figure.
    node_dict : dict (default={})
        Per-node settings, keyed by node name; they take precedence over the
        global keywords when a node is drawn.
    labels : dict
        Optional mapping from node labels to the text that is plotted
        instead.
    edge_list : list (default=[])
        Additional ``(nodeA, nodeB, data)`` edges that are drawn on top of
        the edges of the tree.
    no_labels : bool (default=False)
        If set, all nodes are drawn as markers and no text is plotted.
    usetex : bool (default=False)
        Switch to the ``pgf`` backend if set, to ``TkAgg`` otherwise.
    latex_preamble : str (default=False)
        If given, stored in ``mpl.rcParams['pgf.preamble']``.
    xlim, ylim : number (default=5)
        Margin added on both sides of the respective axis, unless one of
        ``xliml``/``xlimr`` (or ``ylimb``/``ylimt``) is given, in which case
        those values are used for the left/right (bottom/top) margin.

    Notes
    -----
    All remaining keywords in ``default`` below (colors, sizes, subplot
    margins, ...) can be overridden in the same way.
    """

    default = dict(
        ax_linewidth=0,
        bg='black',
        bottom=0.05,
        change=lambda x: x**1.75,
        edge_list=[],
        figsize=(10, 10),
        filename=rcParams['filename'],
        fontweight='bold',
        frameon=False,
        ha='center',
        labels=[],
        left=0.05,
        linecolor='black',
        linewidth=5,
        no_labels=False,
        node_dict={},
        nodecolor='black',
        nodesize=10,
        right=0.95,
        start=0,
        textcolor='white',
        textsize='10',
        top=0.95,
        usetex=False,
        va='center',
        xlim=5,
        xliml=False,
        xlimr=False,
        ylim=5,
        ylimb=False,
        ylimt=False,
        rotation_mode='anchor',
        latex_preamble=False,
    )
    # only fill in what the caller did not specify
    for key, value in default.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
        # this rcParam no longer exists in recent matplotlib releases, where
        # assigning it would raise a KeyError
        if 'text.latex.unicode' in mpl.rcParams:
            mpl.rcParams['text.latex.unicode'] = True
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # get the tree-graph
    graph = radial_layout(treestring,
                          degree=degree,
                          change=keywords['change'],
                          start=keywords['start'],
                          root=root)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    # plt.axes(frameon=keywords['frameon'])
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])

    # get xlim and ylim
    xvals, yvals = [], []
    # start iterating over edges
    for nA, nB, d in list(graph.edges(data=True)) + keywords['edge_list']:

        # get the coordinates
        gA = graph.node[nA]['graphics']
        gB = graph.node[nB]['graphics']
        xs = [gA['x'], gB['x']]
        ys = [gA['y'], gB['y']]

        # edges carrying their own color are drawn with their own settings
        if 'color' in d:
            plt.plot(xs, ys, '-', **d)
        else:
            plt.plot(
                xs,
                ys,
                '-',
                color=keywords['linecolor'],
                linewidth=keywords['linewidth'],
            )

    # get the nodes
    for n, d in graph.nodes(data=True):

        g = d['graphics']
        x, y = g['x'], g['y']

        xvals.append(x)
        yvals.append(y)

        # try to get information from the node-dict; nodes without (usable)
        # individual settings simply fall back to the global keywords
        try:
            settings = {}
            settings.update(keywords['node_dict'][n])
        except Exception:
            settings = {}

        # complete the node settings with the global keywords
        for key, value in keywords.items():
            settings.setdefault(key, value)

        if d['label'].startswith('edge') \
                or d['label'].startswith(root) or keywords['no_labels']:
            plt.plot(x,
                     y,
                     'o',
                     markersize=settings['nodesize'],
                     color=settings['nodecolor'],
                     markeredgewidth=settings['linewidth'])
        else:
            # use the replacement label if one is defined for this node
            try:
                label = keywords['labels'][d['label']]
            except (KeyError, IndexError, TypeError):
                label = d['label']

            if 'rotation' in settings:
                r = settings['rotation']
            else:
                r = g['angle']

            plt.text(
                x,
                y,
                label,
                color=settings['textcolor'],
                fontweight=settings['fontweight'],
                va=settings['va'],
                ha=g['s'],
                bbox=dict(
                    facecolor=settings['bg'],
                    boxstyle='square,pad=0.2',
                    ec="none",
                ),
                size=settings['textsize'],
                rotation=r,
                rotation_mode=settings['rotation_mode'])

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 20
0
def plot_concept_evolution(scenarios,
                           tree,
                           fileformat='pdf',
                           degree=90,
                           **keywords):
    """
    Plot the evolution according to the MLN method of all words for a given concept.

    Parameters
    ----------
    scenarios : list
        A sequence of ``(pap, gls)`` pairs: the identifier of a pattern and
        the gain-loss scenario that is handed to ``gls2gml`` for it. It needs
        to contain at least one pair, since the coordinates of the nodes are
        taken from the graph of the last scenario.
    tree : str
        A tree representation in Newick format.
    fileformat : str (default="pdf")
        A valid fileformat according to Matplotlib.
    degree : int (default=90)
        The degree by which the tree is drawn. 360 yields a circular tree, 180
        yields a tree filling half of the space of a circle.
    **keywords
        Optional settings; every key of ``defaults`` below can be overridden.
        Among them:

        * ``filename``: name of the output file without extension,
        * ``colors``: mapping from pattern to color (derived from
          ``colormap`` if empty),
        * ``labels``: mapping from taxon to the label plotted for it; tip
          labels are only plotted if this mapping is not empty,
        * ``edges``: additional ``(nodeA, nodeB)`` pairs connected by lines
          styled with the ``hedge_*`` keywords,
        * ``legend``: whether the two legends are drawn,
        * ``gain_linestyle`` / ``loss_linestyle``: line styles of the wedges
          marking gain and loss events.
    """

    # make defaults
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        colormap=mpl.cm.jet,
        edgewidth=5,
        radius=2.5,
        outer_radius=0.5,
        inner_radius=0.25,
        cognates='',
        usetex=False,
        latex_preamble=False,
        textsize=8,
        change=lambda x: x**1.75,
        xlim=0,
        ylim=0,
        xlimr=False,
        xliml=False,
        ylimt=False,
        ylimb=False,
        rootsize=10,
        legend=True,
        legendsize=5,
        legendAloc='upper right',
        legendBloc='lower right',
        markeredgewidth=2.5,
        wedgeedgewidth=2,
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        labels={},
        _prefix='-   ',
        _suffix='   -',
        colors={},
        start=0,
        filename=rcParams['filename'],
        loss_alpha=0.1,
        loss_background='0.75',
        edges=[],
        hedge_color="black",
        hedge_width=5,
        hedge_linestyle='dashed',
    )
    # only fill in what the caller did not specify; ``keywords.update``
    # would silently replace every user setting with its default
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # XXX customize later XXX
    colormap = keywords['colormap']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    # check for preamble settings
    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # make a graph
    graph = nx.Graph()

    # get the tgraph
    tgraph = radial_layout(tree,
                           degree=degree,
                           change=keywords['change'],
                           start=keywords['start'])

    # get the taxa
    taxa = [n[0] for n in tgraph.nodes(data=True) if n[1]['tip']]

    # set the labels, falling back to the name of the taxon
    labels = {}
    for taxon in taxa:
        if taxon in keywords['labels']:
            labels[taxon] = keywords['labels'][taxon]
        else:
            labels[taxon] = taxon

    # get the number of paps in order to get the right colors
    cfunc = np.array(np.linspace(10, 256, len(scenarios)), dtype='int')

    if not keywords['colors']:
        colors = {
            scenario[0]: mpl.colors.rgb2hex(colormap(cfunc[i]))
            for i, scenario in enumerate(scenarios)
        }
    else:
        colors = keywords['colors']

    # get the wedges for the paps: every pap gets an equal slice of the circle
    linsp = np.linspace(0, 360, len(scenarios) + 1)
    wedges = {
        scenario[0]: (linsp[i], linsp[i + 1])
        for i, scenario in enumerate(scenarios)
    }

    if keywords['legend']:

        # set the linestyle for the legend: translate the linestyle name to
        # the format string expected by ``plt.plot``; values that are not a
        # known name are passed on unchanged instead of leaving ``ls``
        # undefined
        ls = {
            'solid': '-',
            'dashed': '--',
            'dashdot': '-.',
            'dotted': ':',
        }.get(keywords['gain_linestyle'], keywords['gain_linestyle'])

        legendEntriesA = []
        legendTextA = []

        # add stuff for the legend
        for pap, _ in scenarios:
            w = mpl.patches.Wedge((0, 0),
                                  1,
                                  wedges[pap][0],
                                  wedges[pap][1],
                                  facecolor=colors[pap],
                                  zorder=1,
                                  linewidth=keywords['wedgeedgewidth'],
                                  edgecolor='black')
            legendEntriesA.append(w)
            legendTextA.append(pap)

        # second legend explains evolution
        legendEntriesB = []
        legendTextB = []
        p = mpl.patches.Wedge(
            (0, 0),
            1,
            0,
            360,
            facecolor='0.5',
            linewidth=keywords['wedgeedgewidth'],
            edgecolor='black',
        )
        legendEntriesB.append(p)
        legendTextB.append('Loss Event')
        p, = plt.plot(0,
                      0,
                      ls,
                      color='black',
                      linewidth=keywords['wedgeedgewidth'])
        legendEntriesB.append(p)
        legendTextB.append('Gain Event')

        # overwrite stuff
        plt.plot(0, 0, 'o', markersize=2, zorder=2, color='white')

    # iterate over the paps and append states to the graph
    for pap, gls in scenarios:

        # get the graph with the model
        g = gls2gml(gls, tgraph, tree, filename='')

        # iterate over the graph
        for n, d in g.nodes(data=True):

            # add the node if necessary
            if n not in graph:
                graph.add_node(n)

            # add a pap-dictionary if it's not already there
            if 'pap' not in graph.node[n]:
                graph.node[n]['pap'] = {}

            # add data
            graph.node[n]['pap'][pap] = d['state']

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    xvals = []
    yvals = []

    # iterate over edges first; ``g`` is the graph of the last scenario and
    # is used from here on as the source of the coordinates
    for nA, nB in g.edges():
        gA = g.node[nA]['graphics']
        gB = g.node[nB]['graphics']

        plt.plot([gA['x'], gB['x']], [gA['y'], gB['y']],
                 '-',
                 color='black',
                 linewidth=keywords['edgewidth'])

    # add horizontal edges if this option is chosen
    for nA, nB in keywords['edges'] or []:
        gA = g.node[nA]['graphics']
        gB = g.node[nB]['graphics']

        plt.plot([gA['x'], gB['x']], [gA['y'], gB['y']],
                 '-',
                 color=keywords['hedge_color'],
                 linewidth=keywords["hedge_width"],
                 linestyle=keywords['hedge_linestyle'])

    # radius of the ring drawn behind the wedges of a node
    ring_radius = keywords['radius'] + keywords['outer_radius']

    # now iterate over the nodes
    for n, d in graph.nodes(data=True):
        cpaps = d['pap']
        states = set(cpaps.values())
        x, y = g.node[n]['graphics']['x'], g.node[n]['graphics']['y']

        # get z-value which serves as zorder attribute; this only works if
        # ``tree`` offers ``getConnectingEdges``, otherwise all nodes share
        # the same base level
        try:
            z = 6 * len(tree.getConnectingEdges('root', n))
        except Exception:
            z = 0

        xvals.append(x)
        yvals.append(y)

        # plot the default marker
        plt.plot(x,
                 y,
                 'o',
                 markersize=keywords['rootsize'],
                 color='black',
                 zorder=50)

        # check for origins in cpaps
        if 'O' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                ring_radius,
                0,
                360,
                facecolor='white',
                zorder=57 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle=keywords['gain_linestyle'],
            ))
        # check for retentions
        elif 'o' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                ring_radius,
                0,
                360,
                facecolor='white',
                zorder=56 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle='solid',
            ))

        # check for losses, with or without a gain on the same node
        if 'L' in states and 'O' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                ring_radius,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=58 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
                linestyle=keywords['loss_linestyle'],
            ))
        elif 'L' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                ring_radius,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=59 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
            ))

        # plot all wedges
        for pap, state in cpaps.items():

            theta1, theta2 = wedges[pap]
            color = colors[pap]

            # states other than loss, retention and origin are not drawn
            if state not in ('L', 'o', 'O'):
                continue

            wedge_kw = dict(
                facecolor=color,
                zorder=61 + z,
                linewidth=keywords['wedgeedgewidth'],
                edgecolor='black',
            )
            if state == 'L':
                # losses are drawn transparent
                wedge_kw['alpha'] = keywords['loss_alpha']
                wedge_kw['linestyle'] = keywords['loss_linestyle']
            elif state == 'O':
                wedge_kw['linestyle'] = keywords['gain_linestyle']

            figsp.add_artist(mpl.patches.Wedge(
                (x, y), keywords['radius'], theta1, theta2, **wedge_kw))

        # add the labels if this option is chosen and the node is a tip
        if keywords['labels'] and tgraph.node[n]['tip']:

            # get the values
            gf = tgraph.node[n]['graphics']
            r = gf['angle']
            x, y = gf['x'], gf['y']
            ha = gf['s']

            # modify the text
            if ha == 'left':
                text = keywords['_prefix'] + labels[n]
            else:
                text = labels[n] + keywords['_suffix']

            # plot the text
            plt.text(x,
                     y,
                     text,
                     size=keywords['textsize'],
                     va='center',
                     ha=ha,
                     fontweight='bold',
                     color='black',
                     rotation=r,
                     rotation_mode='anchor',
                     zorder=z)

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl, xr = 2 * [keywords['xlim']]
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb, yt = 2 * [keywords['ylim']]
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    prop = mpl.font_manager.FontProperties(size=keywords['legendsize'])

    if keywords['legend']:
        legend1 = plt.legend(legendEntriesA,
                             legendTextA,
                             loc=keywords['legendAloc'],
                             numpoints=1,
                             prop=prop)
        plt.legend(legendEntriesB,
                   legendTextB,
                   loc=keywords['legendBloc'],
                   prop=prop)
        # the second call to ``plt.legend`` replaces the first legend, so
        # the first one is added back as a separate artist
        figsp.add_artist(legend1)

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 21
0
def diff(wordlist,
         gold='cogid',
         test='lexstatid',
         modify_ref=False,
         pprint=True,
         filename='',
         tofile=True,
         transcription="ipa"):
    r"""
    Write differences in classifications on an item-basis to file.

    Parameters
    ----------
    wordlist : :py:class:`lingpy.compare.lexstat.LexStat`
        The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the
        computation. It should have two columns indicating cognate IDs.
    gold : str (default='cogid')
        The name of the column containing the gold standard cognate
        assignments.
    test : str (default='lexstatid')
        The name of the column containing the automatically implemented cognate
        assignments.
    modify_ref : function (default=False)
        Use a function to modify the reference. If your cognate identifiers
        are numerical, for example, and negative values are assigned as
        loans, but you want to suppress this behaviour, just set this
        keyword to "abs", and all cognate IDs will be converted to their
        absolute value.
    pprint : bool (default=True)
        Print out the results
    filename : str (default='')
        Name of the output file. If not specified, it is identical with the
        name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the
        extension ``diff``.
    tofile : bool (default=True)
        If set to ``False``, no data will be written to file, but instead, the
        data will be returned.
    transcription : str (default="ipa")
        The column in which the transcriptions are located (should be a
        string, no segmentized version, for convenience of writing to file).

    Returns
    -------
    t : tuple or None
        Only returned if **tofile** is ``False``: a nested tuple consisting of
        two further tuples. The first containing precision, recall, and
        harmonic mean (F-scores) of the B-Cubed scores, the second containing
        the same values for the pair-scores. If **tofile** is ``True``, the
        function returns ``None``.

    Notes
    -----
    If the **tofile** option is chosen, the results are written to a specific
    file with the extension ``diff``. This file contains all cognate sets in
    which there are differences between gold standard and test sets. It also
    gives detailed information regarding false positives, false negatives, and
    the words involved in these wrong decisions.

    The F-score is reported as 0.0 when precision and recall are both 0.

    See also
    --------
    bcubes
    pairs
    """
    filename = filename or wordlist.filename
    loan = modify_ref if modify_ref else identity

    # The report is collected in memory and written in a single ``with``
    # block at the end, so no file handle can leak if scoring fails.
    report = []

    # get a formatter for language names
    lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}'

    preB, recB = [], []
    preP, recP = [], []

    def get_pairs(cogs, idxs):
        # yield every (sorted) pair of word indices sharing a cognate ID
        tmp = defaultdict(list)
        for x, y in zip(cogs, idxs):
            tmp[x].append(y)
        for x in tmp:
            for yA, yB in combinations(tmp[x], r=2):
                yield tuple(sorted([yA, yB]))

    def f_score(precision, recall):
        # harmonic mean, guarded against precision == recall == 0
        if not precision + recall:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    for concept in wordlist.rows:
        idxs = wordlist.get_list(row=concept, flat=True)

        cogsG = _get_cogs(gold, concept, loan, wordlist)
        cogsT = _get_cogs(test, concept, loan, wordlist)

        # identical partitions score perfectly and produce no report entry
        if cogsG == cogsT:
            preB.append(1.0)
            recB.append(1.0)
            preP.append(1.0)
            recP.append(1.0)
            continue

        # b-cubed precision and recall for this concept
        preB.append(_get_bcubed_score(cogsT, cogsG))
        recB.append(_get_bcubed_score(cogsG, cogsT))

        # pair precision and recall for this concept
        pairsG = set(get_pairs(cogsG, idxs))
        pairsT = set(get_pairs(cogsT, idxs))
        common = len(pairsT.intersection(pairsG))
        preP.append(common / len(pairsT) if pairsT else 1.0)
        recP.append(common / len(pairsG) if pairsG else 1.0)

        if not tofile:
            continue

        fp = "no" if preP[-1] == 1.0 else "yes"
        fn = "no" if recP[-1] == 1.0 else "yes"
        report.append(
            "Concept: {0}, False Positives: {1}, False Negatives: {2}\n"
            .format(concept, fp, fn))

        # get the words from the column requested by the caller
        words = [wordlist[i, transcription] for i in idxs]
        langs = [wordlist[i, 'taxa'] for i in idxs]

        # get a word-formatter
        wform = '{0:' + str(max([len(w) for w in words])) + '}'

        # list the words of this concept, ordered by gold and test cognate ID
        for word, lang, cG, cT in sorted(zip(words, langs, cogsG, cogsT),
                                         key=lambda x: (x[2], x[3])):
            report.append('{0}\t{1}\t{2:4}\t{3:4}\n'.format(
                lform.format(lang), wform.format(word), cG, cT))
        report.append('#\n')

    bp = sum(preB) / len(preB)
    br = sum(recB) / len(recB)
    bf = f_score(bp, br)
    pp = sum(preP) / len(preP)
    pr = sum(recP) / len(recP)
    pf = f_score(pp, pr)

    as_string(_format_results('B-Cubed', bp, br, bf) +
              _format_results('Pair', pp, pr, pf),
              pprint=pprint)

    if tofile:
        report.extend([
            'B-Cubed Scores:\n',
            'Precision: {0:.4f}\n'.format(bp),
            'Recall:    {0:.4f}\n'.format(br),
            'F-Score:   {0:.4f}\n'.format(bf),
            '#\n',
            'Pair Scores:\n',
            'Precision: {0:.4f}\n'.format(pp),
            'Recall:    {0:.4f}\n'.format(pr),
            'F-Score:   {0:.4f}\n'.format(pf),
        ])
        with codecs.open(filename + '.diff', 'w', 'utf-8') as f:
            f.write(''.join(report))
        log.file_written(filename + '.diff')
    else:
        return (bp, br, bf), (pp, pr, pf)
Ejemplo n.º 22
0
def plot_concept_evolution(
    scenarios,
    tree,
    fileformat='pdf',
    degree=90,
    **keywords
):
    """
    Plot the evolution according to the MLN method of all words for a given concept.

    Parameters
    ----------
    scenarios : list
        A non-empty sequence of ``(pap, gls)`` pairs. The first item is used
        as the key for colors, wedges and legend entries, the second is
        handed to ``gls2gml`` to obtain the node states ("O", "o", "L") that
        are drawn.
    tree : str
        A tree representation in Newick format.
        NOTE(review): the z-order lookup calls ``tree.getConnectingEdges``,
        so a tree object seems to be expected here; with any other value the
        lookup fails and the z-order falls back to 0 -- confirm with callers.
    fileformat : str (default="pdf")
        A valid fileformat according to Matplotlib.
    degree : int (default=90)
        The degree by which the tree is drawn. 360 yields a circular tree, 180
        yields a tree filling half of the space of a circle.
    **keywords
        Plot options; every key of the ``defaults`` dictionary below may be
        overridden (e.g. ``figsize``, ``colormap``, ``radius``, ``labels``,
        ``colors``, ``legend``, ``edges``, ``filename``, ``usetex``).

    Raises
    ------
    ValueError
        If ``scenarios`` is empty.

    Notes
    -----
    The figure is written to ``<filename>.<fileformat>``, where ``filename``
    is taken from the keywords.
    """
    if len(scenarios) == 0:
        raise ValueError("At least one scenario is needed for plotting.")

    # make defaults
    defaults = dict(
        figsize=(15, 15),
        left=0.05,
        top=0.95,
        bottom=0.05,
        right=0.95,
        colormap=mpl.cm.jet,
        edgewidth=5,
        radius=2.5,
        outer_radius=0.5,
        inner_radius=0.25,
        cognates='',
        usetex=False,
        latex_preamble=False,
        textsize=8,
        change=lambda x: x ** 1.75,
        xlim=0,
        ylim=0,
        xlimr=False,
        xliml=False,
        ylimt=False,
        ylimb=False,
        rootsize=10,
        legend=True,
        legendsize=5,
        legendAloc='upper right',
        legendBloc='lower right',
        markeredgewidth=2.5,
        wedgeedgewidth=2,
        gain_linestyle='dotted',
        loss_linestyle='solid',
        ax_linewidth=0,
        labels={},
        _prefix='-   ',
        _suffix='   -',
        colors={},
        start=0,
        filename=rcParams['filename'],
        loss_alpha=0.1,
        loss_background='0.75',
        edges=[],
        hedge_color="black",
        hedge_width=5,
        hedge_linestyle='dashed',
    )
    # Only fill in what the caller left out: values passed by the caller
    # must take precedence over the defaults.
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # set filename as variable for convenience
    filename = keywords['filename']

    # XXX customize later XXX
    colormap = keywords['colormap']

    # switch backend, depending on whether tex is used or not
    backend = mpl.get_backend()
    if keywords['usetex'] and backend != 'pgf':
        plt.switch_backend('pgf')
    elif not keywords['usetex'] and backend != 'TkAgg':
        plt.switch_backend('TkAgg')

    # check for preamble settings
    if keywords['latex_preamble']:
        mpl.rcParams['pgf.preamble'] = keywords['latex_preamble']

    # make a graph
    graph = nx.Graph()

    # get the tgraph
    tgraph = radial_layout(
        tree,
        degree=degree,
        change=keywords['change'],
        start=keywords['start']
    )

    # get the taxa
    taxa = [node for node, data in tgraph.nodes(data=True) if data['tip']]

    # set the labels, falling back to the taxon name itself
    custom_labels = keywords['labels']
    labels = {
        taxon: custom_labels[taxon] if taxon in custom_labels else taxon
        for taxon in taxa
    }

    # get the number of paps in order to get the right colors
    cfunc = np.array(np.linspace(10, 256, len(scenarios)), dtype='int')

    if not keywords['colors']:
        colors = {scenario[0]: mpl.colors.rgb2hex(colormap(cidx))
                  for scenario, cidx in zip(scenarios, cfunc)}
    else:
        colors = keywords['colors']

    # get the wedges for the paps: each pap gets an equal slice of the circle
    wedges = {}
    linsp = np.linspace(0, 360, len(scenarios) + 1)
    for i, scenario in enumerate(scenarios):
        wedges[scenario[0]] = (linsp[i], linsp[i + 1])

    if keywords['legend']:

        # Translate the linestyle name into the shorthand needed for the
        # format string of plt.plot; unknown names fall back to a solid line
        # instead of leaving the variable undefined.
        shorthand = {'dotted': ':', 'dashed': '--', 'solid': '-',
                     'dashdot': '-.'}
        gain_style = keywords['gain_linestyle']
        if gain_style in shorthand:
            ls = shorthand[gain_style]
        elif gain_style in (':', '--', '-', '-.'):
            ls = gain_style
        else:
            ls = '-'

        legendEntriesA = []
        legendTextA = []

        # add stuff for the legend
        for pap, _ in scenarios:
            w = mpl.patches.Wedge(
                (0, 0),
                1,
                wedges[pap][0],
                wedges[pap][1],
                facecolor=colors[pap],
                zorder=1,
                linewidth=keywords['wedgeedgewidth'],
                edgecolor='black'
            )
            legendEntriesA.append(w)
            legendTextA.append(pap)

        # second legend explains evolution
        legendEntriesB = []
        legendTextB = []
        p = mpl.patches.Wedge(
            (0, 0),
            1,
            0,
            360,
            facecolor='0.5',
            linewidth=keywords['wedgeedgewidth'],
            edgecolor='black',
        )
        legendEntriesB.append(p)
        legendTextB.append('Loss Event')
        p, = plt.plot(
            0, 0,
            ls,
            color='black',
            linewidth=keywords['wedgeedgewidth']
        )
        legendEntriesB.append(p)
        legendTextB.append('Gain Event')

        # overwrite stuff
        plt.plot(0, 0, 'o', markersize=2, zorder=2, color='white')

    # iterate over the paps and append states to the graph
    # NOTE(review): ``Graph.node`` is kept as in the original code; newer
    # networkx releases only offer ``Graph.nodes`` -- confirm the pinned
    # networkx version before upgrading.
    for pap, gls in scenarios:

        # get the graph with the model
        g = gls2gml(
            gls,
            tgraph,
            tree,
            filename=''
        )

        # collect the state of every node for this pap
        for n, d in g.nodes(data=True):
            if n not in graph:
                graph.add_node(n)
            graph.node[n].setdefault('pap', {})[pap] = d['state']

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    for s in figsp.spines.values():
        s.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    xvals = []
    yvals = []

    # iterate over edges first (coordinates come from the last scenario graph)
    for nA, nB in g.edges():
        gA = g.node[nA]['graphics']
        gB = g.node[nB]['graphics']

        plt.plot(
            [gA['x'], gB['x']],
            [gA['y'], gB['y']],
            '-',
            color='black',
            linewidth=keywords['edgewidth']
        )

    # add horizontal edges if this option is chosen
    for nA, nB in keywords['edges']:
        gA = g.node[nA]['graphics']
        gB = g.node[nB]['graphics']

        # no format string here: the linestyle keyword decides the style
        plt.plot(
            [gA['x'], gB['x']],
            [gA['y'], gB['y']],
            color=keywords['hedge_color'],
            linewidth=keywords["hedge_width"],
            linestyle=keywords['hedge_linestyle']
        )

    outer = keywords['radius'] + keywords['outer_radius']

    # now iterate over the nodes
    for n, d in graph.nodes(data=True):
        cpaps = d['pap']
        states = list(cpaps.values())
        x, y = g.node[n]['graphics']['x'], g.node[n]['graphics']['y']

        # get z-value which serves as zorder attribute; nodes for which the
        # path to the root cannot be determined are drawn at the bottom
        try:
            z = 6 * len(tree.getConnectingEdges('root', n))
        except Exception:
            z = 0

        xvals.append(x)
        yvals.append(y)

        # plot the default marker
        plt.plot(
            x,
            y,
            'o',
            markersize=keywords['rootsize'],
            color='black',
            zorder=50
        )

        # white background disc: origins take precedence over retentions
        if 'O' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor='white',
                zorder=57 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle=keywords['gain_linestyle'],
            ))
        elif 'o' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor='white',
                zorder=56 + z,
                linewidth=keywords['markeredgewidth'],
                linestyle='solid',
            ))

        # grey background disc for nodes on which a loss occurs
        if 'L' in states and 'O' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=58 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
                linestyle=keywords['loss_linestyle']
            ))
        elif 'L' in states:
            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                outer,
                0,
                360,
                facecolor=keywords['loss_background'],
                zorder=59 + z,
                linewidth=keywords['markeredgewidth'],
                edgecolor='black',
            ))

        # plot all wedges
        for pap in cpaps:
            theta1, theta2 = wedges[pap]
            state = cpaps[pap]

            # the state decides transparency and outline of the wedge
            if state == 'L':
                extra = dict(
                    alpha=keywords['loss_alpha'],
                    linestyle=keywords['loss_linestyle']
                )
            elif state == 'o':
                extra = {}
            elif state == 'O':
                extra = dict(linestyle=keywords['gain_linestyle'])
            else:
                continue

            figsp.add_artist(mpl.patches.Wedge(
                (x, y),
                keywords['radius'],
                theta1,
                theta2,
                facecolor=colors[pap],
                zorder=61 + z,
                linewidth=keywords['wedgeedgewidth'],
                edgecolor='black',
                **extra
            ))

        # add the labels if this option is chosen and the node is a tip
        if keywords['labels'] and tgraph.node[n]['tip']:

            # get the values
            gf = tgraph.node[n]['graphics']
            ha = gf['s']

            # modify the text
            if ha == 'left':
                text = keywords['_prefix'] + labels[n]
            else:
                text = labels[n] + keywords['_suffix']

            # plot the text
            plt.text(
                gf['x'],
                gf['y'],
                text,
                size=keywords['textsize'],
                va='center',
                ha=ha,
                fontweight='bold',
                color='black',
                rotation=gf['angle'],
                rotation_mode='anchor',
                zorder=z
            )

    # set up the xlimits
    if not keywords['xlimr'] and not keywords['xliml']:
        xl = xr = keywords['xlim']
    else:
        xl, xr = keywords['xliml'], keywords['xlimr']

    # set up the ylimits
    if not keywords['ylimt'] and not keywords['ylimb']:
        yb = yt = keywords['ylim']
    else:
        yb, yt = keywords['ylimb'], keywords['ylimt']

    plt.xlim((min(xvals) - xl, max(xvals) + xr))
    plt.ylim((min(yvals) - yb, max(yvals) + yt))

    prop = mpl.font_manager.FontProperties(size=keywords['legendsize'])

    if keywords['legend']:
        legend1 = plt.legend(
            legendEntriesA,
            legendTextA,
            loc=keywords['legendAloc'],
            numpoints=1,
            prop=prop
        )
        plt.legend(
            legendEntriesB,
            legendTextB,
            loc=keywords['legendBloc'],
            prop=prop
        )
        # the second call to plt.legend replaces the first legend, so it has
        # to be added back to the axes by hand
        figsp.add_artist(legend1)

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )

    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)
Ejemplo n.º 23
0
def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool str} (default=False)
        If set to ``False``, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    refB : str (default='')
        Name of a second column with cognate identifiers. If given, the
        cells above the diagonal are computed from ``ref`` and the cells
        below the diagonal from ``refB``; otherwise the matrix is symmetric.
    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=20)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=90)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,5))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.
    matrix : list or numpy.array (default=False)
        A precomputed matrix that is used instead of counting shared
        cognates.
    distances : bool (default=False)
        If set to ``True``, every cell ``c`` of the matrix is replaced by
        ``1 - c``.
    labels : dict (default={})
        Alternative names for the taxa written to the axes.

    Raises
    ------
    ValueError
        If no tree is available, or if ``normalized`` is neither false nor
        one of "jaccard" and "swadesh".

    Notes
    -----
    This function plots shared cognate percentages. Apart from the plot in
    ``<filename>.<fileformat>``, the matrix is written as text to
    ``<filename>.matrix``.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for key, value in defaults.items():
        keywords.setdefault(key, value)

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized and normalized not in ["jaccard", "swadesh"]:
        raise ValueError(
            "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    def shared_cognates(taxonA, taxonB, column):
        # score the cognates shared by two taxa in the given column
        if normalized in [False, "jaccard"]:
            cogsA = set(wordlist.get_list(
                taxa=taxonA, flat=True, entry=column))
            cogsB = set(wordlist.get_list(
                taxa=taxonB, flat=True, entry=column))

            shared = len(cogsA.intersection(cogsB))
            if not normalized:
                return shared

            # two taxa without any cognate sets share nothing
            union = len(cogsA.union(cogsB))
            return shared / union if union else 0.0

        cogsA = wordlist.get_dict(taxa=taxonA, entry=column)
        cogsB = wordlist.get_dict(taxa=taxonB, entry=column)

        shared = 0
        slots = 0

        # iterate over cognate sets in meaning slots; we follow the STARLING
        # procedure in ignoring missing data
        for key in cogsA:
            if key in cogsB:
                # check for shared items
                if [k for k in cogsA[key] if k in cogsB[key]]:
                    shared += 1
                slots += 1
        try:
            return shared / slots
        except ZeroDivisionError:
            log.warning(str(
                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
            return 0.0

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        # [0.01,0.1,0.2,0.7])
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        # use the order of the leaves in the dendrogram for the heatmap
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        for side in ('bottom', 'top', 'left', 'right'):
            ax1.spines[side].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        taxa = tree.taxa

    # A matrix passed by the caller may be a numpy array, whose truth value
    # is ambiguous, so arrays are tested by size instead of truthiness.
    user_matrix = keywords['matrix']
    if isinstance(user_matrix, np.ndarray):
        has_user_matrix = user_matrix.size > 0
    else:
        has_user_matrix = bool(user_matrix)

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if has_user_matrix:
        matrix = user_matrix
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    shared = shared_cognates(taxonA, taxonB, ref)
                    matrix[i][j] = shared

                    # mirror the value unless the lower triangle is
                    # reserved for refB
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    matrix[i][j] = shared_cognates(taxonA, taxonB, refB)

                elif i == j:
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        cogs = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )

    # [0.15,0.1,0.7,0.7])
    if keywords['distances']:
        for row in matrix:
            for j in range(len(row)):
                row[j] = 1 - row[j]
    nmatrix = [
        [keywords['vmax'], keywords['vmin']],
        [keywords['vmin'], keywords['vmax']]
    ]

    im = ax2.matshow(nmatrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks; the step must be at least 1, otherwise range() fails
    # for wordlists with only a few taxa
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))

    # modify taxon names if this is specified
    selected_taxa = [
        keywords['labels'][taxa[i]] if taxa[i] in keywords['labels']
        else taxa[i]
        for i in idxs
    ]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:

        plt.imshow(matrix, cmap=keywords['cmap'], visible=False, vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    # write the matrix as text, one row per taxon
    cell = '\t{0:.2f}' if normalized else '\t{0:3}'
    with open(filename + '.matrix', 'w') as f:
        for i, taxon in enumerate(taxa):
            f.write('{0:20}'.format(taxon))
            for value in matrix[i]:
                f.write(cell.format(value if normalized else int(value)))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)