Example #1
0
from csvw.metadata import TableGroup
from lingpy import util
from lingpy.convert.html import template_path

# Load the SplitsTree nexus template that ships with lingpy.
template = 'splitstree.nex'
tpath = util.Path(template_path(template))
# `exists` must be *called*: the bare bound method is always truthy, which
# made the error branch below unreachable (assumes util.Path is a
# pathlib-style Path -- TODO confirm).
if tpath.exists():
    _template = util.read_text_file(tpath.as_posix())
else:  # pragma: no cover
    raise IOError("Unknown template %s" % template)

# Read the CLDF structure dataset and index languages and parameters by ID.
# Every ID maps to a (position, name) pair; the position is later used as the
# row (language) or column (parameter) index into `matrix`.
tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json')
taxa = {t['ID']: (i, t['Name']) for i, t in
        enumerate(tbg.tabledict['languages.csv'])}
params = {t['ID']: (i, t['Name']) for i, t in
          enumerate(tbg.tabledict['parameters.csv'])}

# Presence/absence matrix: one row per language, one column per parameter.
# Cells start at 0 and are switched to 1 only where the value is '+'.
matrix = [[0] * len(params) for _ in taxa]
for row in tbg.tabledict['values.csv']:
    tidx, tname = taxa[row['Language_ID']]
    pidx, pname = params[row['Parameter_ID']]
    if row['Value'] == '+':
        matrix[tidx][pidx] = 1

# ASCII letters (lower and upper case) followed by the ten digits.
alpha = 'abcdefghijklmnopqrstuvwxyz'
alpha += alpha.upper()
alpha += '0123456789'

matrix_string = ''
# (position, name) pairs ordered by position, so that tax_list[i] describes
# the language stored in matrix[i].
tax_list = sorted(taxa.values(), key=lambda x: x[0])
for i, line in enumerate(matrix):
    matrix_string += '{0:12}'.format(''.join([x for x in tax_list[i][1] if 
Example #2
0
def write_nexus(taxa,
                matrix,
                custom=None,
                custom_name='lingpy',
                missing="?",
                gap="-",
                template="mrbayes.nex",
                filename="mrbayes.nex",
                dtype="RESTRICTION",
                symbols="10",
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    taxa : list
        The taxonomic units in your data. They should be valid taxon names,
        only consisting of alphanumeric characters and an underscore, usually
        also not exceeding a length of 15 characters.
    matrix : list
        The matrix with the values for each taxon in one separate row. Usually,
        the matrix contains binary values which can be passed as strings or
        integers (1 and 0), but missing values are also possible. Every value
        is converted to a string before it is written; a value whose string
        form is longer than one character is written in parentheses.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    template : str (default="mrbayes.nex")
        The name of the template file. This file is located in the template/
        folder of the LingPy package, but a custom file can be specified by
        providing the path.
    filename : str (default="mrbayes.nex")
        The name of the file to which the nexus output is written.
    dtype : str (default="RESTRICTION")
        The datatype, which is usually "STANDARD" or "RESTRICTION" in
        linguistic analyses, with "RESTRICTION" pointing to pure birth-death
        models.
    symbols : str (default="10")
        The symbols used for the characters.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    None
        The rendered nexus text is written to `filename`.

    """
    tpath = util.Path(template_path(template))
    # `exists` has to be called; the bare bound method is always truthy, which
    # made the custom-template fallback unreachable.
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:
        # Not a bundled template: interpret `template` as the path to a custom
        # template file. The result must be kept, otherwise `_template` is
        # undefined when the text is rendered below.
        _template = util.read_text_file(template)

    _commands = 'BEGIN {0};\n{1}\n\n'.format(
        commands_name, '\n'.join(commands)) if commands else ''
    _custom = 'BEGIN {0};\n{1}\n\n'.format(
        custom_name, '\n'.join(custom)) if custom else ''

    # Pad every taxon label to the width of the longest one plus one blank so
    # that the state columns line up.
    mtl = max([len(t) for t in taxa]) + 1
    rows = []
    for t, m in zip(taxa, matrix):
        # Stringify first: integer cells are allowed and have no len().
        states = [str(c) for c in m]
        rows.append(
            t.ljust(mtl) + ' ' + ''.join(
                '({0})'.format(s) if len(s) > 1 else s for s in states) + '\n')
    _matrix = ''.join(rows)

    text = _template.format(matrix=_matrix,
                            ntax=len(taxa),
                            nchar=len(matrix[0]),
                            gap=gap,
                            missing=missing,
                            dtype=dtype,
                            commands=_commands,
                            custom=_custom,
                            symbols=symbols)
    util.write_text_file(filename, text)
Example #3
0
def write_nexus(wordlist,
                mode='mrbayes',
                filename="mrbayes.nex",
                ref="cogid",
                missing="?",
                gap="-",
                custom=None,
                custom_name='lingpy',
                commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style (case-insensitive). Valid values
        are:
            * 'MRBAYES': a MrBayes formatted nexus file.
            * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
            * 'BEAST': a BEAST formatted nexus file.
            * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
               analyses.
            * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to ``None`` (or any other false value), this function will not
        write the nexus content to a file, but simply return the content as a
        string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output. It is returned regardless of
        whether it was also written to `filename`.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported output styles.
    IOError
        If the template file belonging to `mode` does not exist.
    KeyError
        If `ref` is not a known column of the wordlist.
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }

    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates:
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates[mode]
    tpath = util.Path(template_path(template))
    # `exists` has to be called; the bare bound method is always truthy, so
    # the error branch could never be taken.
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix: one (initially empty) row per taxon
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    # (cogid, concept) pairs ordered by concept, then cognate id; the concept
    # is looked up via the first non-empty entry of the cognate set.
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))

    # and missing data: for every taxon, the concepts it has no entry for.
    # The per-taxon list of attested concepts is fetched once (not once per
    # concept) and kept as a set, since it is only used for membership tests.
    missing_ = {}
    for t in wordlist.cols:
        attested = set(
            wordlist.get_list(col=t, entry=wordlist._row_name, flat=True))
        missing_[t] = {
            concept for (cogid, concept) in concepts
            if concept not in attested
        }

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            absent = concept in missing_[t]
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that if
                # a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i].append(missing if absent else '0')
            # '1' if the taxon has a reflex in this cognate set; otherwise
            # `missing` if the taxon lacks the concept entirely, else '0'.
            if etd[cogid][i]:
                matrix[i].append('1')
            elif absent:
                matrix[i].append(missing)
            else:
                matrix[i].append('0')

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = [
            "\tcharset %s = %d-%d;" % (c, min(m), max(m))
            for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)

    # commands
    if commands_name.upper() == blockname and assumptions and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    elif commands:
        # different commands block set in commands_name.
        # NOTE(review): this block is appended to `assumptions`, which is
        # itself wrapped in `block` below, so it ends up nested inside the
        # outer block (named "None" for modes without charsets) -- confirm
        # this is intended for the templates in use.
        assumptions += block.format(commands_name, '\n'.join(commands))

    # convert state matrix to string, padding the taxon labels to the width
    # of the longest one plus one blank so that the state columns line up.
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    rows = []
    for taxon, m in zip(wordlist.cols, matrix):
        label = util.nexus_slug(taxon).ljust(maxtaxlen) + ' '
        states = ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m])
        rows.append(label + states + '\n')
    _matrix = ''.join(rows).rstrip()  # remove trailing

    # TODO: symbols could be more than "01" but this function doesn't handle
    # multistate data so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    # append one bracketed line per matrix column that maps the 1-based
    # position in `concepts` back to its cognate id and concept
    text += ''.join(
        '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
        for i, (cogid, concept) in enumerate(concepts, 1))
    if filename:
        util.write_text_file(filename, text)
    return text