from csvw.metadata import TableGroup from lingpy import util from lingpy.convert.html import template_path # receive the template path from lingpy for splitstree tpath = util.Path(template_path('splitstree.nex')) if tpath.exists: _template = util.read_text_file(tpath.as_posix()) else: # pragma: no cover raise IOError("Unknown template %s" % template) tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json') taxa = {t['ID']: (i, t['Name']) for i, t in enumerate(tbg.tabledict['languages.csv'])} params = {t['ID']: (i, t['Name']) for i, t in enumerate(tbg.tabledict['parameters.csv'])} matrix = [[0 for p in params] for t in taxa] for row in tbg.tabledict['values.csv']: tidx, tname = taxa[row['Language_ID']] pidx, pname = params[row['Parameter_ID']] if row['Value'] == '+': matrix[tidx][pidx] = 1 alpha = 'abcdefghijklmnopqrstuvwxyz' alpha += alpha.upper() alpha += '0123456789' matrix_string = '' tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0]) for i, line in enumerate(matrix): matrix_string += '{0:12}'.format(''.join([x for x in tax_list[i][1] if
def write_nexus(taxa, matrix, custom=None, custom_name='lingpy', missing="?",
                gap="-", template="mrbayes.nex", filename="mrbayes.nex",
                dtype="RESTRICTION", symbols="10", commands=None,
                commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    taxa : list
        The taxonomic units in your data. They should be valid taxon names,
        only consisting of alphanumeric characters and an underscore, usually
        also not exceeding a length of 15 characters.
    matrix : list
        The matrix with the values for each taxon in one separate row.
        Usually, the matrix contains binary values which can be passed as
        strings or integers (1 and 0), but missing values are also possible.
        A state whose textual form is longer than one character is written
        in parentheses.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    template : str (default="mrbayes.nex")
        The name of the template file. The name is first resolved with
        `template_path`; if no file exists at the resolved location, the
        value is used as the path of a custom template file.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus content is written.
    dtype : str (default="RESTRICTION")
        The datatype, which is usually "STANDARD" or "RESTRICTION" in
        linguistic analyses, with "RESTRICTION" pointing to pure birth-death
        models.
    symbols : str (default="10")
        The symbols used for the characters.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords
        commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    """
    # `exists` is a method and has to be called: the bare attribute is always
    # truthy, which would make the custom-template fallback unreachable.
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:
        # Not a packaged template: treat `template` as a user-supplied path.
        _template = util.read_text_file(template)

    # NOTE(review): neither optional block emits a closing "END;" --
    # presumably the template supplies it; confirm against the templates.
    _commands = 'BEGIN {0};\n{1}\n\n'.format(
        commands_name, '\n'.join(commands)) if commands else ''
    _custom = 'BEGIN {0};\n{1}\n\n'.format(
        custom_name, '\n'.join(custom)) if custom else ''

    # Pad every taxon label to the length of the longest name plus one.
    mtl = max(len(t) for t in taxa) + 1
    rows = []
    for taxon, states in zip(taxa, matrix):
        cells = []
        for state in states:
            # Integer states (0/1) have no len(); use their text form so
            # they are accepted alongside string states.
            if not hasattr(state, '__len__'):
                state = str(state)
            cells.append(
                '({0})'.format(state) if len(state) > 1 else str(state))
        rows.append(
            str(taxon + mtl * ' ')[:mtl] + ' ' + ''.join(cells) + '\n')
    _matrix = ''.join(rows)

    text = _template.format(
        matrix=_matrix,
        ntax=len(taxa),
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype=dtype,
        commands=_commands,
        custom=_custom,
        symbols=symbols)
    util.write_text_file(filename, text)
def write_nexus(wordlist, mode='mrbayes', filename="mrbayes.nex", ref="cogid",
                missing="?", gap="-", custom=None, custom_name='lingpy',
                commands=None, commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style (case-insensitive). Valid values
        are:

        * 'MRBAYES': a MrBayes formatted nexus file.
        * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
        * 'BEAST': a BEAST formatted nexus file.
        * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
          analyses.
        * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written. If set to
        `None` (or any other false value), this function will not write the
        nexus content to a file, but simply return the content as a string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keywords
        commands_name. If that name matches the block this function already
        writes for the chosen mode, the commands are merged into that block;
        otherwise they are written as a separate block.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output

    Raises
    ------
    ValueError
        If `mode` is not one of the supported output styles.
    IOError
        If the template file for `mode` cannot be found.
    KeyError
        If `ref` is not a known column of `wordlist`.

    """
    # NOTE(review): TRAITLAB shares the SplitsTree template here -- confirm
    # that this is intended.
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }
    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates:
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template; `exists` must be *called*, the bare method
    # object is always truthy and would hide a missing template.
    template = templates[mode]
    tpath = util.Path(template_path(template))
    if not tpath.exists():  # pragma: no cover
        raise IOError("Unknown template %s" % template)
    _template = util.read_text_file(tpath.as_posix())

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix: one (initially empty) row per taxon
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    # Pair every cognate set with the row value of its first attested entry
    # (presumably the concept of the first word in the set), ordered by
    # concept and then by cognate ID.
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))

    # and missing data: per taxon, the concepts without any entry. The
    # lookup is done once per taxon instead of once per cognate set.
    # (assumes concept values are hashable, e.g. strings)
    missing_ = {}
    for t in wordlist.cols:
        attested = set(wordlist.get_list(
            col=t, entry=wordlist._row_name, flat=True))
        missing_[t] = {
            concept for (cogid, concept) in concepts
            if concept not in attested}

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        absent = missing_[t]
        row = matrix[i]
        previous = ''
        for cogid, concept in concepts:
            # state written when the taxon has no reflex in this cognate set
            fallback = missing if concept in absent else '0'
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that
                # if a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    row.append(fallback)
            row.append('1' if etd[cogid][i] else fallback)

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block (only these two modes get a built-in block)
    blockname, block_lines = None, []
    if mode in ('BEASTWORDS', 'MRBAYES'):
        block_lines = [
            "\tcharset %s = %d-%d;" % (c, min(m), max(m))
            for (c, m) in charsets.items()
        ]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'

    # commands
    extra_block = ''
    if commands:
        if blockname is not None and commands_name.upper() == blockname:
            # merge commands specified in function call into output blockname
            block_lines = block_lines + ["\t%s" % c for c in commands]
        else:
            # different commands block set in commands_name: write it as a
            # sibling block, since nexus blocks must not be nested.
            extra_block = block.format(commands_name, '\n'.join(commands))
    # Only emit the built-in block for modes that define one; otherwise the
    # output would contain a spurious "BEGIN None;" block.
    if blockname is not None:
        commands_text = block.format(blockname, "\n".join(block_lines))
    else:
        commands_text = ''
    commands_text += extra_block

    # convert state matrix to string.
    slugs = [util.nexus_slug(t) for t in wordlist.cols]
    maxtaxlen = max(len(s) for s in slugs) + 1
    rows = []
    for slug, states in zip(slugs, matrix):
        cells = ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in states])
        rows.append(str(slug + maxtaxlen * ' ')[:maxtaxlen] + ' ' + cells)
    _matrix = '\n'.join(rows).rstrip()  # remove trailing whitespace

    # TODO: symbols could be more than "01", but this function doesn't handle
    # multistate data so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=commands_text,
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops

    # append a bracketed comment per site mapping it to cognate ID and concept
    text += ''.join(
        '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
        for i, (cogid, concept) in enumerate(concepts, 1))

    if filename:
        util.write_text_file(filename, text)
    return text