def test_write_text_file(self):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = self.tmp_path('test')

    util.write_text_file(path, 'test')
    self.assertEqual(util.read_text_file(path), 'test')

    util.write_text_file(path, ['line1', 'line2'])
    self.assertEqual(len(util.read_text_file(path, lines=True)), 2)

    util.write_text_file(path, lines_generator(5))
    self.assertEqual(len(util.read_text_file(path, lines=True)), 5)
def test_write_text_file(tmppath):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = tmppath / 'test'

    util.write_text_file(path, 'test')
    assert util.read_text_file(path) == 'test'

    util.write_text_file(path, ['line1', 'line2'])
    assert len(util.read_text_file(path, lines=True)) == 2

    util.write_text_file(path, lines_generator(5))
    assert len(util.read_text_file(path, lines=True)) == 5
def triple2tsv(triples_or_fname, output="table"):
    """
    Function reads a triple file and converts it to a tabular data structure.
    """
    D = defaultdict(dict)
    idxs = set()
    cols = set()

    if not isinstance(triples_or_fname, list):
        triples_or_fname = util.read_text_file(
            triples_or_fname, normalize='NFD', lines=True)

    for line in triples_or_fname:
        if isinstance(line, (text_type, str)):
            line = line.split('\t')
        a, b, c = line
        D[a][b.upper()] = c
        idxs.add(a)
        cols.add(b.upper())

    idxs = sorted(idxs)
    cols = sorted(cols)
    table = [[idx] + [D.get(idx, {}).get(col, '') for col in cols]
             for idx in idxs]

    if output not in ['wordlist', 'dict']:
        return [["ID"] + cols] + table

    wlD = {int(line[0]): line[1:] for line in table}
    wlD[0] = cols
    return wlD
def triple2tsv(triples_or_fname, output="table"):
    """
    Function reads a triple file and converts it to a tabular data structure.
    """
    D = defaultdict(dict)
    idxs = set()
    cols = set()

    if not isinstance(triples_or_fname, list):
        triples_or_fname = util.read_text_file(
            triples_or_fname, normalize='NFD', lines=True)

    for line in triples_or_fname:
        if isinstance(line, str):
            line = line.split('\t')
        a, b, c = line
        D[a][b.upper()] = c
        idxs.add(a)
        cols.add(b.upper())

    idxs = sorted(idxs)
    cols = sorted(cols)
    table = [[idx] + [D.get(idx, {}).get(col, '') for col in cols]
             for idx in idxs]

    if output not in ['wordlist', 'dict']:
        return [["ID"] + cols] + table

    wlD = {int(line[0]): line[1:] for line in table}
    wlD[0] = cols
    return wlD
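# Minimal usage sketch for triple2tsv with in-memory triples (ID, column,
# value); the data below is illustrative. With the default output="table",
# the first row is a header. Requires `from collections import defaultdict`
# in scope, as the function above does.
from collections import defaultdict

triples = [
    ('1', 'doculect', 'German'),
    ('1', 'concept', 'hand'),
    ('2', 'doculect', 'English'),
    ('2', 'concept', 'hand'),
]
table = triple2tsv(triples)
# table == [['ID', 'CONCEPT', 'DOCULECT'],
#           ['1', 'hand', 'German'],
#           ['2', 'hand', 'English']]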
def read_dst(filename, taxlen=10, comment='#'):
    """
    Function reads files in Phylip dst-format.

    Parameters
    ----------
    filename : string
        Name of the file which should have the extension ``dst``.
    taxlen : int (default=10)
        Indicate how long the taxon names are allowed to be in the file from
        which you want to read. The Phylip package only allows taxon names
        consisting of maximally 10 characters (this is the default). Other
        packages, however, allow more. If Phylip compatibility is not
        important for you and you just want to allow for as long taxon names
        as possible, set this value to 0 and make sure to use tabstops as
        separators between values in your matrix file.
    comment : str (default='#')
        The comment character to be used if your file contains additional
        information which should be ignored.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.
    """
    if '\n' in filename:
        lines = [f for f in filename.split('\n') if f.strip()]
    else:
        lines = read_text_file(filename, normalize="NFC", lines=True)

    taxa, matrix = [], []
    for line in lines[1:]:
        if not line.startswith(comment):
            if taxlen > 0:
                taxa.append(line[:taxlen].strip())
                matrix.append([
                    float(val) for val in
                    re.split(r'\s+', line[taxlen + 1:].strip())
                ])
            else:
                splits = line.split('\t')
                taxa.append(splits[0])
                matrix.append([float(val.strip()) for val in splits[1:]])
    return taxa, matrix
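# Minimal sketch: read_dst also accepts the file content itself (any string
# containing a newline), so a small Phylip-style matrix can be parsed inline.
# The distances below are illustrative; `import re` is needed by read_dst.
import re

dst_data = """ 3
German        0.0 0.6 0.9
English       0.6 0.0 0.9
Russian       0.9 0.9 0.0"""
taxa, matrix = read_dst(dst_data)
# taxa == ['German', 'English', 'Russian']; matrix[0] == [0.0, 0.6, 0.9]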
def read_msa(infile, comment="#", ids=False, header=True, normalize=True,
             **keywords):
    """
    Simple function to load an MSA object.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will
        be ignored.
    ids : bool (default=False)
        Indicate whether the MSA file contains unique IDs for all sequences
        or not.

    Returns
    -------
    d : dict
        A dictionary in which keys correspond to specific parts of a multiple
        alignment. This dictionary can be directly passed to alignment
        functions, such as :py:class:`lingpy.sca.MSA`.
    """
    if 'input_file' not in keywords:
        keywords['input_file'] = infile

    f = read_text_file(infile, normalize='NFC', lines=True)
    msa_lines = []
    for line in f:
        if line.strip() and not line.startswith(comment):
            newlines = [t.strip().rstrip('.') for t in line.split('\t')]
            if len(newlines) == 1:
                msa_lines += newlines
            else:
                msa_lines += [newlines]
    return _list2msa(msa_lines, header=header, ids=ids, normalize=normalize,
                     **keywords)
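# Usage sketch (the file name is hypothetical): an msa file lists a dataset
# name, a sequence id, and one tab-separated taxon/alignment row per
# sequence; the returned dict feeds directly into lingpy's alignment classes.
# msa = read_msa('harry.msa')
# msa['taxa'], msa['seqs'], msa['alignment']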
def __init__(self, model, path=None):
    new_path = lambda *cmps: \
        os.path.join(path or util.data_path('models'), model, *cmps)

    self.name = model

    # try to load the converter
    try:
        self.converter = cache.load(model + '.converter')
    except:
        compile_model(model, path)
        self.converter = cache.load(model + '.converter')

    # always give preference to scorer matrix files
    if os.path.isfile(new_path('matrix')):
        self.scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer.bin')):
        try:
            self.scorer = cache.load(model + '.scorer')
        except compat.FileNotFoundError:
            pass
    # if none of the above fits, leave it
    else:
        pass

    # read information from the info-file
    self.info = {}
    info = util.read_text_file(new_path('INFO'))
    data = ['description', 'compiler', 'source', 'date', 'vowels', 'tones']
    for line in data:
        try:
            self.info[line] = re.findall('@' + line + ': (.*)', info)[0]
        except:
            self.info[line] = 'unknown'

    # check for vowels and tones
    if "vowels" in self.info:
        self.vowels = self.info['vowels']
    if "tones" in self.info:
        self.tones = self.info['tones']
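# Usage sketch for this constructor: the sound-class models bundled with
# LingPy (e.g. 'sca', 'dolgo', 'asjp') are loaded by name, and the attributes
# below follow the assignments above.
# model = Model('dolgo')
# model.name                    # 'dolgo'
# model.info.get('compiler')    # from the model's INFO file
# model.scorer                  # from 'matrix' or the cached scorer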
def parser(filename):
    text = read_text_file(filename, normalize='NFD', lines=True)
    comment = '#'
    data = {0: [
        'poem', 'poem_number', 'stanza', 'line_in_source', 'line',
        'line_order', 'rhymeids', 'alignment', 'refrain', 'chords'
    ]}
    meta, M = {}, {}
    number, stanza, idx, order = 0, 0, 1, 1
    atzone = False
    # defensive initialization: the original only set `rhymes` on the blank
    # line after a metadata zone, which can leave it unbound for the first
    # stanza
    rhymes = {0: 0}
    for line in text:
        if line.startswith('@'):
            if not atzone:
                meta = {}
                atzone = True
            meta[line[1:line.index(':')]] = line[line.index(':') + 1:].strip()
            stanza = 0
        elif not line.strip():
            stanza += 1
            order = 1
            if atzone:
                number += 1
                atzone = False
                M[meta.get('title', 'poem-{0}'.format(number))] = {
                    k: v for k, v in meta.items()}
                rhymes = {0: 0}
        elif line.startswith('[') and line.endswith(']'):
            pass
        else:
            if comment in line:
                # keep only the text before the comment character (the
                # original sliced from the comment onward, which would have
                # discarded the line's content)
                line = line[:line.index(comment)]
            refrain = ''
            if line.startswith(' '):
                refrain = 'R'
            if [x for x in line if is_chinese(x)]:
                nline, bracket = [], 0
                for char in line:
                    if is_chinese(char):
                        if bracket:
                            bracket -= 1
                            nline[-1] += char
                        else:
                            nline += [char]
                    else:
                        if char == '[':
                            bracket += 1
                            nline += ['']
                        nline[-1] += char
            else:
                nline = line.strip().split()
            rhymeids, alignment, nline, chords = parse_line(nline, rhymes)
            data[idx] = [
                meta.get('title', 'poem-{0}'.format(number)),
                str(number),
                '{0}.{1}'.format(number, stanza),
                line,
                ' + '.join(nline),
                order,
                rhymeids,
                ' + '.join(alignment),
                refrain,
                chords
            ]
            idx += 1
            order += 1
    poe = Poems(data)
    poe._meta['poems'] = M
    return poe
def test_TextFile(self):
    path = self.tmp_path('test')
    with util.TextFile(path) as fp:
        fp.writelines(['line1\n', 'line2\n'])
    self.assertEqual(len(util.read_text_file(path, lines=True)), 2)
def csv2list(filename, fileformat='', dtype=None, comment='#', sep='\t',
             strip_lines=True, header=False):
    r"""
    Very simple function to get quick (and somewhat naive) access to
    CSV-files.

    Parameters
    ----------
    filename : str
        Name of the input file.
    fileformat : {None str}
        If not specified the file <filename> will be loaded. Otherwise, the
        fileformat is interpreted as the specific extension of the input
        file.
    dtype : {list}
        If not specified, all data will be loaded as strings. Otherwise, a
        list specifying the data for each line should be provided.
    comment : string (default="#")
        Comment character in the begin of a line forces this line to be
        ignored (set to None if you want to parse all lines of your file).
    sep : string (default="\t")
        Specify the separator for the CSV-file.
    strip_lines : bool (default=True)
        Specify whether empty "cells" in the input file should be preserved.
        If set to c{False}, each line will be stripped first, and all
        whitespace will be cleaned. Otherwise, each line will be separated
        using the specified separator, and no stripping of whitespace will be
        carried out.
    header : bool (default=False)
        Indicate whether the data comes along with a header.

    Returns
    -------
    l : list
        A list-representation of the CSV file.
    """
    # check for correct fileformat
    if fileformat:
        infile = filename + '.' + fileformat
    else:
        infile = filename

    if dtype is None:
        dtype = []

    l = []

    # open the file
    infile = read_text_file(infile, lines=True, normalize="NFC")

    # check for header
    idx = 0 if header else -1

    for i, line in enumerate(infile):
        if line and (not comment or not line.startswith(comment)) and idx != i:
            if strip_lines:
                cells = [c.strip() for c in line.strip().split(sep)]
            else:
                cells = [c.strip() for c in line.split(sep)]
            if not dtype:
                l += [cells]
            else:
                l += [[f(c) for f, c in zip(dtype, cells)]]
    return l
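# Usage sketch (the file name 'data.tsv' is hypothetical):
# csv2list('data', fileformat='tsv')             # all cells as strings
# csv2list('data.tsv', dtype=[str, int, float])  # per-column conversion
# csv2list('data.tsv', header=True)              # skip the first line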
def _export(self, fileformat, sections=None, entries=None, entry_sep='',
            item_sep='', template='', exclude=None, entry_start='',
            entry_close='', **keywords):
    """
    Export a wordlist to various file formats.
    """
    if not sections:
        if fileformat == 'txt':
            sections = dict(
                h1=('concept', '\n# Concept: {0}\n'),
                h2=('cogid', '## Cognate-ID: {0}\n'))
        elif fileformat == 'tex':
            sections = dict(
                h1=('concept', r'\section{{Concept: ``{0}"}}' + '\n'),
                h2=('cogid', r'\subsection{{Cognate Set: ``{0}"}}' + '\n'))
        elif fileformat == 'html':
            sections = dict(
                h1=('concept', '<h1>Concept: {0}</h1>'),
                h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

    if not entries:
        if fileformat == 'txt':
            entries = [('language', '{0} '), ('ipa', '{0}\n')]
        elif fileformat == 'tex':
            entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
        elif fileformat == 'html':
            entries = [('language', '{0} '), ('ipa', '[{0}]\n')]

    util.setdefaults(keywords, filename=rcParams['filename'])

    # get the temporary dictionary
    out = wl2dict(self, sections, entries, exclude)

    # assign the output string
    out_string = ''

    # iterate over the dictionary and start to fill the string
    for key in sorted(out, key=lambda x: str(x).lower()):
        # write key to file
        out_string += key[1]

        # reassign tmp
        tmp = out[key]

        # set the pointer and the index
        pointer = {0: [tmp, sorted(tmp.keys())]}

        while True:
            idx = max(pointer.keys())

            # check for type of current point
            if isinstance(tmp, dict):
                if pointer[idx][1]:
                    next_key = pointer[idx][1].pop()
                    out_string += next_key[1]
                    tmp = pointer[idx][0][next_key]
                    if isinstance(tmp, dict):
                        pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                    else:
                        pointer[idx + 1] = [tmp, tmp]
                else:
                    del pointer[idx]
                    if idx == 0:
                        break
            else:
                tmp_strings = []
                for line in sorted(tmp):
                    tmp_strings += [item_sep.join(line)]
                out_string += entry_start + entry_sep.join(tmp_strings) + entry_close

                tmp = pointer[idx - 1][0]
                del pointer[idx]

    if fileformat == 'tex':
        out_string = out_string.replace('_', r'\_')

    tmpl = util.read_text_file(template) if template else '{0}'
    _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
def cognate_detection(self, **keywords):
    """
    Method runs a cognate detection analysis.
    """
    kw = dict(
        align_method='progressive',
        align_mode=rcParams['align_mode'],
        align_modes=rcParams['align_modes'],
        cluster_method=rcParams['lexstat_cluster_method'],
        cognate_method='sca',
        cognate_mode='overlap',
        defaults=False,
        factor=rcParams['align_factor'],
        gap_weight=rcParams['gap_weight'],
        gop=rcParams['align_gop'],
        iteration=False,
        lexstat_modes=rcParams['lexstat_modes'],
        limit=rcParams['lexstat_limit'],
        merge_vowels=rcParams['merge_vowels'],
        model=rcParams['sca'],
        export="html",
        preprocessing=False,
        preprocessing_method=rcParams['lexstat_preprocessing_method'],
        preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'],
        rands=rcParams['lexstat_rands'],
        ratio=rcParams['lexstat_ratio'],
        ref="customid",
        restricted_chars=rcParams['restricted_chars'],
        restriction='',
        runs=rcParams['lexstat_runs'],
        scale=rcParams['align_scale'],
        scoring_method=rcParams['lexstat_scoring_method'],
        swap_check=False,
        threshold=rcParams['lexstat_threshold'],
        tree_calc=rcParams['align_tree_calc'],
        vscale=rcParams['lexstat_vscale'],
        outfile=False,
        sonar=True,
    )

    # first load
    kw.update(keywords)
    if kw['defaults']:
        return kw

    # carry out lexstat cluster analysis
    self.lex = LexStat(self.infile, **kw)

    # reset filename if it is not defined
    kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

    # check for traditional lexstat analysis
    if kw['cognate_method'] == 'lexstat':
        self.lex.get_scorer(
            method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw)

    self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw)

    # align the data
    self.alms = Alignments(self.lex, **kw)
    kw['scoredict'] = self.lex.cscorer \
        if kw['cognate_method'] == 'lexstat' else self.lex.bscorer
    self.alms.align(
        method=kw['align_method'], mode=kw['align_mode'],
        modes=kw['align_modes'], **kw)

    if 'tsv' in kw['export']:
        self.alms.output(
            'tsv', filename=kw['outfile'],
            ignore=['scorer', 'json', 'taxa', 'msa'], **kw)

    if 'html' in kw['export']:
        corrs, occs = get_correspondences(self.alms, kw['ref'])

        # serialize the wordlist
        wl = {}
        for concept in self.alms.concepts:
            entries = self.alms.get_list(concept=concept, flat=True)
            cogids = [self.alms[idx, kw['ref']] for idx in entries]
            words = [self.alms[idx, 'ipa'] for idx in entries]
            alms = [self.alms[idx, 'alignment'] for idx in entries]
            langs = [self.alms[idx, 'doculect'] for idx in entries]

            checkalm = lambda x: x if isinstance(x, str) else ' '.join(x)

            wl[concept] = [list(k) for k in sorted(
                zip(
                    langs,
                    [str(x) for x in entries],
                    words,
                    [str(x) for x in cogids],
                    [checkalm(x) for x in alms],
                ),
                key=lambda x: int(x[3]))]

        # make simple gloss id for internal use as id
        gloss2id = list(zip(
            self.alms.concepts,
            [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
        id2gloss = dict([[b, a] for a, b in gloss2id])
        gloss2id = dict(gloss2id)

        txt = ''
        txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
        txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
        txt += 'OCCS = ' + json.dumps(occs) + ';\n'
        txt += 'WLS = ' + json.dumps(wl) + ';\n'
        txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
        txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
        txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

        tpath = partial(util.data_path, 'templates')
        tname = 'jcov.{0}.html'.format(
            'remote' if 'remote' in kw['export'] else 'direct')
        content = util.read_text_file(tpath(tname))

        util.write_text_file(
            kw['outfile'] + '.html',
            content.format(
                CORRS=txt,
                JCOV=util.read_text_file(tpath('jcov.js')),
                STYLE=util.read_text_file(tpath('jcov.css')),
                VENDOR=util.read_text_file(tpath('jcov.vendor.js')),
                DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily
    # task

    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + \
        r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
def _read(filename, normalize=None):
    res = {}
    for line in util.read_text_file(filename, normalize=normalize, lines=True):
        k, v = line.split(' : ')
        res[k] = v.split(', ')
    return res
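# _read expects one "key : value, value, ..." mapping per line; a file with
# the two lines
#
#   a : b, c
#   d : e
#
# yields {'a': ['b', 'c'], 'd': ['e']}.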
def write_nexus(wordlist, mode='mrbayes', filename="mrbayes.nex", ref="cogid",
                missing="?", gap="-", custom=None, custom_name='lingpy',
                commands=None, commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:

        * 'MRBAYES': a MrBayes formatted nexus file.
        * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
        * 'BEAST': a BEAST formatted nexus file.
        * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
          analyses.
        * 'TRAITLAB': a TRAITLab formatted nexus.
    filename : str (default=None)
        Name of the file to which the nexus file will be written. If set to
        c{None}, then this function will not write the nexus content to a
        file, but simply return the content as a string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword
        commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }

    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates.keys():
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))

    # and missing data..
    missing_ = {
        t: [concept for (cogid, concept) in concepts
            if concept not in wordlist.get_list(
                col=t, entry=wordlist._row_name, flat=True)]
        for t in wordlist.cols
    }

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that
                # if a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] \
                        else [missing]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] \
                if concept not in missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = ["\tcharset %s = %d-%d;" % (c, min(m), max(m))
                    for (c, m) in charsets.items()]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)

    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in function call into output blockname
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # different commands block set in commands_name.
        assumptions += block.format(commands_name, '\n'.join(commands)) \
            if commands else ''

    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)):
        _matrix += str(util.nexus_slug(taxon) + maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'
    _matrix = _matrix.rstrip()  # remove trailing whitespace

    # TODO: symbols could be more than "01" but this function doesn't handle
    # multistate data so we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
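# Usage sketch (the file name is hypothetical), assuming a wordlist with a
# "cogid" column:
# from lingpy import Wordlist
# wl = Wordlist('wordlist.tsv')
# nex = write_nexus(wl, mode='SPLITSTREE', filename='analysis.nex')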
def test_scorer2str(test_data):
    assert scorer2str(rc('dolgo').scorer) == \
        read_text_file(str(test_data / 'dolgo.scorer'))
def test_TextFile(tmppath):
    path = tmppath / 'test'
    with util.TextFile(path) as fp:
        fp.writelines(['line1\n', 'line2\n'])
    assert len(util.read_text_file(path, lines=True)) == 2
def test_scorer2str(self):
    """
    Test conversion of scorers to strings.
    """
    self.assertEqual(
        scorer2str(lingpy.rc('dolgo').scorer),
        read_text_file(test_data('dolgo.scorer')))
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA
        object.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined, the name
        of the input file will be taken as a default.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa', filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html', filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily
    # task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for local peaks
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp'])
                   for seq in msa['seqs']]
        seqs = dict([(a[1], b) for a, b in zip(
            sorted(zip(classes, msa['seqs']), key=lambda x: x[0]),
            range(1, len(msa['seqs']) + 1))])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will
        be ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the
        lines of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! "
                            "Use JSON-format for these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1] for k, v in
                        [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []
            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]
            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}
                if not keys:
                    keys["id"] = "1"
                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))
                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa
            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer
            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but
    # enough for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception(
            "Error processing line {0}:\n".format(j) + str(data[1:][j]) +
            '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
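# Sketch of the qlc layout read_qlc parses: '@key:value' lines carry
# metadata, '<tag ...>' ... '</tag>' spans carry typed blocks (json, tre/nwk,
# csv, msa, dst, scorer, taxa), and the remaining tab-separated lines carry
# the data, starting with the header row:
#
#   @author: ...
#   <msa id="1" ref="cogid">
#   ...
#   </msa>
#   ID    DOCULECT    CONCEPT    IPA     COGID
#   1     German      hand       hant    1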
def alm2html(infile, title='', shorttitle='', filename='', colored=False,
             main_template='', table_template='', dataset='',
             confidence=False, **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------
    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex
    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''

    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {a: b for a, b in zip(
                sorted(set([int(l[0]) for l in m])),
                colorRange(dc, brightness=400),
            )}
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)

        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = ' <td>{0}</td>\n'.format(l[0])
            tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join(
                [cell.split('/')[0] for cell in l[4:]]).replace('-', '')

            tmp += ' <td>{0}</td>\n'.format(ipa_string)
            tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:
                alm = ''
                for char in l[4:]:
                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError(
                                "Something is wrong with %s." % (char))
                    else:
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += ' '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n {0}\n </td>\n'.format(char)
                    else:
                        alm += ' '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                alm = ' '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError(
                    "Unknown problem in matching %s and %s." % (alm, tmp))

            # check for last line, where a new line should be inserted (not
            # the fastest solution, but plotting is not a matter of time, and
            # it suffices for its current purpose)
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'
                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(
            keywords['json'], indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords)
    util.write_text_file(filename + '.html', html)
    return
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4] + '.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append((
                '.'.join([k for k in almA if k != '-']),
                '.'.join([k for k in almB if k != '-'])))
            alignments.append((
                [str(a) for a in almA],
                [str(b) for b in almB],
                0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

            # bad check for three classes named differently
            if d == 'dolgo__':
                d = 'dolgo_X'
            elif d == 'dolgo_1':
                d = 'dolgo_TONE'
            elif d == 'dolgo_0':
                d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'
    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
from csvw.metadata import TableGroup

from lingpy import util
from lingpy.convert.html import template_path

# retrieve the splitstree template that ships with lingpy
tpath = util.Path(template_path('splitstree.nex'))
if tpath.exists():
    _template = util.read_text_file(tpath.as_posix())
else:  # pragma: no cover
    raise IOError("Unknown template %s" % tpath)

tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json')
taxa = {t['ID']: (i, t['Name'])
        for i, t in enumerate(tbg.tabledict['languages.csv'])}
params = {t['ID']: (i, t['Name'])
          for i, t in enumerate(tbg.tabledict['parameters.csv'])}

matrix = [[0 for p in params] for t in taxa]
for row in tbg.tabledict['values.csv']:
    tidx, tname = taxa[row['Language_ID']]
    pidx, pname = params[row['Parameter_ID']]
    if row['Value'] == '+':
        matrix[tidx][pidx] = 1

alpha = 'abcdefghijklmnopqrstuvwxyz'
alpha += alpha.upper()
alpha += '0123456789'

matrix_string = ''
tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
for i, line in enumerate(matrix):
    # the original snippet breaks off inside this statement; the tail below
    # is a minimal reconstruction that keeps only the alphanumeric characters
    # of the taxon name and appends the binary row
    matrix_string += '{0:12}'.format(
        ''.join([x for x in tax_list[i][1] if x in alpha]))
    matrix_string += ''.join([str(cell) for cell in line]) + '\n'
def _read_string(name):
    # normalize stuff
    # TODO: this is potentially dangerous and it is important to decide
    # TODO: whether switching to NFD might not be a better choice
    return util.read_text_file(
        os.path.join(file_path, name), normalize='NFC').replace('\n', '')
def output(self, dtype, filename=None, labels=None):
    """
    Parameters
    ----------
    dtype : str {"json", "html", "nwk"}
        Specify the type of the output:

        * *json*: JSON format, suitable for use in d3.
        * *nwk*: Newick format (identical with input upon initialization).
        * *html*: Simple interactive HTML-representation with collapsible
          nodes.
    """
    if dtype == "json":
        if filename:
            with open(filename + "." + dtype, "w") as f:
                f.write(json.dumps(self._dict, indent=2))
        else:
            return json.dumps(self._dict, indent=2)
    elif dtype == "html":
        # make simple label function
        get_label = lambda x: labels[x] if labels else x

        start = '<div id="root" class="node-container">root.content</div>'

        clean_label = lambda x: "".join(
            [y for y in sort_tree(x) if y not in "();"]).replace(",", "_")

        template = (
            '<div class="node-container"><div id="#node_name:label" '
            'class="node-label">#node_label</div><div class="node-content">'
            '#node_children:{node}</div></div>')
        leave = (
            '<div id="#node_leave:label" class="node-leave"><div '
            'class="inner_leave">#node_leave</div></div>')

        txt = (
            template.format(node=self.root)
            .replace("#node_label", get_label(self[self.root]["label"]))
            .replace("#node_name", clean_label(self.root))
        )

        # transform function helps to make the transformation with check
        # for leave or child
        transform = (
            lambda x: template.format(node=x)
            .replace("#node_label", get_label(self[x]["label"]))
            .replace("#node_name", clean_label(x))
            if not self[x]["leave"]
            else leave.replace("#node_leave", get_label(x))
        )

        for i, node in enumerate(self.nodes):
            # write all children
            children = self[node]["children"]
            node_children = "\n".join([transform(child) for child in children])
            txt = txt.replace("#node_children:" + node, node_children)

        # get the templates
        html = util.read_text_file("lexical_change.html")
        css = util.read_text_file("lexical_change.css")
        js = util.read_text_file("lexical_change.js")
        title = "LingPy Tree Class"
        html = html.format(STYLE=css, SCRIPT=js, TITLE=title, TREE=txt)

        filename = filename or "lingpy.basic.newick"
        util.write_text_file(filename + ".html", html)
def write_nexus(taxa, matrix, custom=None, custom_name='lingpy',
                missing="?", gap="-", template="mrbayes.nex",
                filename="mrbayes.nex", dtype="RESTRICTION",
                symbols="10", commands=None, commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    taxa : list
        The taxonomic units in your data. They should be valid taxon names,
        only consisting of alphanumeric characters and an underscore, usually
        also not exceeding a length of 15 characters.
    matrix : list
        The matrix with the values for each taxon in one separate row. Usually,
        the matrix contains binary values which can be passed as strings or
        integers (1 and 0), but missing values are also possible. Given common
        restrictions in biological applications, each character can only be
        one ASCII symbol.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    missing : str (default="?")
        The symbol for missing characters.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    template : str (default="mrbayes.nex")
        The name of the template file. This file is located in the template/
        folder of the LingPy package, but a custom file can be specified by
        providing the path.
    dtype : str (default="RESTRICTION")
        The datatype, which is usually "STANDARD" or "RESTRICTION" in
        linguistic analyses, with "RESTRICTION" pointing to pure birth-death
        models.
    symbols : str (default="10")
        The symbols used for the characters.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.
    """
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:
        # not a bundled template, so interpret the argument as a path
        _template = util.read_text_file(template)

    _commands = 'BEGIN {0};\n{1}\n\n'.format(
        commands_name, '\n'.join(commands)) if commands else ''
    _custom = 'BEGIN {0};\n{1}\n\n'.format(
        custom_name, '\n'.join(custom)) if custom else ''

    _matrix = ""
    mtl = max([len(t) for t in taxa]) + 1
    for i, (t, m) in enumerate(zip(taxa, matrix)):
        _matrix += str(t + mtl * ' ')[:mtl] + ' '
        _matrix += ''.join(
            ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n'

    text = _template.format(
        matrix=_matrix,
        ntax=len(taxa),
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype=dtype,
        commands=_commands,
        custom=_custom,
        symbols=symbols)
    util.write_text_file(filename, text)
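A short call sketch with invented data: three taxa and four binary characters; the MrBayes command passed via commands is illustrative only.

taxa = ['German', 'English', 'Dutch']
matrix = [
    ['1', '0', '1', '1'],
    ['1', '1', '0', '1'],
    ['1', '0', '1', '0'],
]
write_nexus(taxa, matrix, filename='example.nex',
            commands=['lset coding=noabsencesites;'])  # illustrative command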
def msa2html(
        msa,
        shorttitle='',
        filename='',
        template='',
        **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA
        object.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the name of
        the input file will be taken as a default.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load the templates; while alm-format can be read from a text file
    # without problems, msa-format should be loaded first -- since output of
    # data is not a daily task, the loss in speed won't matter much
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file name and try to load the file if this is
    # the case; otherwise it is expected to be an MSA dictionary already
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    elif not isinstance(msa, dict):
        raise ValueError('[!] msa must be an MSA dictionary or a file name.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for locally aligned parts
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp'])
                   for seq in msa['seqs']]
        seqs = dict(
            [(a[1], b) for a, b in zip(
                sorted(zip(classes, msa['seqs']), key=lambda x: x[0]),
                range(1, len(msa['seqs']) + 1)
            )]
        )
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])

                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)
        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
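A direct-call sketch complementing the docstring examples: msa2html also accepts the path of an ``msa``-file, which it then loads via read_msa; the file name is hypothetical and the import path is assumed.

from lingpy.convert.html import msa2html  # assumed module path
msa2html('harry.msa', shorttitle='SCA', filename='harry', compact=True)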
def alm2html( infile, title='', shorttitle='', filename='', colored=False, main_template='', table_template='', dataset='', confidence=False, **keywords ): """ Convert files in ``alm``-format into colored ``html``-format. Parameters ---------- title : str Define the title of the output file. If no title is provided, the default title ``LexStat - Automatic Cognate Judgments`` will be used. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``LexStat`` will be used. Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.msa2html lingpy.convert.html.msa2tex """ util.setdefaults(keywords, json="", labels={}) # open the infile if not os.path.exists(infile): infile = infile + '.alm' data = util.read_text_file(infile) # create the outfile if not filename: filename = rcParams['filename'] # read in the templates html = util.read_text_file(main_template or template_path('alm2html.html')) if not table_template: table_template = template_path( 'alm2html.table.js.html' if confidence else 'alm2html.table.html') table = util.read_text_file(table_template) css = util.read_text_file(template_path('alm.css')) js = util.read_text_file(template_path('alm.js')) # define a label function for the taxa label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x # check for windows-compatibility data = data.replace(os.linesep, '\n')[:-1] # split the data into blocks blocks = data.split('\n\n') # retrieve the dataset dataset = dataset or blocks[0] # create the outstring tmp_str = '' for block in blocks[1:]: lines = block.split('\n') m = [l.split('\t') for l in lines] # create colordict for different colors dc = len(set([l[0] for l in m])) if colored: colors = {a: b for a, b in zip( sorted(set([int(l[0]) for l in m])), colorRange(dc, brightness=400), )} else: colors = [] white = True for i in sorted(set([abs(int(l[0])) for l in m])): if white: colors.append((i, 'white')) white = False else: colors.append((i, 'gray')) white = True colors = dict(colors) # get the basic item and its id iName = m[0][2] iID = m[0][3] # start writing the stuff to string tmp_str += table.format(NAME=iName, ID=iID) # define the basic string for the insertion bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}' for tracer, l in enumerate(m): # check whether the current line is a borrowing if int(l[0]) < 0: loan_line = ' loan' else: loan_line = '' # assign the cognate id tmp = ' <td>{0}</td>\n'.format(l[0]) tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.'))) # check alignments for confidence scores ipa_string = ''.join([cell.split('/')[0] for cell in l[4:]]).replace('-', '') tmp += ' <td>{0}</td>\n'.format(ipa_string) tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n' # check whether another entry follows that is also an alignment, # otherwise, there's no need to display a word as an alignment cognate_set = False if tracer < len(m) - 1: if abs(int(m[tracer + 1][0])) == abs(int(l[0])): cognate_set = True if tracer > 0: if abs(int(m[tracer - 1][0])) == abs(int(l[0])): cognate_set = True # fill out html for the cognate sets if cognate_set: alm = '' for char in l[4:]: # check for confidence scores if '/' in char: try: char, conf, num = char.split('/') conf 
= int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError(
                                "Something is wrong with %s." % (char))
                    else:
                        # assumed defaults when no confidence score is
                        # attached to the segment
                        char, conf, num = char, 0.0, 0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += '        '
                        alm += '<td class="char {1}" confidence={0} '.format(
                            conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n          {0}\n        </td>\n'.format(char)
                    else:
                        alm += '        '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                alm = '      '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError(
                    "Unknown problem in matching %s and %s." % (alm, tmp))

            # insert a separator after the last line of each cognate set (not
            # the fastest solution, but plotting is not time-critical, and it
            # suffices for its current purpose)
            if tracer == len(m) - 1:
                if confidence:
                    tmp += '      </table>\n'
                tmp += '    <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(
                colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'],
                                                        indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords)
    util.write_text_file(filename + '.html', html)
    return
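A usage sketch for alm2html: 'kessler' is a hypothetical file stem, and the function itself appends '.alm' when the bare path does not exist.

alm2html('kessler', title='Automatic Cognate Judgments',
         shorttitle='LexStat', filename='kessler', dataset='Kessler')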
def cognate_detection(self, **keywords):
    """
    Method runs a cognate detection analysis.
    """
    kw = dict(
        align_method='progressive',
        align_mode=rcParams['align_mode'],
        align_modes=rcParams['align_modes'],
        cluster_method=rcParams['lexstat_cluster_method'],
        cognate_method='sca',
        cognate_mode='overlap',
        defaults=False,
        factor=rcParams['align_factor'],
        gap_weight=rcParams['gap_weight'],
        gop=rcParams['align_gop'],
        iteration=False,
        lexstat_modes=rcParams['lexstat_modes'],
        limit=rcParams['lexstat_limit'],
        merge_vowels=rcParams['merge_vowels'],
        model=rcParams['sca'],
        export="html",
        preprocessing=False,
        preprocessing_method=rcParams['lexstat_preprocessing_method'],
        preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'],
        rands=rcParams['lexstat_rands'],
        ratio=rcParams['lexstat_ratio'],
        ref="customid",
        restricted_chars=rcParams['restricted_chars'],
        restriction='',
        runs=rcParams['lexstat_runs'],
        scale=rcParams['align_scale'],
        scoring_method=rcParams['lexstat_scoring_method'],
        swap_check=False,
        threshold=rcParams['lexstat_threshold'],
        tree_calc=rcParams['align_tree_calc'],
        vscale=rcParams['lexstat_vscale'],
        outfile=False,
        sonar=True,
    )

    # update the defaults with user-specified keywords
    kw.update(keywords)
    if kw['defaults']:
        return kw

    # carry out lexstat cluster analysis
    self.lex = LexStat(self.infile, **kw)

    # reset filename if it is not defined
    kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy'

    # check for traditional lexstat analysis
    if kw['cognate_method'] == 'lexstat':
        self.lex.get_scorer(
            method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw)

    self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw)

    # align the data
    self.alms = Alignments(self.lex, **kw)
    kw['scoredict'] = self.lex.cscorer \
        if kw['cognate_method'] == 'lexstat' else self.lex.bscorer
    self.alms.align(
        method=kw['align_method'],
        mode=kw['align_mode'],
        modes=kw['align_modes'],
        **kw)

    if 'tsv' in kw['export']:
        self.alms.output(
            'tsv',
            filename=kw['outfile'],
            ignore=['scorer', 'json', 'taxa', 'msa'],
            **kw)
    if 'html' in kw['export']:
        corrs, occs = get_correspondences(self.alms, kw['ref'])

        # serialize the wordlist
        wl = {}
        for concept in self.alms.concepts:
            entries = self.alms.get_list(concept=concept, flat=True)
            cogids = [self.alms[idx, kw['ref']] for idx in entries]
            words = [self.alms[idx, 'ipa'] for idx in entries]
            alms = [self.alms[idx, 'alignment'] for idx in entries]
            langs = [self.alms[idx, 'doculect'] for idx in entries]

            checkalm = lambda x: x if isinstance(x, str) else ' '.join(x)

            wl[concept] = [list(k) for k in sorted(
                zip(
                    langs,
                    [str(x) for x in entries],
                    words,
                    [str(x) for x in cogids],
                    [checkalm(x) for x in alms],
                ),
                key=lambda x: int(x[3]))]

        # make simple gloss id for internal use as id
        gloss2id = list(
            zip(
                self.alms.concepts,
                [str(x) for x in range(1, len(self.alms.concepts) + 1)]))
        id2gloss = dict([[b, a] for a, b in gloss2id])
        gloss2id = dict(gloss2id)

        txt = ''
        txt += 'CORRS = ' + json.dumps(corrs) + ';\n'
        txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n'
        txt += 'OCCS = ' + json.dumps(occs) + ';\n'
        txt += 'WLS = ' + json.dumps(wl) + ';\n'
        txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n'
        txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n'
        txt += 'FILE = "' + kw['outfile'] + '.tsv";\n'

        tpath = partial(util.data_path, 'templates')
        tname = 'jcov.{0}.html'.format(
            'remote' if 'remote' in kw['export'] else 'direct')
        content = util.read_text_file(tpath(tname))

        util.write_text_file(
            kw['outfile'] + '.html',
            content.format(
                CORRS=txt,
                JCOV=util.read_text_file(tpath('jcov.js')),
STYLE=util.read_text_file(tpath('jcov.css')), VENDOR=util.read_text_file(tpath('jcov.vendor.js')), DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
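A workflow sketch, assuming this method belongs to LingPy's Workflow class (suggested by its use of self.infile); the module path and input file name are assumptions. Note that export is matched by substring, so 'tsv,html' triggers both exports.

from lingpy.basic.workflow import Workflow  # assumed module path

wf = Workflow('polynesian.tsv')  # hypothetical input wordlist
wf.cognate_detection(cognate_method='lexstat', export='tsv,html')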
def write_nexus(
        wordlist,
        mode='mrbayes',
        filename="mrbayes.nex",
        ref="cogid",
        missing="?",
        gap="-",
        custom=None,
        custom_name='lingpy',
        commands=None,
        commands_name="mrbayes"):
    """Write a nexus file for phylogenetic analyses.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    mode : str (default="mrbayes")
        The name of the output nexus style. Valid values are:

        * 'MRBAYES': a MrBayes formatted nexus file.
        * 'SPLITSTREE': a SPLITSTREE formatted nexus file.
        * 'BEAST': a BEAST formatted nexus file.
        * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned
          analyses.
        * 'TRAITLAB': a TRAITLab formatted nexus.

    filename : str (default="mrbayes.nex")
        Name of the file to which the nexus file will be written.
        If set to ``None``, then this function will not write the nexus
        content to a file, but simply return the content as a string.
    ref : str (default="cogid")
        Column in which you store the cognate sets in your data.
    gap : str (default="-")
        The symbol for gaps (not relevant for linguistic analyses).
    missing : str (default="?")
        The symbol for missing characters.
    custom : list (default=None)
        This information allows to add custom information to the nexus file,
        like, for example, the structure of the characters, their original
        concept, or their type, and it will be written into a custom block in
        the nexus file. The name of the custom block can be specified with
        help of the `custom_name` keyword. The content is a list of strings
        which will be written line by line into the custom block.
    custom_name : str (default="lingpy")
        The name of the custom block which will be written to the file.
    commands : list (default=None)
        If specified, will write an additional block containing commands for
        phylogenetic software. The commands are passed as a list, containing
        strings. The name of the block is given by the keyword commands_name.
    commands_name : str (default="mrbayes")
        Determines how the block will be called to which the commands will be
        written.

    Returns
    -------
    nexus : str
        A string containing nexus file output
    """
    templates = {
        'BEAST': 'beast.nex',
        'BEASTWORDS': 'beast.nex',
        'SPLITSTREE': 'splitstree.nex',
        'MRBAYES': 'mrbayes.nex',
        'TRAITLAB': 'splitstree.nex',
    }

    block = "\n\nBEGIN {0};\n{1}\nEND;\n"  # template for nexus blocks

    # check for valid mode
    mode = mode.upper()
    if mode not in templates.keys():
        raise ValueError("Unknown output mode %s" % mode)

    # check for valid template
    template = templates.get(mode)
    tpath = util.Path(template_path(template))
    if tpath.exists():
        _template = util.read_text_file(tpath.as_posix())
    else:  # pragma: no cover
        raise IOError("Unknown template %s" % template)

    # check that `ref` is a valid column
    if ref not in wordlist._alias:
        raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref)

    # retrieve the matrix
    matrix = [[] for x in range(wordlist.width)]
    etd = wordlist.get_etymdict(ref=ref)
    concepts = sorted(
        [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx])
         for (cogid, vals) in etd.items()],
        key=lambda x: (x[1], x[0]))

    # and the missing data ...
    missing_ = {
        t: [concept for (cogid, concept) in concepts
            if concept not in wordlist.get_list(
                col=t, entry=wordlist._row_name, flat=True)]
        for t in wordlist.cols}

    # add ascertainment character for mode=BEAST
    if mode == 'BEAST':
        matrix = [['0'] for m in matrix]

    # skip the constant sites for traitlab
    if mode == 'TRAITLAB':
        concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])]

    # fill matrix
    for i, t in enumerate(wordlist.cols):
        previous = ''
        for cogid, concept in concepts:
            if previous != concept:
                previous = concept
                # add ascertainment character for mode=BEASTWORDS. Note that
                # if a given word:language is missing, then its ascertainment
                # character is the `missing` character.
                if mode == "BEASTWORDS":
                    matrix[i] += ['0'] if concept not in missing_[t] \
                        else [missing]
            matrix[i] += ['1'] if etd[cogid][i] else ['0'] \
                if concept not in missing_[t] else [missing]

    # parse characters into `charsets` (a dict of word=>siteindex positions),
    # and `chars` (a list of characters).
    charsets, chars, previous = defaultdict(list), [], ''
    for i, (cogid, concept) in enumerate(concepts, 1):
        char = util.nexus_slug(concept)
        # add label for ascertainment character in BEAST mode
        if i == 1 and mode == 'BEAST':
            chars.append("_ascertainment")
        # add label for per-word ascertainment characters in BEASTWORDS
        if mode == 'BEASTWORDS' and previous != concept:
            chars.append("%s_ascertainment" % char)
            charsets[char].append(len(chars))
        # finally add label.
        chars.append(char)
        charsets[char].append(len(chars))
        previous = concept

    # create character labels block if needed
    if mode in ('BEAST', 'BEASTWORDS'):
        charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)])
    else:
        charblock = ""

    # create charsets block
    blockname, assumptions = None, ""
    if mode in ('BEASTWORDS', 'MRBAYES'):
        charsets = [
            "\tcharset %s = %d-%d;" % (c, min(m), max(m))
            for (c, m) in charsets.items()]
        blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES'
        assumptions = "\n".join(charsets)

    # commands
    if commands_name.upper() == blockname and len(assumptions) and commands:
        # merge commands specified in the function call into the assumptions
        # block
        assumptions += "\n" + "\n".join("\t%s" % c for c in commands)
    else:
        # write a separate block with the name given in commands_name
        assumptions += block.format(
            commands_name, '\n'.join(commands)) if commands else ''

    # convert state matrix to string.
    _matrix = ""
    maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1
    for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)):
        _matrix += str(util.nexus_slug(taxon) + maxtaxlen * ' ')[:maxtaxlen] + ' '
        _matrix += ''.join([
            '({0})'.format(c) if len(c) > 1 else str(c) for c in m
        ]) + '\n'
    _matrix = _matrix.rstrip()  # remove the trailing newline

    # TODO: symbols could be more than "01", but as this function doesn't
    # TODO: handle multistate data, we just specify them here.
    symbols = '01'

    text = _template.format(
        matrix=_matrix,
        ntax=wordlist.width,
        nchar=len(matrix[0]),
        gap=gap,
        missing=missing,
        dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD',
        commands=block.format(blockname, assumptions),
        custom=block.format(custom_name, '\n'.join(custom)) if custom else '',
        symbols=symbols,
        chars=charblock)
    text = text.replace("\t", " " * 4)  # normalise tab-stops
    for i, (cogid, concept) in enumerate(concepts, 1):
        text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format(i, cogid, concept)
    if filename:
        util.write_text_file(filename, text)
    return text
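A call sketch for the wordlist-based write_nexus: build a Wordlist from a TSV file carrying a 'cogid' column (the file name is invented) and request a BEAST-style nexus as a string.

from lingpy import Wordlist

wl = Wordlist('polynesian.tsv')  # hypothetical wordlist with a 'cogid' column
nex = write_nexus(wl, mode='BEAST', ref='cogid', filename=None)  # string only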