def test_pid(self):
    """Check ``pid`` in all four modes on a half-matching pair and on gaps."""
    # 'mattis' vs 'maTTIs' is expected to score 0.5 whichever mode is
    # selected; the mode is passed positionally here.
    for mode in (1, 2, 3, 4):
        assert pid('mattis', 'maTTIs', mode) == 0.5

    # Pairs in which every residue faces a gap are expected to score 0.
    for mode in (1, 2):
        assert pid('m-', '-m', mode=mode) == 0
    assert pid('m', '-', mode=3) == 0
    assert pid('m-', '-m', mode=4) == 0
def test_pid():
    """Check ``pid`` with its default mode on a half-matching pair."""
    score = pid('mattis', 'maTTIs')
    assert score == 0.5
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.

    Parameters
    ----------
    infile : str
        Path of the PSA file to convert. Lines starting with the comment
        marker are ignored and the first remaining line is dropped. The rest
        is read in steps of four lines: an identifier line, two tab-separated
        lines (taxon name followed by the aligned segments), and one further
        line that is skipped.
    template : str (default=False)
        Path to the html template. Falls back to the ``psa.html`` template.
    css : str (default=False)
        Path to the stylesheet. Falls back to the ``psa.css`` template.
    comment : str (default="#")
        Prefix marking lines of the input that are ignored.
    filename : str
        Name of the output file. Defaults to ``infile`` with its last four
        characters replaced by ``.html``.
    compact : bool (default=True)
        If True, collapse all whitespace in the resulting html.

    Notes
    -----
    A record whose two rows differ in length is reported with a warning and
    left out of the output; reading then resumes one line further down.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4] + '.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = [
        line for line in util.read_text_file(infile, lines=True)
        if not line.startswith(kw['comment'])
    ]
    # the first non-comment line is not part of any alignment record
    del data[0]

    seq_ids = []
    taxa = []
    alignments = []

    i = 0
    while i <= len(data) - 3:
        datA = data[i + 1].split('\t')
        datB = data[i + 2].split('\t')
        almA = [str(a) for a in datA[1:]]
        almB = [str(b) for b in datB[1:]]

        # Validate *before* storing anything: a record whose rows differ in
        # length must not reach the output. An explicit test is used instead
        # of ``assert`` so the check survives ``python -O``.
        if len(almA) != len(almB):
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1
            continue

        seq_ids.append(data[i])
        taxa.append((datA[0].strip('.'), datB[0].strip('.')))
        alignments.append((almA, almB))
        i += 4

    # three sound classes carry a css name that differs from their symbol
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    def get_classes(alm):
        """Render one aligned sequence as a run of colored ``div`` cells."""
        residue = '<div class="residue {1}">{0}</div>'
        cells = []
        for char in alm:
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)
            cells.append(residue.format(char, d))
        return ''.join(cells)

    rows = ['<table>\n']
    for i, (a, b) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        # percentage identity, rounded to the nearest integer
        ids = int(100 * pid(a, b) + 0.5)

        rows.append('<tr class="head">')
        rows.append(
            '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
                i + 1, seq_ids[i], ids))
        rows.append('<tr class="psa">')
        rows.append('<td class="taxon">{0}</td>'.format(taxa[i][0]))
        rows.append('<td class="psa">{0}</td>'.format(clsA))
        rows.append('</tr>')
        rows.append('<tr class="psa">')
        rows.append('<td class="taxon">{0}</td>'.format(taxa[i][1]))
        rows.append('<td class="psa">{0}</td>'.format(clsB))
        rows.append('</tr>')
        rows.append('<tr><td colspan=2></td></tr>')
    rows.append('</table>')
    out = ''.join(rows)

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        Path of the MSA file; it is loaded with ``read_msa``.
    template : str (default="")
        Path to the LaTeX template. Falls back to the ``msa.tex`` template.
    filename : str (default="")
        Name of the output file without extension; ``.tex`` is appended.
    **keywords
        Accepted for backward compatibility (e.g. ``pid_mode``,
        ``pid_score``). They are not used, because no identity score is
        written to the LaTeX output.
    """
    # msa-format is loaded through the regular reader; the loss in speed
    # won't matter much, since output of data is not a daily task
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # collect the columns that take part in a swap
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    parts = [start]
    for i, taxon in enumerate(msa['taxa']):
        parts.append(r'\ttfamily ' + taxon.replace('_', r'\_'))
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'

            # replace the symbols that cannot be used literally in the
            # cell text / color name
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'

            if j not in swaps:
                parts.append(r'&\cellcolor{col' + cls + r'}' + char)
            elif char != '-':
                parts.append(r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char)
            else:
                parts.append(r'&\cellcolor{col' + cls + r'}\bf ' + char)
        parts.append(r'\\' + '\n')

    # closing row of white placeholder cells, one per alignment column
    parts.append(
        r'&' + '&'.join([r'\color{white}XXX' for _ in range(width)]) + r'\\' + '\n')
    parts.append(r'\endtabular' + '\n')
    body = ''.join(parts)

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    # two decimals, matching the NEWX/NEWY placeholders below
    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        # NOTE(review): the placeholder is never filled in, so the default
        # output is literally "lingpy-{0}.tex" -- confirm the intended name.
        filename = 'lingpy-{0}'
    util.write_text_file(filename + '.tex', tex)
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        Path of a file in ``msa``-format; it is loaded with ``read_msa``.
        Any value that is not a string raises a ``ValueError``.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the
        ``filename`` entry of ``rcParams`` is used. The extension ``.html``
        is appended if it is missing.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The special value
        ``"js"`` selects the ``msa2html.js.html`` template.

    pid_score : int
        Identity score shown on the page. If it is not passed, the score is
        the rounded mean percentage identity over all pairs of aligned
        sequences (computed with ``pid_mode``); an alignment with a single
        sequence, which has no pairs, is reported as 100.

    css, js : str (default=False)
        Paths to the stylesheet and the script inserted into the page.

    compact : bool (default=False)
        If True, collapse all whitespace in the resulting html.

    class_sort : bool (default=True)
        If True, number the sequences by their ASJP sound-class string,
        otherwise by the sequences themselves.

    write_to_file : bool (default=True)
        If True, write the page to ``filename`` (and return nothing),
        otherwise return the html as a string.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        total = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    total += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * total / count + 0.5)
        else:
            # a single sequence has no pairs to compare; report it as fully
            # identical instead of dividing by zero
            pid_score = 100
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    width = len(msa['alignment'][0])

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # collect the columns that take part in a swap
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # columns marked '*' are rendered as aligned; when the msa lists local
    # columns, only those keep the mark
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = dict((pair[1], rank) for rank, pair in enumerate(ranked, 1))
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # three sound classes carry a css name that differs from their symbol
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # alignments already rendered, used to flag repeated rows
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))
    out = ''.join(rows)

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.

    Parameters
    ----------
    infile : str
        Path of the PSA file to convert. Lines starting with the comment
        marker are ignored and the first remaining line is dropped. The rest
        is read in steps of four lines: an identifier line, two tab-separated
        lines (taxon name followed by the aligned segments), and one further
        line that is skipped.
    template : str (default=False)
        Path to the html template. Falls back to the ``psa.html`` template.
    css : str (default=False)
        Path to the stylesheet. Falls back to the ``psa.css`` template.
    comment : str (default="#")
        Prefix marking lines of the input that are ignored.
    filename : str
        Name of the output file. Defaults to ``infile`` with its last four
        characters replaced by ``.html``.
    compact : bool (default=True)
        If True, collapse all whitespace in the resulting html.

    Notes
    -----
    A record whose two rows differ in length is reported with a warning and
    left out of the output; reading then resumes one line further down.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4] + '.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = [
        line for line in util.read_text_file(infile, lines=True)
        if not line.startswith(kw['comment'])
    ]
    # the first non-comment line is not part of any alignment record
    del data[0]

    seq_ids = []
    taxa = []
    alignments = []

    i = 0
    while i <= len(data) - 3:
        datA = data[i + 1].split('\t')
        datB = data[i + 2].split('\t')
        almA = [str(a) for a in datA[1:]]
        almB = [str(b) for b in datB[1:]]

        # Validate *before* storing anything: a record whose rows differ in
        # length must not reach the output. An explicit test is used instead
        # of ``assert`` so the check survives ``python -O``.
        if len(almA) != len(almB):
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1
            continue

        seq_ids.append(data[i])
        taxa.append((datA[0].strip('.'), datB[0].strip('.')))
        alignments.append((almA, almB))
        i += 4

    # three sound classes carry a css name that differs from their symbol
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    def get_classes(alm):
        """Render one aligned sequence as a run of colored ``div`` cells."""
        residue = '<div class="residue {1}">{0}</div>'
        cells = []
        for char in alm:
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)
            cells.append(residue.format(char, d))
        return ''.join(cells)

    rows = ['<table>\n']
    for i, (a, b) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        # percentage identity, rounded to the nearest integer
        ids = int(100 * pid(a, b) + 0.5)

        rows.append('<tr class="head">')
        rows.append(
            '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
                i + 1, seq_ids[i], ids))
        rows.append('<tr class="psa">')
        rows.append('<td class="taxon">{0}</td>'.format(taxa[i][0]))
        rows.append('<td class="psa">{0}</td>'.format(clsA))
        rows.append('</tr>')
        rows.append('<tr class="psa">')
        rows.append('<td class="taxon">{0}</td>'.format(taxa[i][1]))
        rows.append('<td class="psa">{0}</td>'.format(clsB))
        rows.append('</tr>')
        rows.append('<tr><td colspan=2></td></tr>')
    rows.append('</table>')
    out = ''.join(rows)

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        Path of the MSA file; it is loaded with ``read_msa``.
    template : str (default="")
        Path to the LaTeX template. Falls back to the ``msa.tex`` template.
    filename : str (default="")
        Name of the output file without extension; ``.tex`` is appended.
    **keywords
        Accepted for backward compatibility (e.g. ``pid_mode``,
        ``pid_score``). They are not used, because no identity score is
        written to the LaTeX output.
    """
    # msa-format is loaded through the regular reader; the loss in speed
    # won't matter much, since output of data is not a daily task
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # collect the columns that take part in a swap
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    parts = [start]
    for i, taxon in enumerate(msa['taxa']):
        parts.append(r'\ttfamily ' + taxon.replace('_', r'\_'))
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            else:
                cls = 'X'

            # replace the symbols that cannot be used literally in the
            # cell text / color name
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'

            if j not in swaps:
                parts.append(r'&\cellcolor{col' + cls + r'}' + char)
            elif char != '-':
                parts.append(r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char)
            else:
                parts.append(r'&\cellcolor{col' + cls + r'}\bf ' + char)
        parts.append(r'\\' + '\n')

    # closing row of white placeholder cells, one per alignment column
    parts.append(
        r'&' + '&'.join([r'\color{white}XXX' for _ in range(width)]) + r'\\' + '\n')
    parts.append(r'\endtabular' + '\n')
    body = ''.join(parts)

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    # two decimals, matching the NEWX/NEWY placeholders below
    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        # NOTE(review): the placeholder is never filled in, so the default
        # output is literally "lingpy-{0}.tex" -- confirm the intended name.
        filename = 'lingpy-{0}'
    util.write_text_file(filename + '.tex', tex)
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        Path of a file in ``msa``-format; it is loaded with ``read_msa``.
        Any value that is not a string raises a ``ValueError``.

    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.

    filename : str (default="")
        Define the name of the output file. If no name is defined, the
        ``filename`` entry of ``rcParams`` is used. The extension ``.html``
        is appended if it is missing.

    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The special value
        ``"js"`` selects the ``msa2html.js.html`` template.

    pid_score : int
        Identity score shown on the page. If it is not passed, the score is
        the rounded mean percentage identity over all pairs of aligned
        sequences (computed with ``pid_mode``); an alignment with a single
        sequence, which has no pairs, is reported as 100.

    css, js : str (default=False)
        Paths to the stylesheet and the script inserted into the page.

    compact : bool (default=False)
        If True, collapse all whitespace in the resulting html.

    class_sort : bool (default=True)
        If True, number the sequences by their ASJP sound-class string,
        otherwise by the sequences themselves.

    write_to_file : bool (default=True)
        If True, write the page to ``filename`` (and return nothing),
        otherwise return the html as a string.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        total = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    total += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * total / count + 0.5)
        else:
            # a single sequence has no pairs to compare; report it as fully
            # identical instead of dividing by zero
            pid_score = 100
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    width = len(msa['alignment'][0])

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # collect the columns that take part in a swap
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # columns marked '*' are rendered as aligned; when the msa lists local
    # columns, only those keep the mark
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = dict((pair[1], rank) for rank, pair in enumerate(ranked, 1))
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # three sound classes carry a css name that differs from their symbol
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # alignments already rendered, used to flag repeated rows
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))
    out = ''.join(rows)

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']

    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html