def test_reduce_msa(self):
    """
    Check that ``reduce_alignment`` keeps only the part of each row that
    precedes the first ``'('`` token.

    Every reduced row must consist of four segments and must spell out the
    same string as the leading segments of the corresponding original row.
    """
    msa = MSA(read_msa(test_data('test_reduce.msa')))
    reduced_alignment = reduce_alignment(msa.alignment)
    for i, reduced in enumerate(reduced_alignment):
        original = msa.alignment[i]
        assert len(reduced) == 4
        # The position of '(' is looked up in the row, so the row itself is
        # sliced before joining; slicing the joined string instead would only
        # be correct if every segment were a single character.
        cut = original.index('(')
        assert ''.join(reduced) == ''.join(original[:cut])
def test_reduce_msa(test_data):
    """
    Check that ``reduce_alignment`` keeps only the part of each row that
    precedes the first ``'('`` token.

    ``test_data`` is joined with the file name via ``/`` and converted to a
    string before it is handed to ``read_msa``. Every reduced row must
    consist of four segments and must spell out the same string as the
    leading segments of the corresponding original row.
    """
    msa = MSA(read_msa(str(test_data / 'test_reduce.msa')))
    reduced_alignment = reduce_alignment(msa.alignment)
    for i, reduced in enumerate(reduced_alignment):
        original = msa.alignment[i]
        assert len(reduced) == 4
        # The position of '(' is looked up in the row, so the row itself is
        # sliced before joining; slicing the joined string instead would only
        # be correct if every segment were a single character.
        cut = original.index('(')
        assert ''.join(reduced) == ''.join(original[:cut])
def test_msa2str(self):
    """
    Check that ``msa2str`` renders the same text for an MSA object, a plain
    dictionary and a dictionary read with IDs, and that the ``merge`` option
    only changes the output from the ``COLUMNID`` marker onwards.
    """
    layout = '{body}{meta}'

    # the same alignment, obtained in three different ways: as an MSA
    # object, as a dictionary, and as a dictionary read from a file with IDs
    from_object = MSA(test_data('harry.msa'))
    from_dict = qlc.read_msa(test_data('harry.msa'))
    with_ids = qlc.read_msa(
        test_data('harry_with_ids.msa'), ids=True, header=False)

    # dataset and seq_id are adjusted, since the output would not be
    # comparable otherwise
    with_ids['seq_id'] = 'test'
    with_ids['dataset'] = 'file'

    # rendered with the same layout, all three should give identical strings
    text_object = msa2str(from_object, _arange=layout)
    text_dict = msa2str(from_dict, _arange=layout)
    text_ids = msa2str(with_ids, _arange=layout, wordlist=False)
    assert text_object == text_dict == text_ids

    # conversion with and without the merging attribute
    merged = msa2str(with_ids, _arange=layout, wordlist=True, merge=True)
    unmerged = msa2str(with_ids, _arange=layout, wordlist=True, merge=False)

    # tab stops are dropped so that the two strings can be compared
    merged_flat = merged.replace('\t', '')
    unmerged_flat = unmerged.replace('\t', '')

    # everything in front of 'COLUMNID' has to agree
    cut = merged_flat.index('COLUMNID')
    assert merged != unmerged
    assert merged_flat[:cut] == unmerged_flat[:cut]

    # add a consensus string to both dictionaries and compare the default
    # rendering
    consensus_dict = get_consensus(MSA(from_dict), gaps=True)
    consensus_ids = get_consensus(MSA(with_ids), gaps=True)
    from_dict['consensus'], with_ids['consensus'] = (
        consensus_dict, consensus_ids)
    assert msa2str(from_dict) == msa2str(with_ids, wordlist=False)
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        The name of the ``msa``-file; it is loaded with ``read_msa``.
    template : str (default="")
        The path to the LaTeX template. If no path is given, the template
        ``msa.tex`` resolved by ``template_path`` is used.
    filename : str (default="")
        The name of the output file, without the ``.tex`` suffix, which is
        appended automatically.
    **keywords
        Accepted for backwards compatibility (e.g. ``pid_mode``,
        ``pid_score``); they do not influence the output, since the template
        placeholders filled in here do not include a PID score.

    Notes
    -----
    The placeholders ``<+WIDTH+>``, ``<+HEIGHT+>``, ``<+NEWX+>``,
    ``<+NEWY+>`` and ``<+CONTENT+>`` of the template are replaced, and the
    result is written with ``util.write_text_file``.
    """
    msa = read_msa(infile)
    tex = util.read_text_file(template or template_path('msa.tex'))

    # The PID score that used to be calculated here was never inserted into
    # the template, and its calculation raised a ZeroDivisionError for
    # alignments consisting of a single sequence, so it is no longer computed.

    # determine the length of the longest taxon and the table dimensions
    taxl = max(len(t) for t in msa['taxa'])
    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    # collect the columns taking part in swaps
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # table header
    parts = [
        r'\tabular{l' + width * 'c' + '}\n',
        r'\bf\ttfamily Taxon & \multicolumn{' + str(width)
        + r'}{l}{\bf\ttfamily Alignment}\\' + '\n',
    ]

    # one table row per taxon, one coloured cell per alignment column
    for i, taxon in enumerate(msa['taxa']):
        parts.append(r'\ttfamily ' + taxon.replace('_', r'\_'))
        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                cls = 'X'
            else:
                cls = token2class(char, rcParams['dolgo'])

            # neither the segment '_' nor the class '_' is written to the
            # LaTeX source as is
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'

            cell = r'&\cellcolor{col' + cls + r'}'
            if j in swaps:
                # swapped columns are set in bold; gaps keep the default
                # text colour
                cell += r'\bf ' if char == '-' else r'\color{white}\bf '
            parts.append(cell + char)
        parts.append(r'\\' + '\n')

    # closing row of white 'XXX' entries, one per alignment column
    parts.append(
        r'&' + '&'.join([r'\color{white}XXX'] * width) + r'\\' + '\n')
    parts.append(r'\endtabular' + '\n')
    body = ''.join(parts)

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    # all four numbers are written with two decimals ('{0:2f}' only set a
    # minimum field width and left the precision at its default)
    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        # NOTE(review): the '{0}' placeholder is never filled in, so the
        # default output file is literally 'lingpy-{0}.tex'; kept unchanged
        # so that the output path stays stable -- confirm the intended name.
        filename = 'lingpy-{0}'
    util.write_text_file(filename + '.tex', tex)
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The name of the ``msa``-file. The file is loaded with ``read_msa``,
        which also receives all keyword arguments.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        if it is missing.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The special value
        ``"js"`` selects the template ``msa2html.js.html``.
    pid_mode : int (default=1)
        Passed as ``mode`` to ``pid`` when the PID score is calculated.
    pid_score : int
        If given, this value is used instead of calculating the PID score.
    css : str or bool (default=False)
        The path to the CSS file; the template ``msa.css`` is used if the
        value is false.
    js : str or bool (default=False)
        The path to the JavaScript file; the template ``msa.js`` is used if
        the value is false.
    compact : bool (default=False)
        If set to c{True}, runs of whitespace in the output are collapsed.
    class_sort : bool (default=True)
        If set to c{True}, sequences are numbered in the order of their
        ``asjp`` sound classes, otherwise in the order of the sequences
        themselves.
    write_to_file : bool (default=True)
        If set to c{True}, the page is written to ``filename``, otherwise it
        is returned.

    Returns
    -------
    html : str or None
        The ``html``-page if ``write_to_file`` is false, otherwise c{None}.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # only file names are supported as input
    if not isinstance(msa, string_types):
        raise ValueError('[!] No filename specified.')
    msa = read_msa(msa, **keywords)

    # load dataset, etc.
    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' in keywords:
        pid_score = keywords['pid_score']
    else:
        pid_sum = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_sum += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * pid_sum / count + 0.5)
        else:
            # fewer than two sequences: there is no pair to compare (this
            # used to raise a ZeroDivisionError); such an alignment is
            # reported as fully identical
            pid_score = 100

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon and the alignment width
    taxl = max(len(t) for t in msa['taxa'])
    width = len(msa['alignment'][0])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # check for local alignments: without the key 'local' every column is
    # marked with '*', otherwise only the columns listed there
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = {seq: rank for rank, (_, seq) in enumerate(ranked, 1)}
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # class names which differ from the plain 'dolgo_' + class pattern
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # alignment strings which have already been written to the table
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cell = td_swap
            elif local[j] != '*':
                cell = td_unaligned
            else:
                cell = td_residue
            cells.append(cell.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))

    html = html.format(
        table=''.join(rows),
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']
    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
def test_normalize_alignment(test_data):
    """
    Check that every row of the alignment loaded from ``harry_unnormal.msa``
    has the same length as the first row.
    """
    source = str(test_data / 'harry_unnormal.msa')
    msa = MSA(read_msa(source))
    # the first row serves as the reference; it is only looked at if there
    # are further rows to compare it with
    assert all(
        len(row) == len(msa.alignment[0]) for row in msa.alignment[1:])
def test_read_msa(test_data):
    """
    Check that an ``MSA`` built from the result of ``read_msa`` for
    ``harry.msa`` has an attribute ``seqs``.
    """
    source = str(test_data / 'harry.msa')
    loaded = read_msa(source)
    msa = MSA(loaded)
    assert hasattr(msa, 'seqs')
def msa2tex(
    infile,
    template='',
    filename='',
    **keywords
):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.

    Parameters
    ----------
    infile : str
        The name of the ``msa``-file; it is loaded with ``read_msa``.
    template : str (default="")
        The path to the LaTeX template. If no path is given, the template
        ``msa.tex`` resolved by ``template_path`` is used.
    filename : str (default="")
        The name of the output file, without the ``.tex`` suffix, which is
        appended automatically.
    **keywords
        Accepted for backwards compatibility (e.g. ``pid_mode``,
        ``pid_score``); they do not influence the output, since the template
        placeholders filled in here do not include a PID score.

    Notes
    -----
    The placeholders ``<+WIDTH+>``, ``<+HEIGHT+>``, ``<+NEWX+>``,
    ``<+NEWY+>`` and ``<+CONTENT+>`` of the template are replaced, and the
    result is written with ``util.write_text_file``.
    """
    msa = read_msa(infile)
    tex = util.read_text_file(template or template_path('msa.tex'))

    # The PID score that used to be calculated here was never inserted into
    # the template, and its calculation raised a ZeroDivisionError for
    # alignments consisting of a single sequence, so it is no longer computed.

    # determine the length of the longest taxon and the table dimensions
    taxl = max(len(t) for t in msa['taxa'])
    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    # collect the columns taking part in swaps
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # table header
    parts = [
        r'\tabular{l' + width * 'c' + '}\n',
        r'\bf\ttfamily Taxon & \multicolumn{' + str(width)
        + r'}{l}{\bf\ttfamily Alignment}\\' + '\n',
    ]

    # one table row per taxon, one coloured cell per alignment column
    for i, taxon in enumerate(msa['taxa']):
        parts.append(r'\ttfamily ' + taxon.replace('_', r'\_'))
        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                cls = 'X'
            else:
                cls = token2class(char, rcParams['dolgo'])

            # neither the segment '_' nor the class '_' is written to the
            # LaTeX source as is
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'

            cell = r'&\cellcolor{col' + cls + r'}'
            if j in swaps:
                # swapped columns are set in bold; gaps keep the default
                # text colour
                cell += r'\bf ' if char == '-' else r'\color{white}\bf '
            parts.append(cell + char)
        parts.append(r'\\' + '\n')

    # closing row of white 'XXX' entries, one per alignment column
    parts.append(
        r'&' + '&'.join([r'\color{white}XXX'] * width) + r'\\' + '\n')
    parts.append(r'\endtabular' + '\n')
    body = ''.join(parts)

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    # all four numbers are written with two decimals ('{0:2f}' only set a
    # minimum field width and left the precision at its default)
    tex = tex.replace('<+WIDTH+>', '{0:.2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:.2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        # NOTE(review): the '{0}' placeholder is never filled in, so the
        # default output file is literally 'lingpy-{0}.tex'; kept unchanged
        # so that the output path stays stable -- confirm the intended name.
        filename = 'lingpy-{0}'
    util.write_text_file(filename + '.tex', tex)
def msa2html(
    msa,
    shorttitle='',
    filename='',
    template='',
    **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The name of the ``msa``-file. The file is loaded with ``read_msa``,
        which also receives all keyword arguments.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        if it is missing.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be found
        under ``lingpy/data/templates/msa2html.html``. The special value
        ``"js"`` selects the template ``msa2html.js.html``.
    pid_mode : int (default=1)
        Passed as ``mode`` to ``pid`` when the PID score is calculated.
    pid_score : int
        If given, this value is used instead of calculating the PID score.
    css : str or bool (default=False)
        The path to the CSS file; the template ``msa.css`` is used if the
        value is false.
    js : str or bool (default=False)
        The path to the JavaScript file; the template ``msa.js`` is used if
        the value is false.
    compact : bool (default=False)
        If set to c{True}, runs of whitespace in the output are collapsed.
    class_sort : bool (default=True)
        If set to c{True}, sequences are numbered in the order of their
        ``asjp`` sound classes, otherwise in the order of the sequences
        themselves.
    write_to_file : bool (default=True)
        If set to c{True}, the page is written to ``filename``, otherwise it
        is returned.

    Returns
    -------
    html : str or None
        The ``html``-page if ``write_to_file`` is false, otherwise c{None}.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they belong
    to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # only file names are supported as input
    if not isinstance(msa, string_types):
        raise ValueError('[!] No filename specified.')
    msa = read_msa(msa, **keywords)

    # load dataset, etc.
    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' in keywords:
        pid_score = keywords['pid_score']
    else:
        pid_sum = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_sum += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * pid_sum / count + 0.5)
        else:
            # fewer than two sequences: there is no pair to compare (this
            # used to raise a ZeroDivisionError); such an alignment is
            # reported as fully identical
            pid_score = 100

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon and the alignment width
    taxl = max(len(t) for t in msa['taxa'])
    width = len(msa['alignment'][0])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # check for local alignments: without the key 'local' every column is
    # marked with '*', otherwise only the columns listed there
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = {seq: rank for rank, (_, seq) in enumerate(ranked, 1)}
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # class names which differ from the plain 'dolgo_' + class pattern
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # alignment strings which have already been written to the table
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cell = td_swap
            elif local[j] != '*':
                cell = td_unaligned
            else:
                cell = td_residue
            cells.append(cell.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))

    html = html.format(
        table=''.join(rows),
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']
    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
def test_normalize_alignment(self):
    """
    Check that every row of the alignment loaded from ``harry_unnormal.msa``
    has the same length as the first row.
    """
    source = test_data('harry_unnormal.msa')
    msa = MSA(read_msa(source))
    # the first row serves as the reference; it is only looked at if there
    # are further rows to compare it with
    assert all(
        len(row) == len(msa.alignment[0]) for row in msa.alignment[1:])
def test_read_msa(self):
    """
    Check that an ``MSA`` built from the result of ``read_msa`` for
    ``harry.msa`` has an attribute ``seqs``.
    """
    source = test_data('harry.msa')
    loaded = read_msa(source)
    msa = MSA(loaded)
    assert hasattr(msa, 'seqs')