def evaluate_string(self, string, tokens=False, **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    if not tokens:
        tokens = ipa2tokens(string)
    score = 1
    dist = self.dist['#']

    # convert the input to (prosodic string, segment) bigrams
    prostring = prosodic_string(tokens, rcParams['art'],
                                cldf=keywords['cldf'],
                                diacritics=keywords['diacritics'],
                                stress=keywords['stress'])
    if self.classes:
        c = tokens2class(tokens, self.model, cldf=keywords['cldf'],
                         diacritics=keywords['diacritics'],
                         stress=keywords['stress'])
        teststring = list(zip(prostring, c))
    else:
        teststring = list(zip(prostring, tokens))

    # score each segment by its relative frequency in the distribution
    # following the previous segment
    scores = []
    while len(teststring) > 0:
        segment = teststring.pop(0)
        freq = dist.count(segment)
        allf = len(dist)
        s = freq / allf
        score = score * s
        scores += [s]
        dist = self.dist[segment]

    # normalize the log-score by the length of the sequence
    lscore = np.log10(score)
    lscore = lscore / len(tokens)
    return score, lscore  # np.log10(score)
def __init__(self, words, tokens=False, prostrings=[], classes=False,
             class_model=rcParams['model'], **keywords):
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    self.model = class_model
    self.words = words
    self.tokens = []
    self.bigrams = []
    self.classes = []

    # start filling the dictionary
    for i, w in enumerate(words):
        # check for tokenized string
        if not tokens:
            tk = ipa2tokens(w, **keywords)
        else:
            tk = w[:]
        self.tokens += [tk]

        # create prosodic string
        if prostrings:
            p = prostrings[i]
        else:
            p = prosodic_string(tk, rcParams['art'],
                                cldf=keywords['cldf'],
                                diacritics=keywords['diacritics'],
                                stress=keywords['stress'])
        # create classes
        if classes:
            c = tokens2class(tk, class_model, cldf=keywords['cldf'],
                             diacritics=keywords['diacritics'],
                             stress=keywords['stress'])
            bigrams = list(zip(p, c))
            self.classes += [c]
        else:
            # zip the stuff
            bigrams = list(zip(p, tk))

        # start appending the stuff
        self.bigrams += [bigrams]

    # init the mother object
    MCBasic.__init__(self, self.bigrams)
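# A usage sketch for the two methods above (an assumption: they belong to
# lingpy's MCPhon class in lingpy.sequence.generate; the training words
# are made up).
from lingpy.sequence.generate import MCPhon

mc = MCPhon(['hant', 'fus', 'kopf'])           # tiny hypothetical corpus
score, logscore = mc.evaluate_string('hant')   # raw score and length-normalized log10 score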
def check_tokens(tokens, **keywords):
    """
    Function checks whether tokens are given in a consistent input format.
    """
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    errors = []
    for i, token in enumerate(tokens):
        # check for conversion within the articulation-model
        cls = token2class(token, rcParams['art'], stress=keywords['stress'],
                          cldf=keywords['cldf'],
                          diacritics=keywords['diacritics'])
        if cls == '0':
            errors.append((i, token))
    return errors
def diff(self, **keywords):
    """
    Write all differences between two sets to a file.

    Parameters
    ----------
    filename : str (default: the gold standard's input file)
        The name of the output file; the extension ``.diff`` is appended
        if it is missing.
    """
    setdefaults(keywords, filename=self.gold.infile)
    if not keywords['filename'].endswith('.diff'):
        keywords['filename'] = keywords['filename'] + '.diff'

    out = []
    for i, (a, b) in enumerate(zip(self.gold.alignments, self.test.alignments)):
        g1, g2, g3 = a
        t1, t2, t3 = b
        maxL = max([len(g1), len(t1)])
        if g1 != t1 or g2 != t2:
            taxA, taxB = self.gold.taxa[i]
            taxlen = max(len(taxA), len(taxB))
            seq_id = self.gold.seq_ids[i]
            out.append('{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.format(
                seq_id,
                taxA,
                '\t'.join(g1),
                taxB,
                '\t'.join(g2),
                '{0}\t{1}'.format(
                    taxlen * ' ', '\t'.join(['==' for x in range(maxL)])),
                '\t'.join(t1),
                '\t'.join(t2),
            ))
    log.file_written(keywords['filename'])
    write_text_file(keywords['filename'], out)
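# How the diff method above is typically reached (a sketch, assuming it
# lives on lingpy's EvalPSA class in lingpy.evaluate.apa; the PSA files
# are placeholders).
from lingpy import PSA
from lingpy.evaluate.apa import EvalPSA

gold = PSA('gold.psa')        # gold-standard pairwise alignments
test = PSA('test.psa')        # alignments to be evaluated
EvalPSA(gold, test).diff()    # writes disagreeing pairs to '<gold-infile>.diff'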
def check_tokens(tokens, **keywords):
    """
    Function checks whether tokens are given in a consistent input format.
    """
    setdefaults(keywords, stress=rcParams['stress'])
    errors = []
    for i, token in enumerate(tokens):
        # check for conversion within the articulation-model
        try:
            rcParams['art'].converter[token]
        except KeyError:
            try:
                rcParams['art'].converter[token[0]]
            except KeyError:
                if token[0] in keywords['stress']:
                    try:
                        rcParams['art'].converter[token[1]]
                    except KeyError:
                        errors.append((i, token))
                else:
                    errors.append((i, token))
    return errors
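# Both check_tokens variants above share this calling convention (a
# sketch; the import path lingpy.sequence.sound_classes is an assumption).
from lingpy import ipa2tokens
from lingpy.sequence.sound_classes import check_tokens

bad = check_tokens(ipa2tokens('tʰɔxtər'))
print(bad)   # [] if every token is covered, else (index, token) pairs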
def align(self, **keywords):
    """
    Align a pair of sequences or multiple sequence pairs.

    Parameters
    ----------
    gop : int (default=-1)
        The gap opening penalty (GOP).
    scale : float (default=0.5)
        The gap extension penalty (GEP), calculated with help of a scaling
        factor.
    mode : {"global", "local", "overlap", "dialign"}
        The alignment mode, see :evobib:`List2012a` for details.
    factor : float (default=0.3)
        The factor by which matches in identical prosodic position are
        increased.
    restricted_chars : str (default="T\_")
        The restricted chars that function as an indicator of syllable or
        morpheme breaks for secondary alignment, see :evobib:`List2012c`
        for details.
    distance : bool (default=False)
        If set to *True*, return the distance instead of the similarity
        score. Distance is calculated using the formula by
        :evobib:`Downey2008`.
    model : { None, ~lingpy.data.model.Model }
        Specify the sound class model that shall be used for the analysis.
        If no model is specified, the default model of :evobib:`List2012a`
        will be used.
    pprint : bool (default=False)
        If set to *True*, the alignments are printed to the screen.
    """
    setdefaults(
        keywords,
        gop=-1,
        scale=0.5,
        mode='global',
        factor=0.3,
        restricted_chars='T_',
        distance=False,
        model=rcParams['sca'],
        pprint=False,
        transform=rcParams['align_transform'])

    if hasattr(self, 'model'):
        if keywords['model'] != self.model:
            self._set_model(**keywords)
    else:
        self._set_model(**keywords)

    # create the alignments array
    self._alignments = calign.align_pairs(
        self.classes,
        self.weights,
        self.prostrings,
        keywords['gop'],
        keywords['scale'],
        keywords['factor'],
        self.scoredict,
        keywords['mode'],
        keywords['restricted_chars'],
        distance=1 if keywords['distance'] else 0)

    # switch back to alignments
    self.alignments = []
    for i, (almA, almB, sim) in enumerate(self._alignments):
        self.alignments.append((
            class2tokens(self.tokens[i][0], almA, local=keywords['mode'] == "local"),
            class2tokens(self.tokens[i][1], almB, local=keywords['mode'] == "local"),
            sim))

    # print the alignments, if this is chosen
    as_string(self, pprint=keywords['pprint'])
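# A usage sketch for the align method above (assuming it is the align
# method of lingpy's Pairwise class; the two words are the toy pair from
# the lingpy documentation).
from lingpy import Pairwise

pw = Pairwise('waldemar', 'woldemort')
pw.align(pprint=True)    # defaults: gop=-1, scale=0.5, mode='global', ...
print(pw.alignments)     # list of (almA, almB, similarity) triples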
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4] + '.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])
            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')
            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]
            taxa.append((taxonA, taxonB))
            pairs.append((
                '.'.join([k for k in almA if k != '-']),
                '.'.join([k for k in almB if k != '-'])))
            alignments.append((
                [str(a) for a in almA],
                [str(b) for b in almB],
                0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warning("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'
    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
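# Usage is a single call (a sketch; 'test.psa' is a placeholder PSA file).
psa2html('test.psa')                         # writes 'test.html' next to the input
psa2html('test.psa', filename='out.html')    # or pick the output name explicitly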
def msa2tex(infile, template='', filename='', **keywords):
    """
    Convert an MSA to a tabular representation which can easily be used in
    LaTeX documents.
    """
    util.setdefaults(keywords, pid_mode=1)

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load msa
    msa = read_msa(infile)

    # load templates
    tex = util.read_text_file(template or template_path('msa.tex'))

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    dataset = msa['dataset']
    infile = msa['infile']
    seq_id = msa['seq_id']

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    height = len(msa['alignment'])
    width = len(msa['alignment'][0])

    start = r'\tabular{l' + width * 'c' + '}\n'
    start += r'\bf\ttfamily Taxon & \multicolumn{' + str(
        width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    body = start
    for i, taxon in enumerate(msa['taxa']):
        body += r'\ttfamily ' + taxon.replace('_', r'\_')
        for j, char in enumerate(msa['alignment'][i]):
            if char != '-':
                cls = token2class(char, rcParams['dolgo'])
            elif char == '-':
                cls = 'X'
            if char == '_':
                char = r'\#'
            if cls == '_':
                cls = '2'
            if j not in swaps:
                body += r'&\cellcolor{col' + cls + r'}' + char
            else:
                if char != '-':
                    body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char
                else:
                    body += r'&\cellcolor{col' + cls + r'}\bf ' + char
        body += r'\\' + '\n'

    body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n'
    body += r'\endtabular' + '\n'

    # create the parameters etc.
    w = 1.5 * width + taxl * 0.25
    h = 0.5 * height + 1.0

    tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w))
    tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h))

    # create the rput stuff
    tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0))
    tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0))

    # insert the rest
    tex = tex.replace('<+CONTENT+>', body)

    # write to file
    if not filename:
        filename = 'lingpy-{0}'

    util.write_text_file(filename + '.tex', tex)
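# A usage sketch for msa2tex (the file name follows the 'harry' toy
# example used elsewhere in lingpy's docs).
msa2tex('harry.msa', filename='harry')   # writes 'harry.tex' with the colored tabular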
def alm2html(infile, title='', shorttitle='', filename='', colored=False,
             main_template='', table_template='', dataset='',
             confidence=False, **keywords):
    """
    Convert files in ``alm``-format into colored ``html``-format.

    Parameters
    ----------
    title : str
        Define the title of the output file. If no title is provided, the
        default title ``LexStat - Automatic Cognate Judgments`` will be used.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``LexStat`` will be used.

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.msa2html
    lingpy.convert.html.msa2tex
    """
    util.setdefaults(keywords, json="", labels={})

    # open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''
    for block in blocks[1:]:
        lines = block.split('\n')
        m = [l.split('\t') for l in lines]

        # create colordict for different colors
        dc = len(set([l[0] for l in m]))

        if colored:
            colors = {a: b for a, b in zip(
                sorted(set([int(l[0]) for l in m])),
                colorRange(dc, brightness=400),
            )}
        else:
            colors = []
            white = True
            for i in sorted(set([abs(int(l[0])) for l in m])):
                if white:
                    colors.append((i, 'white'))
                    white = False
                else:
                    colors.append((i, 'gray'))
                    white = True
            colors = dict(colors)

        # get the basic item and its id
        iName = m[0][2]
        iID = m[0][3]

        # start writing the stuff to string
        tmp_str += table.format(NAME=iName, ID=iID)
        # define the basic string for the insertion
        bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}'

        for tracer, l in enumerate(m):
            # check whether the current line is a borrowing
            if int(l[0]) < 0:
                loan_line = ' loan'
            else:
                loan_line = ''

            # assign the cognate id
            tmp = ' <td>{0}</td>\n'.format(l[0])
            tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.')))

            # check alignments for confidence scores
            ipa_string = ''.join([cell.split('/')[0] for cell in
                                  l[4:]]).replace('-', '')

            tmp += ' <td>{0}</td>\n'.format(ipa_string)
            tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))])
            tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n'

            # check whether another entry follows that is also an alignment,
            # otherwise, there's no need to display a word as an alignment
            cognate_set = False
            if tracer < len(m) - 1:
                if abs(int(m[tracer + 1][0])) == abs(int(l[0])):
                    cognate_set = True
            if tracer > 0:
                if abs(int(m[tracer - 1][0])) == abs(int(l[0])):
                    cognate_set = True

            # fill out html for the cognate sets
            if cognate_set:
                alm = ''
                for char in l[4:]:
                    # check for confidence scores
                    if '/' in char:
                        try:
                            char, conf, num = char.split('/')
                            conf = int(conf)
                        except ValueError:
                            print(char.split('/'))
                            raise ValueError("Something is wrong with %s." % (char))
                    else:
                        char, conf, rgb = char, (255, 255, 255), 0.0

                    if char == '-':
                        d = 'dolgo_GAP'
                    else:
                        d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                        # bad check for three classes named differently
                        if d == 'dolgo__':
                            d = 'dolgo_X'
                        elif d == 'dolgo_1':
                            d = 'dolgo_TONE'
                        elif d == 'dolgo_0':
                            d = 'dolgo_ERROR'

                    if confidence:
                        alm += ' '
                        alm += '<td class="char {1}" confidence={0} '.format(conf, d)
                        alm += 'char="{0}" '.format(char)
                        alm += 'onclick="' + "show('{0}')".format(num) + '" '
                        alm += 'num="{0}"'.format(num)
                        alm += '>\n {0}\n </td>\n'.format(char)
                    else:
                        alm += ' '
                        alm += '<td class="char {0}">{1}</td>\n'.format(d, char)
            else:
                alm = ' '
                alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))])

            # format the alignment
            try:
                tmp = tmp.format(alm)
            except ValueError:
                raise ValueError("Unknown problem in matching %s and %s." % (alm, tmp))

            # check for last line, where a new line should be inserted (not the
            # fastest solution, but plotting is not a matter of time, and it
            # suffices its current purpose)
            if tracer < len(m) - 1:
                pass
            else:
                if confidence:
                    tmp += ' </table>\n'
                tmp += ' <tr class="empty"><td colspan="4" class="empty">'
                tmp += '<hr class="empty" /></td></tr>\n'

            # format the whole string
            tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1])

    if not title:
        title = "LexStat - Automatic Cognate Judgments"
    if not shorttitle:
        shorttitle = "LexStat"

    # check for json-attribute
    if keywords['json']:
        keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'], indent=1)

    html = html.format(
        shorttitle=shorttitle,
        title=title,
        table=tmp_str,
        dataset=dataset,
        javascript=js,
        css=css,
        **keywords)
    util.write_text_file(filename + '.html', html)
    return
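# A usage sketch for alm2html (the alm-file produced by a LexStat/SCA
# analysis is a placeholder name).
alm2html('kessler.alm', filename='kessler')   # writes 'kessler.html'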
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : dict
        A dictionary object that contains all the information of an MSA
        object.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined, the name
        of the input file will be taken as a default.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be
        found under ``lingpy/data/templates/msa2html.html``.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa', filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html', filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        pid_score = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    pid_score += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        pid_score = int(100 * pid_score / count + 0.5)
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    out = ''
    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    perc = int(80 / len(msa['alignment'][0]) + 0.5)
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    # check for swaps in the alignment
    if 'swaps' in msa:
        swaps = []
        for s in msa['swaps']:
            swaps.extend(s)
    else:
        swaps = []

    # check for locally aligned positions
    local = ['*'] * len(msa['alignment'][0])
    if 'local' in msa:
        local = ['.'] * len(msa['alignment'][0])
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [tokens2class(ipa2tokens(seq), rcParams['asjp'])
                   for seq in msa['seqs']]
        seqs = dict([(a[1], b) for a, b in zip(
            sorted(zip(classes, msa['seqs']), key=lambda x: x[0]),
            range(1, len(msa['seqs']) + 1))])
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # set up a list to store unique alignments
    alignments = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        tmp = ''
        tmp += td_taxon.format(taxon)

        # append alignment to alignments
        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in alignments:
            unique = 'false'
        else:
            unique = 'true'
            alignments += [alignment]

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
                c = '#bbbbbb'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                c = token2class(char, rcParams['_color'])
                # bad check for three classes named differently
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'

            if j in swaps:
                tmp += td_swap.format(char, d)
            elif local[j] != '*':
                tmp += td_unaligned.format(char, d)
            else:
                tmp += td_residue.format(char, d)

        out += tr.format(tmp, unique, taxa[taxon], seqs[sequence])

    html = html.format(
        table=out,
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=len(msa['alignment'][0]),
        table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']
    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        # check, whether the outfile already exists
        util.write_text_file(filename, html)
    else:
        return html
def _export(self, fileformat, sections=None, entries=None, entry_sep='',
            item_sep='', template='', exclude=None, entry_start='',
            entry_close='', **keywords):
    """
    Export a wordlist to various file formats.
    """
    if not sections:
        if fileformat == 'txt':
            sections = dict(
                h1=('concept', '\n# Concept: {0}\n'),
                h2=('cogid', '## Cognate-ID: {0}\n'))
        elif fileformat == 'tex':
            sections = dict(
                h1=('concept', r'\section{{Concept: ``{0}"}}' + '\n'),
                h2=('cogid', r'\subsection{{Cognate Set: ``{0}"}}' + '\n'))
        elif fileformat == 'html':
            sections = dict(
                h1=('concept', '<h1>Concept: {0}</h1>'),
                h2=('cogid', '<h2>Cognate Set: {0}</h2>'))

    if not entries:
        if fileformat == 'txt':
            entries = [('language', '{0} '), ('ipa', '{0}\n')]
        elif fileformat == 'tex':
            entries = [('language', '{0} '), ('ipa', '[{0}]' + '\n')]
        elif fileformat == 'html':
            entries = [('language', '{0} '), ('ipa', '[{0}]\n')]

    util.setdefaults(keywords, filename=rcParams['filename'])

    # get the temporary dictionary
    out = wl2dict(self, sections, entries, exclude)

    # assign the output string
    out_string = ''

    # iterate over the dictionary and start to fill the string
    for key in sorted(out, key=lambda x: str(x).lower()):
        # write key to file
        out_string += key[1]

        # reassign tmp
        tmp = out[key]

        # set the pointer and the index
        pointer = {0: [tmp, sorted(tmp.keys())]}

        while True:
            idx = max(pointer.keys())

            # check for type of current point
            if isinstance(tmp, dict):
                if pointer[idx][1]:
                    next_key = pointer[idx][1].pop()
                    out_string += next_key[1]
                    tmp = pointer[idx][0][next_key]
                    if isinstance(tmp, dict):
                        pointer[idx + 1] = [tmp, sorted(tmp.keys())]
                    else:
                        pointer[idx + 1] = [tmp, tmp]
                else:
                    del pointer[idx]
                    if idx == 0:
                        break
            else:
                tmp_strings = []
                for line in sorted(tmp):
                    tmp_strings += [item_sep.join(line)]
                out_string += entry_start + entry_sep.join(tmp_strings) + entry_close
                tmp = pointer[idx - 1][0]
                del pointer[idx]

    if fileformat == 'tex':
        out_string = out_string.replace('_', r'\_')
    tmpl = util.read_text_file(template) if template else '{0}'
    _write_file(keywords['filename'], tmpl.format(out_string), fileformat)
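# If this private helper is wired up as in lingpy's Wordlist class, where
# a public export method forwards to it, usage looks roughly like this
# (a sketch; the input file is a placeholder).
from lingpy import Wordlist

wl = Wordlist('polynesian.tsv')           # hypothetical wordlist file
wl.export('txt', filename='polynesian')   # concept/cognate-ID grouped text dump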
def _output(self, fileformat, **keywords):
    """
    Internal function that eases its modification by daughter classes.
    """
    # check for stamp attribute
    keywords["stamp"] = getattr(self, '_stamp', '')

    # add the default parameters, they will be checked against the keywords
    util.setdefaults(
        keywords,
        cols=False,
        distances=False,
        entries=("concept", "counterpart"),
        entry='concept',
        fileformat=fileformat,
        filename=rcParams['filename'],
        formatter='concept',
        modify_ref=False,
        meta=self._meta,
        missing=0,
        prettify='false',
        ignore='all',
        ref='cogid',
        rows=False,
        subset=False,  # setup a subset of the data
        taxa='taxa',
        threshold=0.6,  # threshold for flat clustering
        tree_calc='neighbor')

    if fileformat in ['triple', 'triples', 'triples.tsv']:
        return tsv2triple(self, keywords['filename'] + '.' + fileformat)

    if fileformat in ['paps.nex', 'paps.csv']:
        paps = self.get_paps(
            ref=keywords['ref'], entry=keywords['entry'],
            missing=keywords['missing'])
        kw = dict(filename=keywords['filename'] + '.paps')
        if fileformat == 'paps.nex':
            kw['missing'] = keywords['missing']
            return pap2nex(self.cols, paps, **kw)
        return pap2csv(self.cols, paps, **kw)

    # simple printing of taxa
    if fileformat == 'taxa':
        assert hasattr(self, 'taxa')
        return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

    # csv-output
    if fileformat in ['csv', 'qlc', 'tsv']:
        # get the header line
        header = sorted(
            [s for s in set(self._alias.values()) if s in self._header],
            key=lambda x: self._header[x])
        header = [h.upper() for h in header]

        self._meta.setdefault('taxa', self.cols)

        # get the data, in case a subset is chosen
        if not keywords['subset']:
            # write stuff to file
            return wl2qlc(header, self._data, **keywords)

        cols, rows = keywords['cols'], keywords['rows']

        if not isinstance(cols, (list, tuple, bool)):
            raise ValueError("[i] Argument 'cols' should be list or tuple.")
        if not isinstance(rows, (dict, bool)):
            raise ValueError("[i] Argument 'rows' should be a dictionary.")

        # check for chosen header
        if cols:
            # get indices for header
            indices = [self._header[x] for x in cols]
            header = [c.upper() for c in cols]
        else:
            indices = [r for r in range(len(self.header))]

        if rows:
            stmts = []
            for key, value in rows.items():
                if key == 'ID':
                    stmts += ["key " + value]
                else:
                    idx = self._header[key]
                    stmts += ["line[{0}] ".format(idx) + value]

        log.debug("calculated what should be excluded")

        # get the data
        out = {}
        for key, line in self._data.items():
            log.debug(key)
            if rows:
                if eval(" and ".join(stmts)):
                    out[key] = [line[i] for i in indices]
            else:
                out[key] = [line[i] for i in indices]

        log.debug("passing data to wl2qlc")
        return wl2qlc(header, out, **keywords)

    # output dst-format (phylip)
    if fileformat == 'dst':
        # check for distances as keyword
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self, **keywords)
        out = matrix2dst(self._meta['distances'], self.taxa,
                         stamp=keywords['stamp'],
                         taxlen=keywords.get('taxlen', 0))
        return _write_file(keywords['filename'], out, fileformat)

    # output tre-format (newick)
    if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
        if 'tree' not in self._meta:
            # check for distances
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)
            # we look up a function to calculate a tree in the cluster module:
            tree = getattr(cluster, keywords['tree_calc'])(
                self._meta['distances'], self.cols,
                distances=keywords['distances'])
        else:
            tree = self._meta['tree']
        return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

    if fileformat in ['cluster', 'groups']:
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self)
        # check for keywords
        if 'groups' not in self._meta:
            self._meta['groups'] = cluster.matrix2groups(
                keywords['threshold'], self._meta['distances'], self.taxa)
        lines = []
        for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
            lines.append('{0}\t{1}'.format(taxon, group))
        return _write_file(keywords['filename'], lines, fileformat)

    if fileformat in ['starling', 'star.csv']:
        # make lambda inline for data-check
        l = lambda x: ['-' if x == 0 else x][0]
        lines = []
        if 'cognates' not in keywords:
            lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
            for i, concept in enumerate(self.concepts):
                for line in self.get_list(row=concept, entry=keywords['entry']):
                    lines.append(
                        str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                            [l(t) for t in line]))
        else:
            lines.append(
                'ID\tConcept\t' + '\t'.join(
                    ['{0}\t COG'.format(t) for t in self.taxa]))
            for i, concept in enumerate(self.concepts):
                cogs = self.get_list(row=concept, entry=keywords['cognates'])
                for j, line in enumerate(
                        self.get_list(row=concept, entry=keywords['entry'])):
                    part = '\t'.join(
                        '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                    lines.append(util.tabjoin(i + 1, concept, part))
        return _write_file(
            keywords['filename'], lines,
            'starling_' + keywords['entry'] + '.csv')

    if fileformat == 'multistate.nex':
        if not keywords['filename'].endswith('.multistate.nex'):
            keywords['filename'] += '.multistate.nex'
        matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
        return multistate2nex(self.taxa, matrix, keywords['filename'])

    if fileformat == 'separated':
        if not os.path.isdir(keywords['filename']):
            os.mkdir(keywords['filename'])
        for l in self.cols:
            lines = [''] if 'ignore_keys' in keywords else ['ID\t']
            lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
            for key in self.get_list(col=l, flat=True):
                line = [] if 'ignore_keys' in keywords else [key]
                for entry in keywords['entries']:
                    tmp = self[key, entry]
                    if isinstance(tmp, list):
                        tmp = ' '.join([str(x) for x in tmp])
                    line += [tmp]
                lines.append('\t'.join('{0}'.format(x) for x in line))
            _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
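# This dispatcher sits behind the public output method of lingpy's
# wordlist classes, so in practice it is reached like this (a sketch;
# the file name and cognate column are placeholders).
from lingpy import Wordlist

wl = Wordlist('polynesian.tsv')           # hypothetical input with a COGID column
wl.output('tsv', filename='polynesian')   # the csv/qlc/tsv branch above
wl.output('dst', filename='polynesian')   # phylip-style distance matrix
wl.output('tre', filename='polynesian')   # newick tree via the cluster module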
def _list2msa(msa_lines, ids=False, header=True, normalize=False, **keywords):
    """
    Function retrieves a dictionary from a list of MSA strings.
    """
    setdefaults(keywords, seq_id='-', dataset='-', input_file='dummy')
    d = dict(ID=[], taxa=[], alignment=[], seqs=[],
             infile=keywords['input_file'])

    if header:
        start = 2
        d['dataset'] = msa_lines[0]
        d['seq_id'] = msa_lines[1]
    else:
        start = 0
        d['dataset'] = keywords['dataset']
        d['seq_id'] = keywords['seq_id']

    for i, line in enumerate(msa_lines[start:]):
        idx = 1 if ids else 0

        # check for specific id
        if line[0] in ['0', 'LOCAL', 'CROSSED', 'SWAPS', 'MERGE', 'COMPLEX']:
            if line[idx] == 'LOCAL':
                d['local'] = []
                for j, x in enumerate(line[idx + 1:]):
                    if x == '*':
                        d['local'] += [j]
            elif line[idx] in ['CROSSED', 'SWAPS']:
                d['swaps'] = []
                swapline = [x for x in line[idx + 1:]]
                j = 0
                while swapline:
                    x = swapline.pop(0)
                    if x == '+':
                        d['swaps'] += [(j, j + 1, j + 2)]
                        swapline.pop(0)
                        swapline.pop(0)
                        j += 2
                    else:
                        pass
                    j += 1
            elif line[idx] in ['COMPLEX', 'MERGE']:
                d['merge'] = {}
                mergeline = [x for x in line[idx + 1:]]
                k = 0
                merge = False
                for j, m in enumerate(mergeline):
                    if m == '<':
                        merge = True
                    if m == '>':
                        merge = False
                    d['merge'][j] = k
                    if not merge:
                        k += 1
            else:
                d[line[idx].lower()] = line[idx + 1:]
        elif line[0] not in ['LOCAL', 'SWAPS', 'MERGE', 'COMPLEX', '0']:
            if ids:
                try:
                    d['ID'] += [int(line[0])]
                except ValueError:
                    d['ID'] += [line[0]]
            else:
                d["ID"] += [i]
            d["taxa"] += [line[idx].rstrip('.')]
            d["seqs"] += [' '.join([l for l in line[idx + 1:] if l != '-'])]
            d["alignment"] += [line[idx + 1:]]

    # normalize the alignment if the option is chosen
    if normalize:
        d['alignment'] = normalize_alignment(d['alignment'])

    return d
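# A toy call showing the expected input shape (a sketch: after the two
# header strings, every line is already split into fields).
lines = [
    'my-dataset',                      # dataset header
    'woldemort',                       # seq_id header
    ['German.', 'w', 'a', 'l', '-'],   # taxon plus aligned segments
    ['English', 'w', 'o', 'l', '-'],
]
msa = _list2msa(lines)
print(msa['taxa'])        # ['German', 'English']
print(msa['alignment'])   # [['w', 'a', 'l', '-'], ['w', 'o', 'l', '-']]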
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'
    if trees:
        out += '\n# TREES\n' + trees
    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += str(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([str(v) for v in value])
            elif type(value) == int:
                out += '\t' + str(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
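# A minimal direct call (a sketch; normally wl2qlc is reached through
# Wordlist.output('qlc')).
header = ['DOCULECT', 'CONCEPT', 'IPA']
data = {
    1: ['German', 'hand', 'hant'],
    2: ['English', 'hand', 'hænd'],
}
wl2qlc(header, data, filename='toy', formatter='concept')   # writes 'toy.qlc'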
def wl2qlc(
        header,
        data,
        filename='',
        formatter='concept',
        **keywords):
    """
    Write the basic data of a wordlist to file.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, checking for keys
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # check whether serialization works
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    # only write taxa if they are not explicitly ignored
    if taxa and 'taxa' not in keywords['ignore']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                outs = msa2str(v, wordlist=True)
                out += outs
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'
    if trees:
        out += '\n# TREES\n' + trees
    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        # get the line
        line = data[key]

        # check for formatter
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # add the key
        out += text_type(key)

        # add the rest of the values
        for value in line:
            if type(value) == list:
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    # fall back to converting non-string items first
                    out += '\t' + ' '.join([text_type(v) for v in value])
            elif type(value) == int:
                out += '\t' + text_type(value)
            elif type(value) == float:
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
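# A minimal usage sketch for wl2qlc, kept as a doctest-style comment. The
# column names and rows are hypothetical and only illustrate the expected
# shapes: header lists the column names, data maps integer row IDs to value
# lists, and the default formatter groups the output rows by concept.
#
# >>> header = ['DOCULECT', 'CONCEPT', 'IPA']
# >>> data = {1: ['German', 'hand', 'hant'], 2: ['English', 'hand', 'hænd']}
# >>> wl2qlc(header, data, filename='hand')
#
# This should write 'hand.qlc', starting with a '# Wordlist' comment and a
# tab-separated 'ID DOCULECT CONCEPT IPA' header line above the data rows.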
def _list2msa(msa_lines, ids=False, header=True, normalize=False, **keywords):
    """
    Function retrieves a dictionary from a list of MSA strings.
    """
    setdefaults(keywords, seq_id='-', dataset='-', input_file='dummy')
    d = dict(ID=[], taxa=[], alignment=[], seqs=[], infile=keywords['input_file'])

    if header:
        start = 2
        d['dataset'] = msa_lines[0]
        d['seq_id'] = msa_lines[1]
    else:
        start = 0
        d['dataset'] = keywords['dataset']
        d['seq_id'] = keywords['seq_id']

    for i, line in enumerate(msa_lines[start:]):
        idx = 1 if ids else 0

        # check for specific id
        if line[0] in ['0', 'LOCAL', 'CROSSED', 'SWAPS', 'MERGE', 'COMPLEX']:
            if line[idx] == 'LOCAL':
                d['local'] = []
                for j, x in enumerate(line[idx + 1:]):
                    if x == '*':
                        d['local'] += [j]
            elif line[idx] in ['CROSSED', 'SWAPS']:
                d['swaps'] = []
                swapline = list(line[idx + 1:])
                j = 0
                while swapline:
                    x = swapline.pop(0)
                    if x == '+':
                        d['swaps'] += [(j, j + 1, j + 2)]
                        swapline.pop(0)
                        swapline.pop(0)
                        j += 2
                    j += 1
            elif line[idx] in ['COMPLEX', 'MERGE']:
                d['merge'] = {}
                mergeline = list(line[idx + 1:])
                k = 0
                merge = False
                for j, m in enumerate(mergeline):
                    if m == '<':
                        merge = True
                    if m == '>':
                        merge = False
                    d['merge'][j] = k
                    if not merge:
                        k += 1
            else:
                d[line[idx].lower()] = line[idx + 1:]
        elif line[0] not in ['LOCAL', 'SWAPS', 'MERGE', 'COMPLEX', '0']:
            if ids:
                try:
                    d['ID'] += [int(line[0])]
                except ValueError:
                    d['ID'] += [line[0]]
            else:
                d["ID"] += [i]
            d["taxa"] += [line[idx].rstrip('.')]
            d["seqs"] += [' '.join([l for l in line[idx + 1:] if l != '-'])]
            d["alignment"] += [line[idx + 1:]]

    # normalize the alignment if the option is chosen
    if normalize:
        d['alignment'] = normalize_alignment(d['alignment'])

    return d
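# A small usage sketch for _list2msa with hypothetical input. Each element
# of msa_lines is one already-tokenized line of an MSA file: with the
# default header=True, the first two lines carry the dataset name and the
# sequence id, and every following line holds a taxon name followed by its
# aligned segments.
#
# >>> lines = [
# ...     'dummy-dataset',
# ...     'hand',
# ...     ['German.', 'h', 'a', 'n', 't'],
# ...     ['English.', 'h', 'æ', 'n', 'd'],
# ... ]
# >>> msa = _list2msa(lines)
# >>> msa['taxa']
# ['German', 'English']
# >>> msa['seqs'][0]
# 'h a n t'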
def calculate_data(
        wordlist,
        data,
        taxa='taxa',
        concepts='concepts',
        ref='cogid',
        **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods

    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
            wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = wl2dst(
                wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Reference tree has already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])
    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Groups have already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
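# A usage sketch for calculate_data, assuming a lingpy Wordlist whose
# cognate judgments sit in the 'cogid' column (the file name is made up):
#
# >>> from lingpy import Wordlist
# >>> wl = Wordlist('polynesian.qlc')
# >>> calculate_data(wl, 'dst')                      # distance matrix
# >>> calculate_data(wl, 'tree', tree_calc='upgma')  # reference tree
# >>> calculate_data(wl, 'cluster', threshold=0.5)   # flat clusters
#
# Results are stored in wl._meta under 'distances', 'tree', and 'groups';
# repeating the 'tree' or 'cluster' call only logs a warning unless
# force=True is passed.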
def align(self, **keywords):
    """
    Align a pair of sequences or multiple sequence pairs.

    Parameters
    ----------
    gop : int (default=-1)
        The gap opening penalty (GOP).
    scale : float (default=0.5)
        The gap extension penalty (GEP), calculated with help of a scaling
        factor.
    mode : {"global", "local", "overlap", "dialign"}
        The alignment mode, see :evobib:`List2012a` for details.
    factor : float (default=0.3)
        The factor by which matches in identical prosodic position are
        increased.
    restricted_chars : str (default="T_")
        The restricted chars that function as an indicator of syllable or
        morpheme breaks for secondary alignment, see :evobib:`List2012c`
        for details.
    distance : bool (default=False)
        If set to *True*, return the distance instead of the similarity
        score. Distance is calculated using the formula by
        :evobib:`Downey2008`.
    model : {None, ~lingpy.data.model.Model}
        Specify the sound class model that shall be used for the analysis.
        If no model is specified, the default model of :evobib:`List2012a`
        will be used.
    pprint : bool (default=False)
        If set to *True*, the alignments are printed to the screen.

    """
    setdefaults(
        keywords,
        gop=-1,
        scale=0.5,
        mode='global',
        factor=0.3,
        restricted_chars='T_',
        distance=False,
        model=rcParams['sca'],
        pprint=False,
        transform=rcParams['align_transform'])

    # make sure the sound-class model matches the requested one
    if hasattr(self, 'model'):
        if keywords['model'] != self.model:
            self._set_model(**keywords)
    else:
        self._set_model(**keywords)

    # create the alignments array
    self._alignments = calign.align_pairs(
        self.classes,
        self.weights,
        self.prostrings,
        keywords['gop'],
        keywords['scale'],
        keywords['factor'],
        self.scoredict,
        keywords['mode'],
        keywords['restricted_chars'],
        distance=1 if keywords['distance'] else 0)

    # switch back from sound classes to token alignments
    self.alignments = []
    for i, (almA, almB, sim) in enumerate(self._alignments):
        self.alignments.append((
            class2tokens(
                self.tokens[i][0], almA, local=keywords['mode'] == "local"),
            class2tokens(
                self.tokens[i][1], almB, local=keywords['mode'] == "local"),
            sim))

    # print the alignments, if this is chosen
    as_string(self, pprint=keywords['pprint'])
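# A usage sketch for align via the Pairwise API it belongs to (the
# sequences are hypothetical; exact scores depend on the chosen model and
# parameters):
#
# >>> from lingpy import Pairwise
# >>> pair = Pairwise('hant', 'hænd')
# >>> pair.align(mode='global', distance=True, pprint=True)
# >>> almA, almB, dist = pair.alignments[0]  # aligned tokens plus distance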