def tsv2triple(wordlist, outfile=None):
    """
    Function converts a wordlist to a triple data structure.

    Notes
    -----
    The basic values of which the triples consist are:

    * ID (the ID in the TSV file)
    * COLUMN (the column in the TSV file)
    * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([text_type(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore
def tsv2triple(wordlist, outfile=None):
    """
    Function converts a wordlist to a triple data structure.

    Notes
    -----
    The basic values of which the triples consist are:

    * ID (the ID in the TSV file)
    * COLUMN (the column in the TSV file)
    * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([str(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore
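# Hedged usage sketch for tsv2triple (hypothetical file name): the input is a
# lingpy Wordlist-like object, and the result is a list of (ID, COLUMN, VALUE)
# tuples, optionally written to a file.
#
#   from lingpy import Wordlist
#   wl = Wordlist('polynesian.tsv')
#   triples = tsv2triple(wl, outfile='polynesian.triples.tsv')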
def pap2nex(taxa, paps, missing=0, filename='', datatype='STANDARD'):
    """
    Function converts a list of paps into nexus file format.

    Parameters
    ----------
    taxa : list
        List of taxa.
    paps : {list, dict}
        A two-dimensional list with the first dimension being identical to
        the number of paps and the second dimension being identical to the
        number of taxa. If a dictionary is passed, each key represents a
        given pap. The following two structures will thus be treated
        identically::

            >>> paps = [[1,0],[1,0],[1,0]]  # two languages, three paps
            >>> paps = {1:[1,0], 2:[1,0], 3:[1,0]}  # two languages, three paps

    missing : {str, int} (default=0)
        Indicate how missing characters are represented in the original data.
    """
    out = '#NEXUS\n\nBEGIN DATA;\nDIMENSIONS ntax={0} NCHAR={1};\n'
    out += "FORMAT DATATYPE={5} GAP=- MISSING={2} interleave=yes;\n"
    out += "MATRIX\n\n{3}\n;\n\nEND;\n"
    out += "[PAPS-REFERENCE]\n{4}"

    # get longest taxon
    maxTax = max([len(taxon) for taxon in taxa])
    paps_ref = ""

    # check whether paps are dict or list
    if hasattr(paps, 'keys'):
        new_paps = [paps[k] for k in sorted(paps)]
        reference = [k for k in sorted(paps)]
    else:
        new_paps = paps
        reference = [k for k in range(1, len(paps) + 1)]

    # create reference
    ref_string = ''
    for i, ref in enumerate(reference):
        ref_string += '[{0} :: {1}]\n'.format(i, ref)

    # create the matrix
    matrix = ""
    for i, taxon in enumerate(taxa):
        tmp = '{0:XXX} '
        matrix += tmp.replace('XXX', str(maxTax)).format(taxon)
        matrix += ''.join([str(itm[i]) for itm in new_paps])
        matrix += '\n'

    if not filename:
        return out.format(
            len(taxa), len(paps), missing, matrix, ref_string, datatype)
    util.write_text_file(
        filename + '.nex',
        out.format(len(taxa), len(paps), missing, matrix, ref_string, datatype))
    return
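# Minimal usage sketch for pap2nex (hypothetical data): two taxa and three
# presence/absence patterns; without a filename the NEXUS string is returned.
example_taxa = ['German', 'English']
example_paps = {1: [1, 0], 2: [1, 1], 3: [0, 1]}
example_nexus = pap2nex(example_taxa, example_paps)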
def multistate2nex(taxa, matrix, filename='', missing="?"):
    """
    Convert the data in a given wordlist to NEXUS-format for multistate
    analyses in PAUP.

    Parameters
    ----------
    taxa : list
        The list of taxa that shall be written to file.
    matrix : list
        The multi-state matrix with the first dimension indicating the taxa,
        and the second their states.
    filename : str (default="")
        The name of the file to which the data will be written. If left
        empty, a ValueError is raised.
    """
    # set up the nexus template
    nexus = """#NEXUS

BEGIN DATA;
DIMENSIONS ntax={ntax} NCHAR={nchar};
FORMAT RESPECTCASE DATATYPE=STANDARD symbols="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOP0123456789" GAP=? MISSING={missing} interleave=yes;
OPTIONS MSTAXA = POLYMORPH;

MATRIX

{matrix}
END;
"""
    # calculate maximal length of taxon strings
    tlen = max([len(t) for t in taxa])

    # calculate the matrix-text in the nexus template
    matrix_text = ""
    for taxon, line in zip(taxa, matrix):
        ntaxon = taxon + tlen * ' ' + ' '
        ntaxon = ntaxon[:tlen]
        matrix_text += "{0} {1}\n".format(ntaxon, ''.join(line))

    if filename:
        util.write_text_file(
            filename,
            nexus.format(
                ntax=len(taxa),
                nchar=len(matrix[0]),
                matrix=matrix_text,
                missing=missing
            )
        )
    else:
        raise ValueError("[!] A wrong filename was specified!")
    return
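# Hedged usage sketch for multistate2nex (hypothetical data): two taxa with
# three multistate characters each. A non-empty filename is required here,
# since an empty filename raises a ValueError.
#
#   multistate2nex(['German', 'English'],
#                  [['a', 'b', 'c'], ['a', 'd', 'c']],
#                  filename='multistate.nex')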
def _export_score_dict(score_dict):
    """
    Function exports a scoring dictionary to a csv-file.

    @todo: This function can be better ported to another file.
    """
    letters = list(set([key[0] for key in score_dict.keys()]))

    rows = [['+'] + letters]
    for l1 in letters:
        rows.append([l1] + [str(score_dict[(l1, l2)]) for l2 in letters])

    util.write_text_file(
        'score_dict.csv', '\n'.join('\t'.join(row) for row in rows))
def check_stats(models, wordlist, filename='results.txt', pprint=False):
    results = []
    for m in models:
        p, z = tstats(wordlist, m, return_dists=True)
        results += [[m, p, z]]

    txt = ''
    for a, b, c in results:
        txt += '{0}\t{1:.2f}\t{2:.2f}\n'.format(a, b, c)
    as_string(txt, pprint)

    if filename:
        write_text_file(filename, txt)
def test_output(self):
    fpsa = self.tmp_path('test.psa')
    write_text_file(fpsa, '\n')
    psa = PSA(text_type(fpsa))
    fname = text_type(self.tmp_path('test'))
    psa.output(fileformat='psa', filename=fname)

    psq = self.tmp_path('test.psq')
    write_text_file(psq, '\n')
    psa = PSA(text_type(psq))
    fname = text_type(self.tmp_path('test'))
    psa.output(fileformat='psq', filename=fname)
def pap2csv(taxa, paps, filename=''):
    """
    Write paps created by the Wordlist class to a csv-file.
    """
    out = "ID\t" + '\t'.join(taxa) + '\n'
    for key in sorted(paps):
        out += '{0}\t{1}\n'.format(key, '\t'.join(str(i) for i in paps[key]))

    if not filename:
        return out
    util.write_text_file(filename + '.csv', out)
    return
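# Usage sketch for pap2csv (hypothetical data): the same kind of paps
# dictionary as above, rendered as a tab-separated table with one row per pap.
example_csv = pap2csv(['German', 'English'], {1: [1, 0], 2: [1, 1]})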
def test_output(self):
    fpsa = self.tmp_path('test.psa')
    write_text_file(fpsa, '\n')
    psa = PSA(text_type(fpsa))
    fname = text_type(self.tmp_path('test'))
    psa.output(fileformat='psa', filename=fname)

    psq = self.tmp_path('test.psq')
    write_text_file(psq, '\n')
    psa = PSA(text_type(psq))
    fname = text_type(self.tmp_path('test'))
    psa.output(fileformat='psq', filename=fname)

    psa = PSA(text_type(test_data('harry_potter.psa')))
    psa.align()
    psa.output(fileformat="psa", filename=fname, scores=True)
    psa.output(fileformat="psq", filename=fname)
def test_output(tmppath, test_data):
    fpsa = tmppath / 'test.psa'
    write_text_file(fpsa, '\n')
    psa = PSA(str(fpsa))
    fname = str(tmppath / 'test')
    psa.output(fileformat='psa', filename=fname)

    psq = tmppath / 'test.psq'
    write_text_file(psq, '\n')
    psa = PSA(str(psq))
    fname = str(tmppath / 'test')
    psa.output(fileformat='psq', filename=fname)

    psa = PSA(str(test_data / 'harry_potter.psa'))
    psa.align()
    psa.output(fileformat="psa", filename=fname, scores=True)
    psa.output(fileformat="psq", filename=fname)
def matrix2tree(matrix, taxa, tree_calc="neighbor", distances=True, filename=""):
    """
    Calculate a tree of a given distance matrix.

    Parameters
    ----------
    matrix : list
        The distance matrix to be used.
    taxa : list
        A list of the taxa in the distance matrix.
    tree_calc : str (default="neighbor")
        The method for tree calculation that shall be used. Select between:

        * "neighbor": Neighbor-joining method (:evobib:`Saitou1987`)
        * "upgma" : UPGMA method (:evobib:`Sokal1958`)

    distances : bool (default=True)
        If set to c{True}, distances will be included in the
        tree-representation.
    filename : str (default='')
        If a filename is specified, the data will be written to that file.

    Returns
    -------
    tree : ~lingpy.thirdparty.cogent.tree.PhyloNode
        A ~lingpy.thirdparty.cogent.tree.PhyloNode object for handling tree
        files.
    """
    if tree_calc == 'upgma':
        algorithm = cluster.upgma
    elif tree_calc == 'neighbor':
        algorithm = cluster.neighbor
    else:
        raise ValueError(tree_calc)

    tree = cg.LoadTree(treestring=algorithm(matrix, taxa, distances))

    if not filename:
        return tree
    util.write_text_file(filename + '.nwk', text_type(tree))
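# Usage sketch for matrix2tree (hypothetical values): a symmetric distance
# matrix for three taxa, clustered with the default neighbor-joining method.
# Assumes the module context above, which provides `cluster` and `cg`.
#
#   dists = [[0.0, 0.5, 0.7],
#            [0.5, 0.0, 0.3],
#            [0.7, 0.3, 0.0]]
#   tree = matrix2tree(dists, ['German', 'English', 'Dutch'])
#   print(tree)  # newick representation of the tree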
def diff(self, **keywords):
    """
    Write all differences between two sets to a file.

    Parameters
    ----------
    filename : str (default='eval_psa_diff')
        Default
    """
    setdefaults(keywords, filename=self.gold.infile)
    if not keywords['filename'].endswith('.diff'):
        keywords['filename'] = keywords['filename'] + '.diff'

    out = []
    for i, (a, b) in enumerate(zip(self.gold.alignments, self.test.alignments)):
        g1, g2, g3 = a
        t1, t2, t3 = b
        maxL = max([len(g1), len(t1)])

        if g1 != t1 or g2 != t2:
            taxA, taxB = self.gold.taxa[i]
            taxlen = max(len(taxA), len(taxB))
            seq_id = self.gold.seq_ids[i]
            out.append(
                '{0}\n{1}\t{2}\n{3}\t{4}\n{5}\n{1}\t{6}\n{3}\t{7}\n\n'.format(
                    seq_id,
                    taxA,
                    '\t'.join(g1),
                    taxB,
                    '\t'.join(g2),
                    '{0}\t{1}'.format(
                        taxlen * ' ',
                        '\t'.join(['==' for x in range(maxL)])),
                    '\t'.join(t1),
                    '\t'.join(t2),
                ))

    log.file_written(keywords['filename'])
    write_text_file(keywords['filename'], out)
def pap2csv(taxa, paps, filename=''):
    """
    Write paps created by the Wordlist class to a csv-file.
    """
    out = "ID\t" + '\t'.join(taxa) + '\n'
    for key in sorted(paps, key=lambda x: int(re.sub(r'[^0-9]+', '', str(x)))):
        out += '{0}\t{1}\n'.format(
            key,
            '\t'.join(str(i) for i in paps[key])
        )

    if not filename:
        return out
    util.write_text_file(filename + '.csv', out)
    return
def test_write_text_file(tmppath):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = tmppath / 'test'
    util.write_text_file(path, 'test')
    assert util.read_text_file(path) == 'test'

    util.write_text_file(path, ['line1', 'line2'])
    assert len(util.read_text_file(path, lines=True)) == 2

    util.write_text_file(path, lines_generator(5))
    assert len(util.read_text_file(path, lines=True)) == 5
def test_write_text_file(self):
    def lines_generator(n):
        for i in range(n):
            yield 'line%s' % i

    path = self.tmp_path('test')
    util.write_text_file(path, 'test')
    self.assertEqual(util.read_text_file(path), 'test')

    util.write_text_file(path, ['line1', 'line2'])
    self.assertEqual(len(util.read_text_file(path, lines=True)), 2)

    util.write_text_file(path, lines_generator(5))
    self.assertEqual(len(util.read_text_file(path, lines=True)), 5)
def cognate_detection(self, **keywords): """ Method runs a cognate detection analysis. """ kw = dict( align_method='progressive', align_mode=rcParams['align_mode'], align_modes=rcParams['align_modes'], cluster_method=rcParams['lexstat_cluster_method'], cognate_method='sca', cognate_mode='overlap', defaults=False, factor=rcParams['align_factor'], gap_weight=rcParams['gap_weight'], gop=rcParams['align_gop'], iteration=False, lexstat_modes=rcParams['lexstat_modes'], limit=rcParams['lexstat_limit'], merge_vowels=rcParams['merge_vowels'], model=rcParams['sca'], export="html", preprocessing=False, preprocessing_method=rcParams['lexstat_preprocessing_method'], preprocessing_threshold=rcParams[ 'lexstat_preprocessing_threshold'], rands=rcParams['lexstat_rands'], ratio=rcParams['lexstat_ratio'], ref="customid", restricted_chars=rcParams['restricted_chars'], restriction='', runs=rcParams['lexstat_runs'], scale=rcParams['align_scale'], scoring_method=rcParams['lexstat_scoring_method'], swap_check=False, threshold=rcParams['lexstat_threshold'], tree_calc=rcParams['align_tree_calc'], vscale=rcParams['lexstat_vscale'], outfile=False, sonar=True, ) # first load kw.update(keywords) if kw['defaults']: return kw # carry out lexstat cluster analysis self.lex = LexStat(self.infile, **kw) # reset filename if it is not defined kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy' # check for traditional lexstat analysis if kw['cognate_method'] == 'lexstat': self.lex.get_scorer(method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw) self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw) # align the data self.alms = Alignments(self.lex, **kw) kw['scoredict'] = self.lex.cscorer \ if kw['cognate_method'] == 'lexstat' else self.lex.bscorer self.alms.align(method=kw['align_method'], mode=kw['align_mode'], modes=kw['align_modes'], **kw) if 'tsv' in kw['export']: self.alms.output('tsv', filename=kw['outfile'], ignore=['scorer', 'json', 'taxa', 'msa'], **kw) if 'html' in kw['export']: corrs, occs = get_correspondences(self.alms, kw['ref']) # serialize the wordlist wl = {} for concept in self.alms.concepts: entries = self.alms.get_list(concept=concept, flat=True) cogids = [self.alms[idx, kw['ref']] for idx in entries] words = [self.alms[idx, 'ipa'] for idx in entries] alms = [self.alms[idx, 'alignment'] for idx in entries] langs = [self.alms[idx, 'doculect'] for idx in entries] checkalm = lambda x: x if type(x) == str else ' '.join(x) wl[concept] = [ list(k) for k in sorted(zip( langs, [str(x) for x in entries], words, [str(x) for x in cogids], [checkalm(x) for x in alms], ), key=lambda x: int(x[3])) ] # make simple gloss id for internal use as id gloss2id = list( zip(self.alms.concepts, [ str(x) for x in range(1, len(self.alms.concepts) + 1) ])) id2gloss = dict([[b, a] for a, b in gloss2id]) gloss2id = dict(gloss2id) txt = '' txt += 'CORRS = ' + json.dumps(corrs) + ';\n' txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n' txt += 'OCCS = ' + json.dumps(occs) + ';\n' txt += 'WLS = ' + json.dumps(wl) + ';\n' txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n' txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n' txt += 'FILE = "' + kw['outfile'] + '.tsv";\n' tpath = partial(util.data_path, 'templates') tname = 'jcov.{0}.html'.format('remote' if 'remote' in kw['export'] else 'direct') content = util.read_text_file(tpath(tname)) util.write_text_file( kw['outfile'] + '.html', content.format( CORRS=txt, JCOV=util.read_text_file(tpath('jcov.js')), STYLE=util.read_text_file(tpath('jcov.css')), 
VENDOR=util.read_text_file(tpath('jcov.vendor.js')), DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
def compare_conceptlists( list1, list2, output='', match=None, filename='matches', debug=False, **keywords): """ Function compares two concept lists and outputs suggestions for mapping. Notes ----- Idea is to take one conceptlist as the basic list and then to search for a plausible mapping of concepts in the second list to the first list. All suggestions can then be output in various forms, both with multiple matches excluded or included, and in textform or in other forms. What is important, regarding the output here, is, that the output contains all matches, including non-matched items which occur **in the second list but not in the first list**. Non-matched items which occur in the first list but not in the second list are ignored. The syntax for matching types is organized as follows: * 1 indicates a full match between glosses, including information on part speech and the like * 2 indicates a very good match between a full gloss and the main part of a gloss or the two main parts of a gloss * 3 indicates a very good match between the main parts of two glosses with non-matching information regarding part of speech * 4 indicates that the longest part of two glosses matches along with the part-of-speech information. * 5 indicates that the longest part of two glosses matches with non-matching part-of-speech information. * 6 indicates that the longest part of the first list is matched by one of the parts in the second list * 7 indicates that the longest part of the second list is matched by one of the parts in the first list * 8 indicates that no match could be found. """ # check for match quality if not match: match = [1, 2, 3, 4, 5] # check for keywords defaults = dict( id_name='CONCEPTICON_ID', gloss_name='CONCEPTICON_GLOSS', match_quality='MATCH_QUALITY', gloss='GLOSS', number='NUMBER') defaults.update(keywords) # take first list as basic list base = csv2list(list1) comp = csv2list(list2) # get headers baseh, base = base[0], base[1:] comph, comp = comp[0], comp[1:] # make sure to raise if 'gloss' is not in the headers if (not defaults["gloss"] in baseh and not defaults["gloss"] in comph) or \ (not defaults["number"] in baseh and not defaults["number"] in comph): raise ValueError( "[!] 
There is no field for '{0}' or '{1}'".format( keywords['gloss'], keywords['number'] ) + " in the header of the input lists.") # get gloss indices bidx = baseh.index(defaults['gloss']) cidx = comph.index(defaults['gloss']) bnum = baseh.index(defaults['number']) cnum = comph.index(defaults['number']) # extract glossing information from the data B = {} idx = 1 for i, line in enumerate(base): gloss = line[bidx] gdata = parse_gloss(gloss, output='dict') for gdatum in gdata: gdatum['number'] = line[bnum] # we won't need "enumerate" XXX B[idx] = gdatum idx += 1 idx = 1 line2idx = {} C = {} for i, line in enumerate(comp): gloss = line[cidx] gdata = parse_gloss(gloss, output='dict') for gdatum in gdata: gdatum['number'] = line[cnum] # we won't need "enumerate" XXX C[idx] = gdatum try: line2idx[i] += [idx] except KeyError: line2idx[i] = [idx] idx += 1 # now that we have prepared all the glossed list as planned, we compare # them item by item and check for similarity sims = [] for i, a in sorted(B.items()): for j, b in sorted(C.items()): # first-order-match: identical glosses if a['gloss'] == b['gloss']: sims += [(i, j, 1)] # second-order match: identical main-parts elif a['main'] == b['gloss'] or a['gloss'] == b['main'] or \ a['main'] == b['main']: # best match if pos matches if a['pos'] == b['pos']: sims += [(i, j, 2)] # less good match if pos mismatches else: sims += [(i, j, 3)] elif a['longest_part'] == b['longest_part']: if a['pos'] == b['pos'] and a['pos']: sims += [(i, j, 4)] else: sims += [(i, j, 5)] elif b['longest_part'] in a['parts']: sims += [(i, j, 6)] elif a['longest_part'] in b['parts']: sims += [(i, j, 7)] # get the number of items which were not matched in the second list matched = [x[1] for x in sims if x[2] in match] not_matched = [idx_ for idx_ in C if idx_ not in matched] for idx in not_matched: sims += [(0, idx, 8)] # sort the matches, add them to a dictionary best = {} for a, b, c in sims: try: best[b] += [(a, c)] except KeyError: best[b] = [(a, c)] for k, v in best.items(): best[k] = sorted(set(v), key=lambda x: x[1]) if best[k][0][1] in matched: best[k] = [best[k][0]] # prepare the output out = [] for b in best: # in sims: for a, c in best[b]: if c in match: out += [(c, B[a]['gloss'], B[a]['number'], C[b]['gloss'], C[b]['number'])] elif c == 0: out += [(c, '?', '0', C[b]['gloss'], C[b]['number'])] if not output: return out elif output == 'tsv': added = [] txt = ['\t'.join(comph) + '\t{0}\t{1}\t{2}\n'.format( defaults['id_name'], defaults['gloss_name'], defaults['match_quality'])] for i, line in enumerate(comp): for idx in line2idx[i]: if idx in best: data = best[idx] else: data = [('?', '0')] for a, b in data: if b in match or b == 8: try: base_gloss = B[a]['gloss'] base_num = B[a]['number'] except KeyError: base_gloss = '???' base_num = '0' nline = '\t'.join(line) + '\t' + str(base_num) + '\t' + \ base_gloss + '\t' + str(b) + '\n' if nline not in added: txt += [nline] added += [nline] else: nline = '\t'.join(line) + '\t???\t???\t8\n' if nline not in added: txt += [nline] added += [nline] txt[-1] += '\n' out = [txt[0]] + sorted(txt[1:], key=lambda x: x[x.index('\t')]) write_text_file(filename, ''.join(out)) if debug: return sims
def psa2html(infile, **kw): """ Function converts a PSA-file into colored html-format. """ util.setdefaults(kw, template=False, css=False, comment='#', filename=infile[:-4] + '.html', compact=True) template = util.read_text_file(kw['template'] or template_path('psa.html')) css = util.read_text_file(kw['css'] or template_path('psa.css')) data = [] for line in util.read_text_file(infile, lines=True): if not line.startswith(kw['comment']): data.append(line) seq_ids = [] pairs = [] taxa = [] alignments = [] del data[0] i = 0 while i <= len(data) - 3: try: seq_ids.append(data[i]) datA = data[i + 1].split('\t') datB = data[i + 2].split('\t') taxonA = datA[0].strip('.') taxonB = datB[0].strip('.') almA = datA[1:] almB = datB[1:] taxa.append((taxonA, taxonB)) pairs.append(('.'.join([k for k in almA if k != '-']), '.'.join([k for k in almB if k != '-']))) alignments.append( ([str(a) for a in almA], [str(b) for b in almB], 0)) assert len(alignments[-1][0]) == len(alignments[-1][1]) i += 4 except AssertionError: log.warning("Line {0} of the data is probably miscoded.".format(i + 1)) i += 1 def get_classes(alm): classes = [] residue = '<div class="residue {1}">{0}</div>' for j, char in enumerate(alm): if char == '-': d = 'dolgo_GAP' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' classes += [residue.format(char, d)] return ''.join(classes) out = '<table>\n' # codecs.open(kw['filename'], 'w', 'utf-8') for i, (a, b, c) in enumerate(alignments): clsA = get_classes(a) clsB = get_classes(b) ids = int(100 * pid(a, b) + 0.5) out += '<tr class="head">' out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format( i + 1, seq_ids[i], ids) out += '<tr class="psa">' out += '<td class="taxon">{0}</td>'.format(taxa[i][0]) out += '<td class="psa">{0}</td>'.format(clsA) out += '</tr>' out += '<tr class="psa">' out += '<td class="taxon">{0}</td>'.format(taxa[i][1]) out += '<td class="psa">{0}</td>'.format(clsB) out += '</tr>' out += '<tr><td colspan=2></td></tr>' out += '</table>' html = template.format(alignments=out, css=css) if kw['compact']: html = html.replace('\n', ' ') html = re.sub(r'\s+', r' ', html) html = html.replace('> ', '>') html = html.replace(' >', '>') util.write_text_file(kw['filename'], html)
def diff(wordlist, gold='cogid', test='lexstatid', modify_ref=False, pprint=True, filename='', tofile=True, transcription="ipa", concepts=False): r""" Write differences in classifications on an item-basis to file. lex : :py:class:`lingpy.compare.lexstat.LexStat` The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the computation. It should have two columns indicating cognate IDs. gold : str (default='cogid') The name of the column containing the gold standard cognate assignments. test : str (default='lexstatid') The name of the column containing the automatically implemented cognate assignments. modify_ref : function (default=False) Use a function to modify the reference. If your cognate identifiers are numerical, for example, and negative values are assigned as loans, but you want to suppress this behaviour, just set this keyword to "abs", and all cognate IDs will be converted to their absolute value. pprint : bool (default=True) Print out the results filename : str (default='') Name of the output file. If not specified, it is identical with the name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the extension ``diff``. tofile : bool (default=True) If set to c{False}, no data will be written to file, but instead, the data will be returned. transcription : str (default="ipa") The file in which the transcriptions are located (should be a string, no segmentized version, for convenience of writing to file). Returns ------- t : tuple A nested tuple consisting of two further tuples. The first containing precision, recall, and harmonic mean (F-scores), the second containing the same values for the pair-scores. Notes ----- If the **tofile** option is chosen, the results are written to a specific file with the extension ``diff``. This file contains all cognate sets in which there are differences between gold standard and test sets. It also gives detailed information regarding false positives, false negatives, and the words involved in these wrong decisions. 
See also -------- bcubes pairs """ filename = filename or wordlist.filename loan = modify_ref if modify_ref else identity # open file lines = [] # concepts, allow to check scores for only one concept concepts = concepts or [c for c in wordlist.rows] # get a formatter for language names lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}' preT, recT = [], [] preB, recB = [], [] preP, recP = [], [] def get_pairs(cogs, idxs): tmp = defaultdict(list) for x, y in zip(cogs, idxs): tmp[x].append(y) for x in tmp: for yA, yB in combinations(tmp[x], r=2): yield tuple(sorted([yA, yB])) for concept in concepts: idxs = wordlist.get_list(row=concept, flat=True) # get the basic index for all seqs bidx = [i + 1 for i in range(len(idxs))] cogsG = _get_cogs(gold, concept, loan, wordlist) cogsT = _get_cogs(test, concept, loan, wordlist) if cogsG != cogsT: # calculate the transformation distance of the sets tramGT = len(set(zip(cogsG, cogsT))) tramG = len(set(cogsG)) tramT = len(set(cogsT)) preT += [tramT / tramGT] recT += [tramG / tramGT] # calculate the bcubed precision for the sets preB += [_get_bcubed_score(cogsT, cogsG)] # calculate b-cubed recall recB += [_get_bcubed_score(cogsG, cogsT)] # calculate pair precision pairsG = set(get_pairs(cogsG, idxs)) pairsT = set(get_pairs(cogsT, idxs)) preP.append( len(pairsT.intersection(pairsG)) / len(pairsT) if pairsT else 1.0) recP.append( len(pairsT.intersection(pairsG)) / len(pairsG) if pairsG else 1.0) fp = "no" if preP[-1] == 1.0 else "yes" fn = "no" if recP[-1] == 1.0 else "yes" lines.append( "Concept: {0}, False Positives: {1}, False Negatives: {2}". format(concept, fp, fn)) # get the words words = [wordlist[i, 'ipa'] for i in idxs] langs = [wordlist[i, 'taxa'] for i in idxs] # get a word-formater wform = '{0:' + str(max([len(w) for w in words])) + '}' # write differences to file for word, lang, cG, cT in sorted(zip(words, langs, cogsG, cogsT), key=lambda x: (x[2], x[3])): lines.append('{0}\t{1}\t{2:4}\t{3:4}'.format( lform.format(lang), wform.format(word), cG, cT)) lines.append('#') else: preT += [1.0] recT += [1.0] preB += [1.0] recB += [1.0] preP += [1.0] recP += [1.0] bp = sum(preB) / len(preB) br = sum(recB) / len(recB) bf = 2 * (bp * br) / (bp + br) pp = sum(preP) / len(preP) pr = sum(recP) / len(recP) pf = 2 * (pp * pr) / (pp + pr) as_string('\n'.join(lines), pprint=pprint) if tofile: write_text_file(filename + '.diff', lines) return (bp, br, bf), (pp, pr, pf)
def compare_conceptlists( list1, list2, output='', match=None, filename='matches', **keywords): """ Function compares two concept lists and outputs suggestions for mapping. Notes ----- Idea is to take one conceptlist as the basic list and then to search for a plausible mapping of concepts in the second list to the first list. All suggestions can then be output in various forms, both with multiple matches excluded or included, and in textform or in other forms. What is important, regarding the output here, is, that the output contains all matches, including non-matched items which occur **in the second list but not in the first list**. Non-matched items which occur in the first list but not in the second list are ignored. The syntax for matching types is organized as follows: * 1 indicates a full match between glosses, including information on part speech and the like * 2 indicates a very good match between a full gloss and the main part of a gloss or the two main parts of a gloss * 3 indicates a very good match between the main parts of two glosses with non-matching information regarding part of speech * 4 indicates that the longest part of two glosses matches along with the part-of-speech information. * 5 indicates that the longest part of two glosses matches with non-matching part-of-speech information. * 6 indicates that the longest part of the first list is matched by one of the parts in the second list * 7 indicates that the longest part of the second list is matched by one of the parts in the first list * 8 indicates that no match could be found. """ # check for match quality if not match: match = [1, 2, 3, 4, 5] # check for keywords defaults = dict( id_name='CONCEPTICON_ID', gloss_name='CONCEPTICON_GLOSS', match_quality='MATCH_QUALITY', gloss='GLOSS', number='NUMBER') defaults.update(keywords) # take first list as basic list base = csv2list(list1) comp = csv2list(list2) # get headers baseh, base = base[0], base[1:] comph, comp = comp[0], comp[1:] # make sure to raise if 'gloss' is not in the headers if (not defaults["gloss"] in baseh and not defaults["gloss"] in comph) or \ (not defaults["number"] in baseh and not defaults["number"] in comph): raise ValueError( "[!] 
There is no field for '{0}' or '{1}'".format( keywords['gloss'], keywords['number'] ) + " in the header of the input lists.") # get gloss indices bidx = baseh.index(defaults['gloss']) cidx = comph.index(defaults['gloss']) bnum = baseh.index(defaults['number']) cnum = comph.index(defaults['number']) # extract glossing information from the data B = {} idx = 1 for i, line in enumerate(base): gloss = line[bidx] gdata = parse_gloss(gloss, output='dict') for gdatum in gdata: gdatum['number'] = line[bnum] # we won't need "enumerate" XXX B[idx] = gdatum idx += 1 idx = 1 line2idx = {} C = {} for i, line in enumerate(comp): gloss = line[cidx] gdata = parse_gloss(gloss, output='dict') for gdatum in gdata: gdatum['number'] = line[cnum] # we won't need "enumerate" XXX C[idx] = gdatum try: line2idx[i] += [idx] except KeyError: line2idx[i] = [idx] idx += 1 # now that we have prepared all the glossed list as planned, we compare # them item by item and check for similarity sims = [] for i, a in sorted(B.items()): for j, b in sorted(C.items()): # first-order-match: identical glosses if a['gloss'] == b['gloss']: sims += [(i, j, 1)] # second-order match: identical main-parts elif a['main'] == b['gloss'] or a['gloss'] == b['main'] or \ a['main'] == b['main']: # best match if pos matches if a['pos'] == b['pos']: sims += [(i, j, 2)] # less good match if pos mismatches else: sims += [(i, j, 3)] elif a['longest_part'] == b['longest_part']: if a['pos'] == b['pos'] and a['pos']: sims += [(i, j, 4)] else: sims += [(i, j, 5)] elif b['longest_part'] in a['parts']: sims += [(i, j, 6)] elif a['longest_part'] in b['parts']: sims += [(i, j, 7)] # get the number of items which were not matched in the second list matched = [x[1] for x in sims if x[2] in match] not_matched = [idx_ for idx_ in C if idx_ not in matched] for idx in not_matched: sims += [(0, idx, 8)] # sort the matches, add them to a dictionary best = {} for a, b, c in sims: try: best[b] += [(a, c)] except KeyError: best[b] = [(a, c)] for k, v in best.items(): best[k] = sorted(set(v), key=lambda x: x[1]) if best[k][0][1] in matched: best[k] = [best[k][0]] # prepare the output out = [] for b in best: # in sims: for a, c in best[b]: if c in match: out += [(c, B[a]['gloss'], B[a]['number'], C[b]['gloss'], C[b]['number'])] elif c == 0: out += [(c, '?', '0', C[b]['gloss'], C[b]['number'])] if not output: return out elif output == 'tsv': added = [] txt = ['\t'.join(comph) + '\t{0}\t{1}\t{2}\n'.format( defaults['id_name'], defaults['gloss_name'], defaults['match_quality'])] for i, line in enumerate(comp): for idx in line2idx[i]: if idx in best: data = best[idx] else: data = [('?', '0')] for a, b in data: if b in match or b == 8: try: base_gloss = B[a]['gloss'] base_num = B[a]['number'] except KeyError: base_gloss = '???' base_num = '0' nline = '\t'.join(line) + '\t' + str(base_num) + '\t' + \ base_gloss + '\t' + str(b) + '\n' if nline not in added: txt += [nline] added += [nline] else: nline = '\t'.join(line) + '\t???\t???\t8\n' if nline not in added: txt += [nline] added += [nline] txt[-1] += '\n' out = [txt[0]] + sorted(txt[1:], key=lambda x: x[x.index('\t')]) write_text_file(filename, ''.join(out))
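# Hedged usage sketch for compare_conceptlists (hypothetical file names): both
# input lists need a GLOSS and a NUMBER column (or the corresponding keyword
# overrides); with output='tsv' the suggested mapping is written to the given
# file.
#
#   compare_conceptlists('base_list.tsv', 'new_list.tsv',
#                        output='tsv', filename='matches.tsv')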
def compile_model(model, path=None): """ Function compiles customized sound-class models. Parameters ---------- model : str A string indicating the name of the model which shall be created. path : str A string indication the path where the model-folder is stored. Notes ----- A model is defined by a folder placed in :file:`data/models` directory of the LingPy package. The name of the folder reflects the name of the model. It contains three files: the file :file:`converter`, the file :file:`INFO`, and the optional file :file:`scorer`. The format requirements for these files are as follows: :file:`INFO` The ``INFO``-file serves as a reference for a given sound-class model. It can contain arbitrary information (and also be empty). If one wants to define specific characteristics, like the ``source``, the ``compiler``, the ``date``, or a ``description`` of a given model, this can be done by employing a key-value structure in which the key is preceded by an ``@`` and followed by a colon and the value is written right next to the key in the same line, e.g.:: @source: Dolgopolsky (1986) This information will then be read from the ``INFO`` file and rendered when printing the model to screen with help of the :py:func:`print` function. :file:`converter` The ``converter`` file contains all sound classes which are matched with their respective sound values. Each line is reserved for one class, precede by the key (preferably an ASCII-letter) representing the class:: B : ɸ, β, f, p͡f, p͜f, ƀ E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ D : θ, ð, ŧ, þ, đ G : x, ɣ, χ ... :file:`matrix` A scoring matrix indicating the alignment scores of all sound-class characters defined by the model. The scoring is structured as a simple tab-delimited text file. The first cell contains the character names, the following cells contain the scores in redundant form (with both triangles being filled):: B 10.0 -10.0 5.0 ... E -10.0 5.0 -10.0 ... F 5.0 -10.0 10.0 ... ... :file:`scorer` The ``scorer`` file (which is optional) contains the graph of class-transitions which is used for the calculation of the scoring dictionary. Each class is listed in a separate line, followed by the symbols ``v``,``c``, or ``t`` (indicating whether the class represents vowels, consonants, or tones), and by the classes it is directly connected to. The strength of this connection is indicated by digits (the smaller the value, the shorter the path between the classes):: A : v, E:1, O:1 C : c, S:2 B : c, W:2 E : v, A:1, I:1 D : c, S:2 ... The information in such a file is automatically converted into a scoring dictionary (see :evobib:`List2012b` for details). Based on the information provided by the files, a dictionary for the conversion of IPA-characters to sound classes and a scoring dictionary are created and stored as a binary. The model can be loaded with help of the :py:class:`~lingpy.data.model.Model` class and used in the various classes and functions provided by the library. See also -------- lingpy.data.model.Model compile_dvt """ log.info("Compiling model <" + model + ">...") # get the path to the models new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps) log.debug("Model-Path: %s" % new_path) # load the sound classes sound_classes = _import_sound_classes(new_path('converter')) # dump the data cache.dump(sound_classes, model + '.converter') log.info("... 
successfully created the converter.") # try to load the scoring function or the score tree scorer = False if os.path.isfile(new_path('matrix')): scorer = read_scorer(new_path('matrix')) elif os.path.isfile(new_path('scorer')): score_tree = _import_score_tree(new_path('scorer')) # calculate the scoring dictionary score_dict = _make_scoring_dictionary(score_tree) # make score_dict a ScoreDict instance chars = sorted(set([s[0] for s in score_dict.keys()])) matrix = [[0 for i in range(len(chars))] for j in range(len(chars))] for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)): if i < j: matrix[i][j] = score_dict.get((charA, charB), -100) matrix[j][i] = score_dict.get((charB, charA), -100) elif i == j: matrix[i][j] = score_dict[charA, charB] scorer = misc.ScoreDict(chars, matrix) util.write_text_file(new_path('matrix'), scorer2str(scorer)) if scorer: cache.dump(scorer, model + '.scorer') log.info("... successfully created the scorer.") else: log.info("... no scoring dictionary defined.") log.info("Model <" + model + "> was compiled successfully.")
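# Hedged usage sketch for compile_model (assumes a model folder "custom",
# containing at least a `converter` and an `INFO` file, exists under the given
# path):
#
#   compile_model('custom', path='my_models')
#   # the compiled model can then be loaded via lingpy.data.model.Model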
def _graph_or_file(graph, filename):
    if filename:
        util.write_text_file(filename + '.gml', nx.generate_gml(graph))
        return
    return graph
def _output(self, fileformat, **keywords): """ Internal function that eases its modification by daughter classes. """ # check for stamp attribute keywords["stamp"] = getattr(self, '_stamp', '') # add the default parameters, they will be checked against the keywords util.setdefaults( keywords, cols=False, distances=False, entries=("concept", "counterpart"), entry='concept', fileformat=fileformat, filename=rcParams['filename'], formatter='concept', modify_ref=False, meta=self._meta, missing=0, prettify='false', ignore='all', ref='cogid', rows=False, subset=False, # setup a subset of the data, taxa='taxa', threshold=0.6, # threshold for flat clustering tree_calc='neighbor') if fileformat in ['triple', 'triples', 'triples.tsv']: return tsv2triple(self, keywords['filename'] + '.' + fileformat) if fileformat in ['paps.nex', 'paps.csv']: paps = self.get_paps( ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing']) kw = dict(filename=keywords['filename'] + '.paps') if fileformat == 'paps.nex': kw['missing'] = keywords['missing'] return pap2nex(self.cols, paps, **kw) return pap2csv(self.cols, paps, **kw) # simple printing of taxa if fileformat == 'taxa': assert hasattr(self, 'taxa') return util.write_text_file(keywords['filename'] + '.taxa', self.cols) # csv-output if fileformat in ['csv', 'qlc', 'tsv']: # get the header line header = sorted( [s for s in set(self._alias.values()) if s in self._header], key=lambda x: self._header[x]) header = [h.upper() for h in header] self._meta.setdefault('taxa', self.cols) # get the data, in case a subset is chosen if not keywords['subset']: # write stuff to file return wl2qlc(header, self._data, **keywords) cols, rows = keywords['cols'], keywords['rows'] if not isinstance(cols, (list, tuple, bool)): raise ValueError("[i] Argument 'cols' should be list or tuple.") if not isinstance(rows, (dict, bool)): raise ValueError("[i] Argument 'rows' should be a dictionary.") # check for chosen header if cols: # get indices for header indices = [self._header[x] for x in cols] header = [c.upper() for c in cols] else: indices = [r for r in range(len(self.header))] if rows: stmts = [] for key, value in rows.items(): if key == 'ID': stmts += ["key " + value] else: idx = self._header[key] stmts += ["line[{0}] ".format(idx) + value] log.debug("calculated what should be excluded") # get the data out = {} for key, line in self._data.items(): log.debug(key) if rows: if eval(" and ".join(stmts)): out[key] = [line[i] for i in indices] else: out[key] = [line[i] for i in indices] log.debug("passing data to wl2qlc") return wl2qlc(header, out, **keywords) # output dst-format (phylip) if fileformat == 'dst': # check for distances as keyword if 'distances' not in self._meta: self._meta['distances'] = wl2dst(self, **keywords) out = matrix2dst(self._meta['distances'], self.taxa, stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0)) return _write_file(keywords['filename'], out, fileformat) # output tre-format (newick) if fileformat in ['tre', 'nwk']: # ,'cluster','groups']: if 'tree' not in self._meta: # check for distances if 'distances' not in self._meta: self._meta['distances'] = wl2dst(self) # we look up a function to calculate a tree in the cluster module: tree = getattr(cluster, keywords['tree_calc'])( self._meta['distances'], self.cols, distances=keywords['distances']) else: tree = self._meta['tree'] return _write_file(keywords['filename'], '{0}'.format(tree), fileformat) if fileformat in ['cluster', 'groups']: if 'distances' not in self._meta: 
self._meta['distances'] = wl2dst(self) # check for keywords if 'groups' not in self._meta: self._meta['groups'] = cluster.matrix2groups( keywords['threshold'], self._meta['distances'], self.taxa) lines = [] for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]): lines.append('{0}\t{1}'.format(taxon, group)) return _write_file(keywords['filename'], lines, fileformat) if fileformat in ['starling', 'star.csv']: # make lambda inline for data-check l = lambda x: ['-' if x == 0 else x][0] lines = [] if 'cognates' not in keywords: lines.append('ID\tConcept\t' + '\t'.join(self.taxa)) for i, concept in enumerate(self.concepts): for line in self.get_list(row=concept, entry=keywords['entry']): lines.append( str(i + 1) + '\t' + concept + '\t' + '\t'.join( [l(t) for t in line])) else: lines.append( 'ID\tConcept\t' + '\t'.join( ['{0}\t COG'.format(t) for t in self.taxa])) for i, concept in enumerate(self.concepts): cogs = self.get_list(row=concept, entry=keywords['cognates']) for j, line in enumerate( self.get_list(row=concept, entry=keywords['entry'])): part = '\t'.join( '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j])) lines.append(util.tabjoin(i + 1, concept, part)) return _write_file( keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv') if fileformat == 'multistate.nex': if not keywords['filename'].endswith('.multistate.nex'): keywords['filename'] += '.multistate.nex' matrix = wl2multistate(self, keywords['ref'], keywords['missing']) return multistate2nex(self.taxa, matrix, keywords['filename']) if fileformat == 'separated': if not os.path.isdir(keywords['filename']): os.mkdir(keywords['filename']) for l in self.cols: lines = [''] if 'ignore_keys' in keywords else ['ID\t'] lines[0] += '\t'.join(x.upper() for x in keywords['entries']) for key in self.get_list(col=l, flat=True): line = [] if 'ignore_keys' in keywords else [key] for entry in keywords['entries']: tmp = self[key, entry] if isinstance(tmp, list): tmp = ' '.join([str(x) for x in tmp]) line += [tmp] lines.append('\t'.join('{0}'.format(x) for x in line)) _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
def write_nexus(wordlist, mode='mrbayes', filename="mrbayes.nex", ref="cogid", missing="?", gap="-", custom=None, custom_name='lingpy', commands=None, commands_name="mrbayes"): """Write a nexus file for phylogenetic analyses. Parameters ---------- wordlist : lingpy.basic.wordlist.Wordlist A Wordlist object containing cognate IDs. mode : str (default="mrbayes") The name of the output nexus style. Valid values are: * 'MRBAYES': a MrBayes formatted nexus file. * 'SPLITSTREE': a SPLITSTREE formatted nexus file. * 'BEAST': a BEAST formatted nexus file. * 'BEASTWORDS': a BEAST formatted nexus for word-partitioned analyses. * 'TRAITLAB': a TRAITLab formatted nexus. filename : str (default=None) Name of the file to which the nexus file will be written. If set to c{None}, then this function will not write the nexus ontent to a file, but simply return the content as a string. ref: str (default="cogid") Column in which you store the cognate sets in your data. gap : str (default="-") The symbol for gaps (not relevant for linguistic analyses). missing : str (default="?") The symbol for missing characters. custom : list {default=None) This information allows to add custom information to the nexus file, like, for example, the structure of the characters, their original concept, or their type, and it will be written into a custom block in the nexus file. The name of the custom block can be specified with help of the `custom_name` keyword. The content is a list of strings which will be written line by line into the custom block. custom_name : str (default="lingpy") The name of the custom block which will be written to the file. commands : list (default=None) If specified, will write an additional block containing commands for phylogenetic software. The commands are passed as a list, containing strings. The name of the block is given by the keywords commands_name. commands_name : str (default="mrbayes") Determines how the block will be called to which the commands will be written. Returns ------- nexus : str A string containing nexus file output """ templates = { 'BEAST': 'beast.nex', 'BEASTWORDS': 'beast.nex', 'SPLITSTREE': 'splitstree.nex', 'MRBAYES': 'mrbayes.nex', 'TRAITLAB': 'splitstree.nex', } block = "\n\nBEGIN {0};\n{1}\nEND;\n" # template for nexus blocks # check for valid mode mode = mode.upper() if mode not in templates.keys(): raise ValueError("Unknown output mode %s" % mode) # check for valid template template = templates.get(mode) tpath = util.Path(template_path(template)) if tpath.exists: _template = util.read_text_file(tpath.as_posix()) else: # pragma: no cover raise IOError("Unknown template %s" % template) # check that `ref` is a valid column if ref not in wordlist._alias: raise KeyError("Unknown _ref_ column in wordlist '%s'" % ref) # retrieve the matrix matrix = [[] for x in range(wordlist.width)] etd = wordlist.get_etymdict(ref=ref) concepts = sorted( [(cogid, wordlist[[x[0] for x in vals if x][0]][wordlist._rowIdx]) for (cogid, vals) in etd.items()], key=lambda x: (x[1], x[0])) # and missing data.. 
missing_ = { t: [ concept for (cogid, concept) in concepts if concept not in wordlist.get_list(col=t, entry=wordlist._row_name, flat=True) ] for t in wordlist.cols } # add ascertainment character for mode=BEAST if mode == 'BEAST': matrix = [['0'] for m in matrix] # skip the constant sites for traitlab if mode == 'TRAITLAB': concepts = [(i, c) for (i, c) in concepts if not _is_constant(etd[i])] # fill matrix for i, t in enumerate(wordlist.cols): previous = '' for cogid, concept in concepts: if previous != concept: previous = concept # add ascertainment character for mode=BEASTWORDS. Note that if # a given word:language is missing, then its ascertainment # character is the `missing` character. if mode == "BEASTWORDS": matrix[i] += ['0'] if concept not in missing_[t] else [ missing ] matrix[i] += ['1'] if etd[cogid][i] else ['0'] if concept not in \ missing_[t] else [missing] # parse characters into `charsets` (a dict of word=>siteindex positions), # and `chars` (a list of characters). charsets, chars, previous = defaultdict(list), [], '' for i, (cogid, concept) in enumerate(concepts, 1): char = util.nexus_slug(concept) # add label for ascertainment character in BEAST mode if i == 1 and mode == 'BEAST': chars.append("_ascertainment") # add label for per-word ascertainment characters in BEASTWORDS if mode == 'BEASTWORDS' and previous != concept: chars.append("%s_ascertainment" % char) charsets[char].append(len(chars)) # finally add label. chars.append(char) charsets[char].append(len(chars)) previous = concept # create character labels block if needed if mode in ('BEAST', 'BEASTWORDS'): charblock = ",\n".join(["\t%d %s" % o for o in enumerate(chars, 1)]) else: charblock = "" # create charsets block blockname, assumptions = None, "" if mode in ('BEASTWORDS', 'MRBAYES'): charsets = [ "\tcharset %s = %d-%d;" % (c, min(m), max(m)) for (c, m) in charsets.items() ] blockname = 'ASSUMPTIONS' if mode == 'BEASTWORDS' else 'MRBAYES' assumptions = "\n".join(charsets) # commands if commands_name.upper() == blockname and len(assumptions) and commands: # merge commands specified in function call into output blockname assumptions += "\n" + "\n".join("\t%s" % c for c in commands) else: # different commands block set in commands_name. assumptions += block.format(commands_name, '\n'.join(commands)) if commands else '' # convert state matrix to string. _matrix = "" maxtaxlen = max([len(util.nexus_slug(t)) for t in wordlist.cols]) + 1 for i, (taxon, m) in enumerate(zip(wordlist.cols, matrix)): _matrix += str(util.nexus_slug(taxon) + maxtaxlen * ' ')[:maxtaxlen] + ' ' _matrix += ''.join( ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n' _matrix = _matrix.rstrip() # remove trailing # TODO: symbols could be more than "01" but we this function doesn't handle # multistate data so we just specify them here. symbols = '01' text = _template.format( matrix=_matrix, ntax=wordlist.width, nchar=len(matrix[0]), gap=gap, missing=missing, dtype='RESTRICTION' if mode == 'MRBAYES' else 'STANDARD', commands=block.format(blockname, assumptions), custom=block.format(custom_name, '\n'.join(custom)) if custom else '', symbols=symbols, chars=charblock) text = text.replace("\t", " " * 4) # normalise tab-stops for i, (cogid, concept) in enumerate(concepts, 1): text += '\n[MATRIX:{0}=COGID:{1}=CONCEPT:{2}]'.format( i, cogid, concept) if filename: util.write_text_file(filename, text) return text
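# Hedged usage sketch for write_nexus (hypothetical wordlist file): writes a
# MrBayes-style nexus file from the cognate sets stored in the "cogid" column
# and also returns the nexus text.
#
#   from lingpy import Wordlist
#   nex = write_nexus(Wordlist('polynesian.tsv'), mode='MRBAYES',
#                     ref='cogid', filename='polynesian.nex')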
def matrix2dst(matrix, taxa=None, stamp='', filename='', taxlen=10, comment='#'):
    """
    Convert matrix to dst-format.

    Parameters
    ----------
    taxa : {None, list}
        List of taxon names corresponding to the distances. Make sure that
        you only use alphanumeric characters and the underscore for assigning
        the taxon names. Especially avoid the usage of brackets, since this
        will confuse many phylogenetic programs.
    stamp : str (default='')
        Convenience stamp passed as a comment that can be used to indicate
        how the matrix was created.
    filename : str
        If you specify a filename, the data will be written to file.
    taxlen : int (default=10)
        Indicate how long the taxon names are allowed to be. The Phylip
        package only allows taxon names consisting of maximally 10
        characters. Other packages, however, allow more. If Phylip
        compatibility is not important for you and you just want to allow
        for as long taxon names as possible, set this value to 0.
    comment : str (default='#')
        The comment character to be used when adding additional information
        in the "stamp".

    Returns
    -------
    output : {str or file}
        Depending on your settings, this function returns a string in DST
        (=Phylip) format, or a file containing the string.
    """
    if not taxa:
        taxa = ['t_{0}'.format(i + 1) for i in range(len(matrix))]

    out = ' {0}\n'.format(len(taxa))
    for i, taxon in enumerate(taxa):
        # check for zero-taxlen
        if taxlen == 0:
            dummy = '{0}\t'
            idx = len(taxon)
            joinchar = '\t'  # normally in Phylip this is a space
        else:
            dummy = '{0:' + str(taxlen) + '}'
            idx = taxlen + 1
            joinchar = ' '

        out += dummy.format(taxon)[:idx] + joinchar
        out += joinchar.join(['{0:.2f}'.format(d) for d in matrix[i]])
        out += '\n'

    if stamp:
        out += '{1} {0}'.format(stamp, comment)
    if not filename:
        return out
    else:
        util.write_text_file(filename + '.dst', out)
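# Usage sketch for matrix2dst (hypothetical values): a two-taxon distance
# matrix rendered as a Phylip-style distance string; taxlen=0 switches from
# fixed-width names to tab-separated names.
example_dst = matrix2dst(
    [[0.0, 0.5], [0.5, 0.0]], taxa=['German', 'English'], taxlen=0)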
def diff( wordlist, gold='cogid', test='lexstatid', modify_ref=False, pprint=True, filename='', tofile=True, transcription="ipa"): r""" Write differences in classifications on an item-basis to file. Parameters ---------- wordlist : :py:class:`lingpy.compare.lexstat.LexStat` The :py:class:`~lingpy.compare.lexstat.LexStat` class used for the computation. It should have two columns indicating cognate IDs. gold : str (default='cogid') The name of the column containing the gold standard cognate assignments. test : str (default='lexstatid') The name of the column containing the automatically inferred cognate assignments. modify_ref : function (default=False) Use a function to modify the reference. If your cognate identifiers are numerical, for example, and negative values are assigned as loans, but you want to suppress this behaviour, just set this keyword to "abs", and all cognate IDs will be converted to their absolute value. pprint : bool (default=True) Print out the results. filename : str (default='') Name of the output file. If not specified, it is identical with the name of the :py:class:`~lingpy.compare.lexstat.LexStat`, but with the extension ``diff``. tofile : bool (default=True) If set to c{False}, no data will be written to file, but instead, the data will be returned. transcription : str (default="ipa") The column in which the transcriptions are located (these should be plain strings, not segmented versions, for convenience of writing to file). Returns ------- t : tuple A nested tuple consisting of two further tuples. The first containing precision, recall, and harmonic mean (F-scores), the second containing the same values for the pair-scores. Notes ----- If the **tofile** option is chosen, the results are written to a specific file with the extension ``diff``. This file contains all cognate sets in which there are differences between gold standard and test sets. It also gives detailed information regarding false positives, false negatives, and the words involved in these wrong decisions. .. This function also calculates the "transformation" score. This score is .. based on the calculation of steps that are needed to transform one cluster .. for one set of meanings into the other. Ideally, if there are *n* different .. cognate sets covering one gloss in the gold standard, the minimal length of .. a mapping to convert the *m* cognate sets of the test set into the gold standard .. is *n*. In this case, both gold standard and test set are identical. .. However, if gold standard and test set differ, the number of mappings .. necessarily exceeds *m* and *n*. Based on this, the transformation .. precision is defined as :math:`\frac{m}{M}`, where *m* is the number of .. distinct clusters in the test set and *M* is the length of the mapping. .. Accordingly, the recall is defined as :math:`\frac{n}{M}`, where *n* is the .. number of clusters in the gold standard. .. Note that if precision is lower than 1.0, this means there are false .. positive decisions in the test set. Accordingly, a recall lower than 1.0 .. indicates that there are false negative decisions in the test set. .. The drawback of this score is that it is not sensitive regarding the .. distinct number of decisions in which gold standard and test set differ, so .. the recall can be very low although most of the words have been grouped .. accurately. The advantage is that it can be directly interpreted in terms .. of 'false positive/false negative' decisions.
See also -------- bcubes pairs """ filename = filename or wordlist.filename loan = modify_ref if modify_ref else identity # open file lines = [] # get a formatter for language names lform = '{0:' + str(max([len(l) for l in wordlist.cols])) + '}' preT, recT = [], [] preB, recB = [], [] preP, recP = [], [] def get_pairs(cogs, idxs): tmp = defaultdict(list) for x, y in zip(cogs, idxs): tmp[x].append(y) for x in tmp: for yA, yB in combinations(tmp[x], r=2): yield tuple(sorted([yA, yB])) for concept in wordlist.rows: idxs = wordlist.get_list(row=concept, flat=True) # get the basic index for all seqs bidx = [i + 1 for i in range(len(idxs))] cogsG = _get_cogs(gold, concept, loan, wordlist) cogsT = _get_cogs(test, concept, loan, wordlist) if cogsG != cogsT: # calculate the transformation distance of the sets tramGT = len(set(zip(cogsG, cogsT))) tramG = len(set(cogsG)) tramT = len(set(cogsT)) preT += [tramT / tramGT] recT += [tramG / tramGT] # calculate the bcubed precision for the sets preB += [_get_bcubed_score(cogsT, cogsG)] # calculate b-cubed recall recB += [_get_bcubed_score(cogsG, cogsT)] # calculate pair precision pairsG = set(get_pairs(cogsG, idxs)) pairsT = set(get_pairs(cogsT, idxs)) preP.append(len(pairsT.intersection(pairsG)) / len(pairsT) if pairsT else 1.0) recP.append(len(pairsT.intersection(pairsG)) / len(pairsG) if pairsG else 1.0) fp = "no" if preP[-1] == 1.0 else "yes" fn = "no" if recP[-1] == 1.0 else "yes" lines.append("Concept: {0}, False Positives: {1}, False Negatives: {2}".format( concept, fp, fn)) # get the words words = [wordlist[i, 'ipa'] for i in idxs] langs = [wordlist[i, 'taxa'] for i in idxs] # get a word-formater wform = '{0:' + str(max([len(w) for w in words])) + '}' # write differences to file for word, lang, cG, cT in sorted( zip(words, langs, cogsG, cogsT), key=lambda x: (x[2], x[3])): lines.append('{0}\t{1}\t{2:4}\t{3:4}'.format( lform.format(lang), wform.format(word), cG, cT)) lines.append('#') else: preT += [1.0] recT += [1.0] preB += [1.0] recB += [1.0] preP += [1.0] recP += [1.0] bp = sum(preB) / len(preB) br = sum(recB) / len(recB) bf = 2 * (bp * br) / (bp + br) pp = sum(preP) / len(preP) pr = sum(recP) / len(recP) pf = 2 * (pp * pr) / (pp + pr) as_string(_format_results('B-Cubed', bp, br, bf) + \ _format_results('Pair', pp, pr, pf), pprint=pprint) lines.extend([ 'B-Cubed Scores:', 'Precision: {0:.4f}'.format(bp), 'Recall: {0:.4f}'.format(br), 'F-Score: {0:.4f}'.format(bf), '#', 'Pair Scores:', 'Precision: {0:.4f}'.format(pp), 'Recall: {0:.4f}'.format(pr), 'F-Score: {0:.4f}'.format(pf), ]) if tofile: write_text_file(filename + '.diff', lines) if pprint: return (bp, br, bf), (pp, pr, pf), lines else: return (bp, br, bf), (pp, pr, pf)
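To make the evaluation workflow concrete, here is a hedged sketch comparing expert cognate judgments against LexStat output; the data file is made up, and `diff` is assumed to be in scope as defined above.

>>> from lingpy.compare.lexstat import LexStat
>>> lex = LexStat('kessler.tsv')
>>> lex.get_scorer(runs=100)
>>> lex.cluster(method='lexstat', threshold=0.6, ref='lexstatid')
>>> (bp, br, bf), (pp, pr, pf) = diff(lex, gold='cogid', test='lexstatid',
...     pprint=False, tofile=False)

With ``pprint=False`` and ``tofile=False`` nothing is printed or written; the first tuple holds the B-Cubed precision, recall, and F-score, the second the corresponding pair scores.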
def _write_file(filename, content, ext=None): if ext: filename = filename + '.' + ext util.write_text_file(filename, content)
def output(self, dtype, filename=None, labels=None): """ Parameters ---------- dtype : str {"json", "html", "nwk" } Specify the type of the output: * *json*: JSON format, suitable for use in d3. * *nwk*: Newick format (identical with input upon initialization). * *html*: Simple interactive HTML-representation with collapsible nodes. """ if dtype == "json": if filename: with open(filename + "." + dtype, "w") as f: f.write(json.dumps(self._dict, indent=2)) else: return json.dumps(self._dict, indent=2) elif dtype == "html": # make simple label function get_label = lambda x: labels[x] if labels else x start = '<div id="root" class="node-container">root.content</div>' clean_label = lambda x: "".join([y for y in sort_tree(x) if y not in "();"]).replace(",", "_") template = '<div class="node-container"><div id="#node_name:label" class="node-label">#node_label</div><div class="node-content">#node_children:{node}</div></div>' leave = '<div id="#node_leave:label" class="node-leave"><div class="inner_leave">#node_leave</div></div>' txt = ( template.format(node=self.root) .replace("#node_label", get_label(self[self.root]["label"])) .replace("#node_name", clean_label(self.root)) ) # transform function helps to make the transformation with check # for leave or child transform = ( lambda x: template.format(node=x) .replace("#node_label", get_label(self[x]["label"])) .replace("#node_name", clean_label(x)) if not self[x]["leave"] else leave.replace("#node_leave", get_label(x)) ) for i, node in enumerate(self.nodes): # write all children children = self[node]["children"] node_children = "\n".join([transform(child) for child in children]) txt = txt.replace("#node_children:" + node, node_children) # get the templates html = util.read_text_file("lexical_change.html") css = util.read_text_file("lexical_change.css") js = util.read_text_file("lexical_change.js") title = "LingPy Tree Class" html = html.format(STYLE=css, SCRIPT=js, TITLE=title, TREE=txt) filename = filename or "lingpy.basic.newick" util.write_text_file(filename + ".html", html)
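A short sketch of how the `output` method above can be used; `Tree` here stands for whatever class carries this method (a newick-based tree with `root`, `nodes`, and `_dict` attributes), so the constructor call is an assumption.

>>> tree = Tree('((German,English),(Russian,Polish));')
>>> json_string = tree.output('json')
>>> tree.output('json', filename='my_tree')
>>> tree.output('html', filename='my_tree')

The second and third calls write ``my_tree.json`` and ``my_tree.html``, the latter being the interactive representation with collapsible nodes.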
def msa2html(msa, shorttitle='', filename='', template='', **keywords): """ Convert files in ``msa``-format into colored ``html``-format. Parameters ---------- msa : dict A dictionary object that contains all the information of an MSA object. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``SCA`` will be used. filename : str (default="") Define the name of the output file. If no name is defined, the name of the input file will be taken as a default. template : str (default="") The path to the template file. If no name is defined, the basic template will be used. The basic template currently used can be found under ``lingpy/data/templates/msa2html.html``. Examples -------- Load the libary. >>> from lingpy import * Load an ``msq``-file from the test-sets. >>> msa = MSA('harry.msq') Align the data progressively and carry out a check for swapped sites. >>> msa.prog_align() >>> msa.swap_check() >>> print(msa) w o l - d e m o r t w a l - d e m a r - v - l a d i m i r - Save the data to the file ``harry.msa``. >>> msa.output('msa',filename='harry') Save the ``msa``-object as ``html``. >>> msa.output('html',filename='harry') Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.alm2html """ util.setdefaults( keywords, pid_mode=1, stress=rcParams['stress'], css=False, js=False, compact=False, class_sort=True, write_to_file=True, ) # while alm-format can be read from the text-file without problems, # msa-format should be loaded first (once this is already provided), the # loss in speed won't matter much, since output of data is not a daily task # load templates template = template or template_path('msa2html.html') if template == 'js': template = template_path('msa2html.js.html') html = util.read_text_file(template) css = util.read_text_file(keywords['css'] or template_path('msa.css')) js = util.read_text_file(keywords['js'] or template_path('msa.js')) # treat the msa-object as a file and try to load the file if this is the # case if isinstance(msa, string_types): msa = read_msa(msa, **keywords) else: raise ValueError('[!] No filename specified.') # load dataset, etc. dataset = msa['dataset'] # calculate pid score, if it is not passed as argument if 'pid_score' not in keywords: pid_score = 0 count = 0 for i, seqA in enumerate(msa['alignment']): for j, seqB in enumerate(msa['alignment']): if i < j: pid_score += pid(seqA, seqB, mode=keywords['pid_mode']) count += 1 pid_score = int(100 * pid_score / count + 0.5) else: pid_score = keywords['pid_score'] infile = msa['infile'] seq_id = msa['seq_id'] # define the titles etc. 
if not shorttitle: shorttitle = 'SCA' # determine the length of the longest taxon taxl = max([len(t) for t in msa['taxa']]) # format css file css = css.replace('TAXON_LENGTH', str(taxl * 10)) out = '' tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n' td_taxon = '<td class="taxon">{0}</td>' perc = int(80 / len(msa['alignment'][0]) + 0.5) td_residue = '<td class="residue {1}">{0}</td>' td_swap = '<td class="residue swap {1}">{0}</td>' td_unaligned = '<td class="residue noalign {1}">{0}</td>' # check for swaps in the alignment if 'swaps' in msa: swaps = [] for s in msa['swaps']: swaps.extend(s) else: swaps = [] # check for local = ['*'] * len(msa['alignment'][0]) if 'local' in msa: local = ['.'] * len(msa['alignment'][0]) for i in msa['local']: local[i] = '*' # get two sorting schemas for the sequences if keywords['class_sort']: classes = [ tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs'] ] seqs = dict([ (a[1], b) for a, b in zip( sorted( zip(classes, msa['seqs']), key=lambda x: x[0] # list(zip(x[0],x[1])) ), range(1, len(msa['seqs']) + 1)) ]) else: seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1))) taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1))) # set up a list to store unique alignments alignments = [] # start iteration for i, taxon in enumerate(msa['taxa']): tmp = '' tmp += td_taxon.format(taxon) # append alignment to alignments alignment = ''.join(msa['alignment'][i]) sequence = msa['seqs'][i] if alignment in alignments: unique = 'false' else: unique = 'true' alignments += [alignment] for j, char in enumerate(msa['alignment'][i]): if char == '-': d = 'dolgo_GAP' c = '#bbbbbb' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) c = token2class(char, rcParams['_color']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if j in swaps: tmp += td_swap.format(char, d) elif local[j] != '*': tmp += td_unaligned.format(char, d) else: tmp += td_residue.format(char, d) out += tr.format(tmp, unique, taxa[taxon], seqs[sequence]) html = html.format( table=out, dataset=dataset, pid=pid_score, file=infile, sequence=seq_id, shorttitle=shorttitle, width=len(msa['alignment'][0]), table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl), taxa=len(msa['alignment']), uniseqs=len(set(msa['seqs'])), css=css, js=js) if not filename: filename = rcParams['filename'] if not filename.endswith('.html'): filename = filename + '.html' if keywords['compact']: html = html.replace('\n', ' ') html = re.sub(r'\s+', r' ', html) html = html.replace('> ', '>') html = html.replace(' >', '>') if keywords['write_to_file']: # check, whether the outfile already exists util.write_text_file(filename, html) else: return html
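Besides the ``msa.output('html', ...)`` route shown in the docstring above, the converter can also be called directly on an ``msa``-file; the file names are illustrative.

>>> from lingpy.convert.html import msa2html
>>> msa2html('harry.msa', shorttitle='SCA', filename='harry_alignment')
>>> html = msa2html('harry.msa', write_to_file=False)

The first call writes ``harry_alignment.html``; with ``write_to_file=False`` the HTML markup is returned as a string instead.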
def alm2html(infile, title='', shorttitle='', filename='', colored=False, main_template='', table_template='', dataset='', confidence=False, **keywords): """ Convert files in ``alm``-format into colored ``html``-format. Parameters ---------- title : str Define the title of the output file. If no title is provided, the default title ``LexStat - Automatic Cognate Judgments`` will be used. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``LexStat`` will be used. Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.msa2html lingpy.convert.html.msa2tex """ util.setdefaults(keywords, json="", labels={}) # open the infile if not os.path.exists(infile): infile = infile + '.alm' data = util.read_text_file(infile) # create the outfile if not filename: filename = rcParams['filename'] # read in the templates html = util.read_text_file(main_template or template_path('alm2html.html')) if not table_template: table_template = template_path( 'alm2html.table.js.html' if confidence else 'alm2html.table.html') table = util.read_text_file(table_template) css = util.read_text_file(template_path('alm.css')) js = util.read_text_file(template_path('alm.js')) # define a label function for the taxa label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x # check for windows-compatibility data = data.replace(os.linesep, '\n')[:-1] # split the data into blocks blocks = data.split('\n\n') # retrieve the dataset dataset = dataset or blocks[0] # create the outstring tmp_str = '' for block in blocks[1:]: lines = block.split('\n') m = [l.split('\t') for l in lines] # create colordict for different colors dc = len(set([l[0] for l in m])) if colored: colors = { a: b for a, b in zip( sorted(set([int(l[0]) for l in m])), colorRange(dc, brightness=400), ) } else: colors = [] white = True for i in sorted(set([abs(int(l[0])) for l in m])): if white: colors.append((i, 'white')) white = False else: colors.append((i, 'gray')) white = True colors = dict(colors) # get the basic item and its id iName = m[0][2] iID = m[0][3] # start writing the stuff to string tmp_str += table.format(NAME=iName, ID=iID) # define the basic string for the insertion bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}' for tracer, l in enumerate(m): # check whether the current line is a borrowing if int(l[0]) < 0: loan_line = ' loan' else: loan_line = '' # assign the cognate id tmp = ' <td>{0}</td>\n'.format(l[0]) tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.'))) # check alignments for confidence scores ipa_string = ''.join([cell.split('/')[0] for cell in l[4:]]).replace('-', '') tmp += ' <td>{0}</td>\n'.format(ipa_string) tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n' # check whether another entry follows that is also an alignment, # otherwise, there's no need to display a word as an alignment cognate_set = False if tracer < len(m) - 1: if abs(int(m[tracer + 1][0])) == abs(int(l[0])): cognate_set = True if tracer > 0: if abs(int(m[tracer - 1][0])) == abs(int(l[0])): cognate_set = True # fill out html for the cognate sets if cognate_set: alm = '' for char in l[4:]: # check for confidence scores if '/' in char: try: char, conf, num = char.split('/') conf 
= int(conf) except ValueError: print(char.split('/')) raise ValueError("Something is wrong with %s." % (char)) else: char, conf, rgb = char, (255, 255, 255), 0.0 if char == '-': d = 'dolgo_GAP' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if confidence: alm += ' ' alm += '<td class="char {1}" confidence={0} '.format( conf, d) alm += 'char="{0}" '.format(char) alm += 'onclick="' + "show('{0}')".format(num) + '" ' alm += 'num="{0}"'.format(num) alm += '>\n {0}\n </td>\n'.format(char) else: alm += ' ' alm += '<td class="char {0}">{1}</td>\n'.format( d, char) else: alm = ' ' alm += '<td class="{0}">--</td>\n'.format(colors[abs(int( l[0]))]) # format the alignment try: tmp = tmp.format(alm) except ValueError: raise ValueError("Unknown problem in matchin %s and %s." % (alm, tmp)) # check for last line, where a new line should be inserted (not the # fastest solution, but plotting is not a matter of time, and it # suffices it's current purpose if tracer < len(m) - 1: pass else: if confidence: tmp += ' </table>\n' tmp += ' <tr class="empty"><td colspan="4" class="empty">' tmp += '<hr class="empty" /></td></tr>\n' # format the whole string tmp_str += bas.format(colors[abs(int(l[0]))], tmp, loan_line, l[1]) if not title: title = "LexStat - Automatic Cognate Judgments" if not shorttitle: shorttitle = "LexStat" # check for json-attribute if keywords['json']: keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'], indent=1) html = html.format(shorttitle=shorttitle, title=title, table=tmp_str, dataset=dataset, javascript=js, css=css, **keywords) util.write_text_file(filename + '.html', html) return
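A hedged usage sketch for `alm2html`; the ``alm``-file would normally be produced by an `Alignments` object via ``output('alm')``, and the names below are illustrative.

>>> from lingpy.convert.html import alm2html
>>> alm2html('kessler.alm', title='Cognate judgments', shorttitle='Kessler',
...     filename='kessler_cognates', labels={'German': 'DEU', 'English': 'ENG'})

This writes ``kessler_cognates.html``; the optional ``labels`` dictionary replaces language names in the output table.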
def msa2tex(infile, template='', filename='', **keywords): """ Convert an MSA to a tabular representation which can easily be used in LaTeX documents. """ util.setdefaults(keywords, pid_mode=1) # while alm-format can be read from the text-file without problems, # msa-format should be loaded first (once this is already provided), the # loss in speed won't matter much, since output of data is not a daily task # load msa msa = read_msa(infile) ## load templates tex = util.read_text_file(template or template_path('msa.tex')) # calculate pid score, if it is not passed as argument if 'pid_score' not in keywords: pid_score = 0 count = 0 for i, seqA in enumerate(msa['alignment']): for j, seqB in enumerate(msa['alignment']): if i < j: pid_score += pid(seqA, seqB, mode=keywords['pid_mode']) count += 1 pid_score = int(100 * pid_score / count + 0.5) else: pid_score = keywords['pid_score'] dataset = msa['dataset'] infile = msa['infile'] seq_id = msa['seq_id'] # determine the length of the longest taxon taxl = max([len(t) for t in msa['taxa']]) height = len(msa['alignment']) width = len(msa['alignment'][0]) start = r'\tabular{l' + width * 'c' + '}\n' start += r'\bf\ttfamily Taxon & \multicolumn{' + str( width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n' # check for swaps in the alignment if 'swaps' in msa: swaps = [] for s in msa['swaps']: swaps.extend(s) else: swaps = [] body = start for i, taxon in enumerate(msa['taxa']): body += r'\ttfamily ' + taxon.replace('_', r'\_') for j, char in enumerate(msa['alignment'][i]): if char != '-': cls = token2class(char, rcParams['dolgo']) elif char == '-': cls = 'X' if char == '_': char = r'\#' if cls == '_': cls = '2' if j not in swaps: body += r'&\cellcolor{col' + cls + r'}' + char else: if char != '-': body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char else: body += r'&\cellcolor{col' + cls + r'}\bf ' + char body += r'\\' + '\n' body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n' body += r'\endtabular' + '\n' # create the parameters etc. w = 1.5 * width + taxl * 0.25 h = 0.5 * height + 1.0 tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w)) tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h)) # create the rput stuff tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0)) tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0)) # insert the rest tex = tex.replace('<+CONTENT+>', body) # write to file if not filename: filename = 'lingpy-{0}' util.write_text_file(filename + '.tex', tex)
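The LaTeX converter follows the same calling pattern; the file names are again illustrative.

>>> from lingpy.convert.html import msa2tex
>>> msa2tex('harry.msa', filename='harry_alignment')

This writes ``harry_alignment.tex``, which relies on the color definitions provided by the ``msa.tex`` template.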
def matrix2dst(matrix, taxa=None, stamp='', filename='', taxlen=10, comment='#'): """ Convert matrix to dst-format. Parameters ---------- taxa : {None, list} List of taxon names corresponding to the distances. Make sure that you only use alphanumeric characters and the underscore for assigning the taxon names. Especially avoid the usage of brackets, since this will confuse many phylogenetic programs. stamp : str (default='') Convenience stamp passed as a comment that can be used to indicate how the matrix was created. filename : str If you specify a filename, the data will be written to file. taxlen : int (default=10) Indicate how long the taxon names are allowed to be. The Phylip package only allows taxon names consisting of maximally 10 characters. Other packages, however, allow more. If Phylip compatibility is not important for you and you want to allow taxon names of arbitrary length, set this value to 0. comment : str (default = '#') The comment character to be used when adding additional information in the "stamp". Returns ------- output : {str or file} Depending on your settings, this function returns a string in DST (=Phylip) format, or writes the string to a file. """ if not taxa: taxa = ['t_{0}'.format(i + 1) for i in range(len(matrix))] out = ' {0}\n'.format(len(taxa)) for i, taxon in enumerate(taxa): # check for zero-taxlen if taxlen == 0: dummy = '{0}\t' idx = len(taxon) joinchar = '\t' # normally in Phylip this is a space else: dummy = '{0:' + str(taxlen) + '}' idx = taxlen + 1 joinchar = ' ' out += dummy.format(taxon)[:idx] + joinchar out += joinchar.join(['{0:.2f}'.format(d) for d in matrix[i]]) out += '\n' if stamp: out += '{1} {0}'.format(stamp, comment) if not filename: return out else: util.write_text_file(filename + '.dst', out)
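Since `matrix2dst` only needs plain Python lists, a fully self-contained sketch is possible; the taxon names and distances are made up.

>>> matrix = [[0.0, 0.53, 0.72], [0.53, 0.0, 0.31], [0.72, 0.31, 0.0]]
>>> print(matrix2dst(matrix, taxa=['German', 'English', 'Dutch'], stamp='created with LingPy'))
 3
German     0.00 0.53 0.72
English    0.53 0.00 0.31
Dutch      0.72 0.31 0.00
# created with LingPy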
def wl2qlc( header, data, filename='', formatter='concept', **keywords): """ Write the basic data of a wordlist to file. """ util.setdefaults( keywords, ignore=['taxa', 'doculects', 'msa'], fileformat='qlc', prettify=True) if keywords['ignore'] == 'all': keywords['ignore'] = [ 'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json'] formatter = formatter.upper() if not filename: filename = rcParams['filename'] # create output string out = '# Wordlist\n' if keywords['prettify'] else '' # write meta to file meta = keywords.get("meta", {}) kvpairs = {} jsonpairs = {} msapairs = {} trees = {} distances = '' taxa = '' scorer = '' for k, v in meta.items(): # simple key-value-pairs if isinstance(v, (text_type, int)) or k == "tree": kvpairs[k] = v elif k == 'msa' and k not in keywords['ignore']: # go a level deeper, checking for keys for ref in v: if ref not in msapairs: msapairs[ref] = {} for a, b in v[ref].items(): msapairs[ref][a] = b elif k == 'distances': distances = matrix2dst(v, meta['taxa']) elif k in ['taxa', 'doculect', 'taxon', 'doculects']: # we need to find a better solution here, since it is not nice to # have taxa written to json again and again pass elif k == 'trees' and k not in keywords['ignore']: trees = '' for key, value in v.items(): trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value) elif k == 'scorer' and k not in keywords['ignore']: for key, value in v.items(): scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format( key, scorer2str(value), k) else: # check whether serialization works try: json.dumps(v) jsonpairs[k] = v except TypeError: pass if kvpairs and 'meta' not in keywords['ignore']: out += '\n# META\n' if keywords['prettify'] else '' for k, v in sorted(kvpairs.items(), key=lambda x: x[0]): out += '@{0}:{1}\n'.format(k, v) if taxa and keywords['taxa']: out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n' if jsonpairs and 'json' not in keywords['ignore']: out += "@json: " + json.dumps(jsonpairs) + '\n' if msapairs and 'msa' not in keywords['ignore']: for ref in msapairs: out += "\n# MSA reference: {0}\n".format(ref) for k, v in msapairs[ref].items(): if 'consensus' in v: out += '#\n<msa ' out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format( k, ref, ' '.join(v['consensus'])) else: out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref) outs = msa2str(v, wordlist=True) out += outs out += "</msa>\n" if distances and 'distances' not in keywords['ignore']: out += '\n# DISTANCES\n<dst>\n' out += distances + '</dst>\n' if trees: out += '\n# TREES\n' + trees if scorer and 'scorer' not in keywords['ignore']: out += '\n# SCORER\n' + scorer out += '\n# DATA\n' if keywords['prettify'] else '' out += 'ID\t' + '\t'.join(header) + '\n' # check for gloss in header to create nice output format if formatter in header: idx = header.index(formatter) formatter = None sorted_data = sorted(data.keys(), key=lambda x: data[x][idx]) elif len(formatter.split(',')) == 2: idxA, idxB = formatter.split(',') idxA = header.index(idxA) idxB = header.index(idxB) idx = idxA formatter = None sorted_data = sorted(data.keys(), key=lambda x: ( data[x][idxA], data[x][idxB])) else: idx = False formatter = '' sorted_data = sorted(data.keys()) for key in sorted_data: # get the line line = data[key] # check for formatter if idx in range(len(line)): if line[idx] != formatter: out += '#\n' if keywords['prettify'] else '' formatter = line[idx] # add the key out += text_type(key) # add the rest of the values for value in line: if type(value) == list: try: out += '\t' + ' '.join(value) except: out += '\t' + 
' '.join([text_type(v) for v in value]) elif type(value) == int: out += '\t' + text_type(value) elif type(value) == float: out += '\t{0:.4f}'.format(value) elif value is None: out += '\t' else: out += '\t{:}'.format(value) out += '\n' util.write_text_file( filename + '.' + keywords['fileformat'], out + keywords.get('stamp', ''), normalize="NFC") return
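A self-contained sketch of the input `wl2qlc` expects: `header` is the list of upper-case column names and `data` maps numeric IDs to rows; all values are made up.

>>> header = ['DOCULECT', 'CONCEPT', 'IPA', 'COGID']
>>> data = {1: ['German', 'hand', 'hant', 1], 2: ['English', 'hand', 'hænd', 1], 3: ['Russian', 'hand', 'ruka', 2]}
>>> wl2qlc(header, data, filename='hand_data', formatter='concept')

This writes ``hand_data.qlc`` with the rows grouped by concept and separated by ``#`` lines.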
def wl2qlc(header, data, filename='', formatter='concept', **keywords): """ Write the basic data of a wordlist to file. """ util.setdefaults(keywords, ignore=['taxa', 'doculects', 'msa'], fileformat='qlc', prettify=True) if keywords['ignore'] == 'all': keywords['ignore'] = [ 'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json' ] formatter = formatter.upper() if not filename: filename = rcParams['filename'] # create output string out = '# Wordlist\n' if keywords['prettify'] else '' # write meta to file meta = keywords.get("meta", {}) kvpairs = {} jsonpairs = {} msapairs = {} trees = {} distances = '' taxa = '' scorer = '' for k, v in meta.items(): # simple key-value-pairs if isinstance(v, (str, int)) or k == "tree": kvpairs[k] = v elif k == 'msa' and k not in keywords['ignore']: # go a level deeper, checking for keys for ref in v: if ref not in msapairs: msapairs[ref] = {} for a, b in v[ref].items(): msapairs[ref][a] = b elif k == 'distances': distances = matrix2dst(v, meta['taxa']) elif k in ['taxa', 'doculect', 'taxon', 'doculects']: # we need to find a better solution here, since it is not nice to # have taxa written to json again and again pass elif k == 'trees' and k not in keywords['ignore']: trees = '' for key, value in v.items(): trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value) elif k == 'scorer' and k not in keywords['ignore']: for key, value in v.items(): scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format( key, scorer2str(value), k) else: # check whether serialization works try: json.dumps(v) jsonpairs[k] = v except TypeError: pass if kvpairs and 'meta' not in keywords['ignore']: out += '\n# META\n' if keywords['prettify'] else '' for k, v in sorted(kvpairs.items(), key=lambda x: x[0]): out += '@{0}:{1}\n'.format(k, v) if taxa and keywords['taxa']: out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n' if jsonpairs and 'json' not in keywords['ignore']: out += "@json: " + json.dumps(jsonpairs) + '\n' if msapairs and 'msa' not in keywords['ignore']: for ref in msapairs: out += "\n# MSA reference: {0}\n".format(ref) for k, v in msapairs[ref].items(): if 'consensus' in v: out += '#\n<msa ' out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format( k, ref, ' '.join(v['consensus'])) else: out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref) outs = msa2str(v, wordlist=True) out += outs out += "</msa>\n" if distances and 'distances' not in keywords['ignore']: out += '\n# DISTANCES\n<dst>\n' out += distances + '</dst>\n' if trees: out += '\n# TREES\n' + trees if scorer and 'scorer' not in keywords['ignore']: out += '\n# SCORER\n' + scorer out += '\n# DATA\n' if keywords['prettify'] else '' out += 'ID\t' + '\t'.join(header) + '\n' # check for gloss in header to create nice output format if formatter in header: idx = header.index(formatter) formatter = None sorted_data = sorted(data.keys(), key=lambda x: data[x][idx]) elif len(formatter.split(',')) == 2: idxA, idxB = formatter.split(',') idxA = header.index(idxA) idxB = header.index(idxB) idx = idxA formatter = None sorted_data = sorted(data.keys(), key=lambda x: (data[x][idxA], data[x][idxB])) else: idx = False formatter = '' sorted_data = sorted(data.keys()) for key in sorted_data: # get the line line = data[key] # check for formatter if idx in range(len(line)): if line[idx] != formatter: out += '#\n' if keywords['prettify'] else '' formatter = line[idx] # add the key out += str(key) # add the rest of the values for value in line: if type(value) == list: try: out += '\t' + ' '.join(value) except: out += '\t' + ' 
'.join([str(v) for v in value]) elif type(value) == int: out += '\t' + str(value) elif type(value) == float: out += '\t{0:.4f}'.format(value) elif value is None: out += '\t' else: out += '\t{:}'.format(value) out += '\n' util.write_text_file(filename + '.' + keywords['fileformat'], out + keywords.get('stamp', ''), normalize="NFC") return
def pap2nex( taxa, paps, missing=0, filename='' ): """ Function converts a list of paps into nexus file format. Parameters ---------- taxa : list List of taxa. paps : {list, dict} A two-dimensional list with the first dimension being identical to the number of taxa and the second dimension being identical to the number of paps. If a dictionary is passed, each key represents a given pap. The following two structures will thus be treated identically:: >>> paps = [[1,0],[1,0],[1,0]] # two languages, three paps >>> paps = {1:[1,0], 2:[1,0], 3:[1,0]} # two languages, three paps missing : {str, int} (default=0) Indicate how missing characters are represented in the original data. """ out = '#NEXUS\n\nBEGIN DATA;\nDIMENSIONS ntax={0} NCHAR={1};\n' out += "FORMAT DATATYPE=STANDARD GAP=- MISSING={2} interleave=yes;\n" out += "MATRIX\n\n{3}\n;\n\nEND;\n" out += "[PAPS-REFERENCE]\n{4}" # get longest taxon maxTax = max([len(taxon) for taxon in taxa]) paps_ref = "" # check whether paps are dict or list if hasattr(paps, 'keys'): new_paps = [paps[k] for k in sorted(paps)] reference = [k for k in sorted(paps)] else: new_paps = paps reference = [k for k in range(1, len(paps)+1)] # create reference ref_string = '' for i, ref in enumerate(reference): ref_string += '[{0} :: {1}]\n'.format(i, ref) # create the matrix matrix = "" for i, taxon in enumerate(taxa): tmp = '{0:XXX} ' matrix += tmp.replace('XXX', str(maxTax)).format(taxon) matrix += ''.join([str(itm[i]) for itm in new_paps]) matrix += '\n' if not filename: return out.format( len(taxa), len(paps), missing, matrix, ref_string ) util.write_text_file( filename + '.nex', out.format(len(taxa), len(paps), missing, matrix, ref_string)) return
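Because `pap2nex` returns the nexus string when no filename is given, it can be tried directly on toy data; the dictionary below encodes three presence/absence patterns (paps) for two languages.

>>> taxa = ['German', 'English']
>>> paps = {1: [1, 0], 2: [1, 1], 3: [0, 1]}
>>> nexus_string = pap2nex(taxa, paps, missing='?')

The resulting block declares ``ntax=2`` and ``NCHAR=3`` and appends the ``[PAPS-REFERENCE]`` listing that maps matrix columns back to pap identifiers.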
def write_nexus(taxa, matrix, custom=None, custom_name='lingpy', missing="?", gap="-", template="mrbayes.nex", filename="mrbayes.nex", dtype="RESTRICTION", symbols="10", commands=None, commands_name="mrbayes"): """Write a nexus file for phylogenetic analyses. Parameters ---------- taxa : list The taxonomic units in your data. They should be valid taxon names, only consisting of alphanumeric characters and an underscore, usually also not exceeding a length of 15 characters. matrix : list The matrix with the values for each taxon in one separate row. Usually, the matrix contains binary values which can be passed as strings or integers (1 and 0), but missing values are also possible. Given common biological restrictions, each character can only be one ASCII symbol. custom : list (default=None) This keyword allows adding custom information to the nexus file, like, for example, the structure of the characters, their original concept, or their type, and it will be written into a custom block in the nexus file. The name of the custom block can be specified with help of the `custom_name` keyword. The content is a list of strings which will be written line by line into the custom block. custom_name : str (default="lingpy") The name of the custom block which will be written to the file. missing : str (default="?") The symbol for missing characters. gap : str (default="-") The symbol for gaps (not relevant for linguistic analyses). template : str (default="mrbayes.nex") The name of the template file. This file is located in the template/ folder of the LingPy package, but a custom file can be specified by providing the path. dtype : str (default="RESTRICTION") The datatype, which is usually "STANDARD" or "RESTRICTION" in linguistic analyses, with "RESTRICTION" pointing to pure birth-death models. symbols : str (default="10") The symbols used for the characters. commands : list (default=None) If specified, will write an additional block containing commands for phylogenetic software. The commands are passed as a list, containing strings. The name of the block is given by the keyword `commands_name`. commands_name : str (default="mrbayes") Determines how the block will be called to which the commands will be written. """ tpath = util.Path(template_path(template)) if tpath.exists(): _template = util.read_text_file(tpath.as_posix()) else: # treat `template` as the path to a custom template file _template = util.read_text_file(template) _commands = 'BEGIN {0};\n{1}\n\n'.format( commands_name, '\n'.join(commands)) if commands else '' _custom = 'BEGIN {0};\n{1}\n\n'.format(custom_name, '\n'.join(custom)) if custom else '' _matrix = "" mtl = max([len(t) for t in taxa]) + 1 for i, (t, m) in enumerate(zip(taxa, matrix)): _matrix += str(t + mtl * ' ')[:mtl] + ' ' _matrix += ''.join( ['({0})'.format(c) if len(c) > 1 else str(c) for c in m]) + '\n' text = _template.format(matrix=_matrix, ntax=len(taxa), nchar=len(matrix[0]), gap=gap, missing=missing, dtype=dtype, commands=_commands, custom=_custom, symbols=symbols) util.write_text_file(filename, text)
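For the taxa/matrix variant directly above, a small binary matrix suffices; the command block content is only illustrative and the default MrBayes template is used.

>>> taxa = ['German', 'English', 'Dutch']
>>> matrix = [['1', '0', '1'], ['1', '1', '0'], ['0', '1', '1']]
>>> write_nexus(taxa, matrix, filename='binary_characters.nex',
...     commands=['mcmc ngen=100000 samplefreq=100;'])

Each inner list holds the character states of one taxon; the commands are written verbatim into a block named after ``commands_name``.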
def msa2html( msa, shorttitle='', filename='', template='', **keywords ): """ Convert files in ``msa``-format into colored ``html``-format. Parameters ---------- msa : dict A dictionary object that contains all the information of an MSA object. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``SCA`` will be used. filename : str (default="") Define the name of the output file. If no name is defined, the name of the input file will be taken as a default. template : str (default="") The path to the template file. If no name is defined, the basic template will be used. The basic template currently used can be found under ``lingpy/data/templates/msa2html.html``. Examples -------- Load the libary. >>> from lingpy import * Load an ``msq``-file from the test-sets. >>> msa = MSA('harry.msq') Align the data progressively and carry out a check for swapped sites. >>> msa.prog_align() >>> msa.swap_check() >>> print(msa) w o l - d e m o r t w a l - d e m a r - v - l a d i m i r - Save the data to the file ``harry.msa``. >>> msa.output('msa',filename='harry') Save the ``msa``-object as ``html``. >>> msa.output('html',filename='harry') Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.alm2html """ util.setdefaults( keywords, pid_mode=1, stress=rcParams['stress'], css=False, js=False, compact=False, class_sort=True, write_to_file=True, ) # while alm-format can be read from the text-file without problems, # msa-format should be loaded first (once this is already provided), the # loss in speed won't matter much, since output of data is not a daily task # load templates template = template or template_path('msa2html.html') if template == 'js': template = template_path('msa2html.js.html') html = util.read_text_file(template) css = util.read_text_file(keywords['css'] or template_path('msa.css')) js = util.read_text_file(keywords['js'] or template_path('msa.js')) # treat the msa-object as a file and try to load the file if this is the # case if isinstance(msa, string_types): msa = read_msa(msa, **keywords) else: raise ValueError('[!] No filename specified.') # load dataset, etc. dataset = msa['dataset'] # calculate pid score, if it is not passed as argument if 'pid_score' not in keywords: pid_score = 0 count = 0 for i, seqA in enumerate(msa['alignment']): for j, seqB in enumerate(msa['alignment']): if i < j: pid_score += pid(seqA, seqB, mode=keywords['pid_mode']) count += 1 pid_score = int(100 * pid_score / count + 0.5) else: pid_score = keywords['pid_score'] infile = msa['infile'] seq_id = msa['seq_id'] # define the titles etc. 
if not shorttitle: shorttitle = 'SCA' # determine the length of the longest taxon taxl = max([len(t) for t in msa['taxa']]) # format css file css = css.replace('TAXON_LENGTH', str(taxl * 10)) out = '' tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n' td_taxon = '<td class="taxon">{0}</td>' perc = int(80 / len(msa['alignment'][0]) + 0.5) td_residue = '<td class="residue {1}">{0}</td>' td_swap = '<td class="residue swap {1}">{0}</td>' td_unaligned = '<td class="residue noalign {1}">{0}</td>' # check for swaps in the alignment if 'swaps' in msa: swaps = [] for s in msa['swaps']: swaps.extend(s) else: swaps = [] # check for local = ['*'] * len(msa['alignment'][0]) if 'local' in msa: local = ['.'] * len(msa['alignment'][0]) for i in msa['local']: local[i] = '*' # get two sorting schemas for the sequences if keywords['class_sort']: classes = [tokens2class(ipa2tokens(seq), rcParams['asjp']) for seq in msa['seqs']] seqs = dict( [(a[1], b) for a, b in zip( sorted( zip(classes, msa['seqs']), key=lambda x: x[0] # list(zip(x[0],x[1])) ), range(1, len(msa['seqs']) + 1) )] ) else: seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1))) taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1))) # set up a list to store unique alignments alignments = [] # start iteration for i, taxon in enumerate(msa['taxa']): tmp = '' tmp += td_taxon.format(taxon) # append alignment to alignments alignment = ''.join(msa['alignment'][i]) sequence = msa['seqs'][i] if alignment in alignments: unique = 'false' else: unique = 'true' alignments += [alignment] for j, char in enumerate(msa['alignment'][i]): if char == '-': d = 'dolgo_GAP' c = '#bbbbbb' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) c = token2class(char, rcParams['_color']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if j in swaps: tmp += td_swap.format(char, d) elif local[j] != '*': tmp += td_unaligned.format(char, d) else: tmp += td_residue.format(char, d) out += tr.format(tmp, unique, taxa[taxon], seqs[sequence]) html = html.format( table=out, dataset=dataset, pid=pid_score, file=infile, sequence=seq_id, shorttitle=shorttitle, width=len(msa['alignment'][0]), table_width='{0}'.format(len(msa['alignment'][0]) * 50 + 8 * taxl), taxa=len(msa['alignment']), uniseqs=len(set(msa['seqs'])), css=css, js=js ) if not filename: filename = rcParams['filename'] if not filename.endswith('.html'): filename = filename + '.html' if keywords['compact']: html = html.replace('\n', ' ') html = re.sub(r'\s+', r' ', html) html = html.replace('> ', '>') html = html.replace(' >', '>') if keywords['write_to_file']: # check, whether the outfile already exists util.write_text_file(filename, html) else: return html
def alm2html( infile, title='', shorttitle='', filename='', colored=False, main_template='', table_template='', dataset='', confidence=False, **keywords ): """ Convert files in ``alm``-format into colored ``html``-format. Parameters ---------- title : str Define the title of the output file. If no title is provided, the default title ``LexStat - Automatic Cognate Judgments`` will be used. shorttitle : str Define the shorttitle of the ``html``-page. If no title is provided, the default title ``LexStat`` will be used. Notes ----- The coloring of sound segments with respect to the sound class they belong to is based on the definitions given in the ``color`` :py:class:`~lingpy.data.model.Model`. It can easily be changed and adapted. See also -------- lingpy.convert.html.msa2html lingpy.convert.html.msa2tex """ util.setdefaults(keywords, json="", labels={}) # open the infile if not os.path.exists(infile): infile = infile + '.alm' data = util.read_text_file(infile) # create the outfile if not filename: filename = rcParams['filename'] # read in the templates html = util.read_text_file(main_template or template_path('alm2html.html')) if not table_template: table_template = template_path( 'alm2html.table.js.html' if confidence else 'alm2html.table.html') table = util.read_text_file(table_template) css = util.read_text_file(template_path('alm.css')) js = util.read_text_file(template_path('alm.js')) # define a label function for the taxa label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x # check for windows-compatibility data = data.replace(os.linesep, '\n')[:-1] # split the data into blocks blocks = data.split('\n\n') # retrieve the dataset dataset = dataset or blocks[0] # create the outstring tmp_str = '' for block in blocks[1:]: lines = block.split('\n') m = [l.split('\t') for l in lines] # create colordict for different colors dc = len(set([l[0] for l in m])) if colored: colors = {a: b for a, b in zip( sorted(set([int(l[0]) for l in m])), colorRange(dc, brightness=400), )} else: colors = [] white = True for i in sorted(set([abs(int(l[0])) for l in m])): if white: colors.append((i, 'white')) white = False else: colors.append((i, 'gray')) white = True colors = dict(colors) # get the basic item and its id iName = m[0][2] iID = m[0][3] # start writing the stuff to string tmp_str += table.format(NAME=iName, ID=iID) # define the basic string for the insertion bas = ' <tr class="{0}{2} taxon" taxon="{3}">\n{1}' for tracer, l in enumerate(m): # check whether the current line is a borrowing if int(l[0]) < 0: loan_line = ' loan' else: loan_line = '' # assign the cognate id tmp = ' <td>{0}</td>\n'.format(l[0]) tmp += ' <td>{0}</td>\n'.format(label(l[1].strip('.'))) # check alignments for confidence scores ipa_string = ''.join([cell.split('/')[0] for cell in l[4:]]).replace('-', '') tmp += ' <td>{0}</td>\n'.format(ipa_string) tmp += ' <td class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <table class="{0}">\n'.format(colors[abs(int(l[0]))]) tmp += ' <tr>\n{0} </tr>\n </table>\n </td>\n </tr>\n' # check whether another entry follows that is also an alignment, # otherwise, there's no need to display a word as an alignment cognate_set = False if tracer < len(m) - 1: if abs(int(m[tracer + 1][0])) == abs(int(l[0])): cognate_set = True if tracer > 0: if abs(int(m[tracer - 1][0])) == abs(int(l[0])): cognate_set = True # fill out html for the cognate sets if cognate_set: alm = '' for char in l[4:]: # check for confidence scores if '/' in char: try: char, conf, num = char.split('/') conf 
= int(conf) except ValueError: print(char.split('/')) raise ValueError("Something is wrong with %s." % (char)) else: char, conf, rgb = char, (255, 255, 255), 0.0 if char == '-': d = 'dolgo_GAP' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' if confidence: alm += ' ' alm += '<td class="char {1}" confidence={0} '.format( conf, d ) alm += 'char="{0}" '.format(char) alm += 'onclick="' + "show('{0}')".format(num) + '" ' alm += 'num="{0}"'.format(num) alm += '>\n {0}\n </td>\n'.format(char) else: alm += ' ' alm += '<td class="char {0}">{1}</td>\n'.format(d, char) else: alm = ' ' alm += '<td class="{0}">--</td>\n'.format(colors[abs(int(l[0]))]) # format the alignment try: tmp = tmp.format(alm) except ValueError: raise ValueError("Unknown problem in matchin %s and %s." % (alm, tmp)) # check for last line, where a new line should be inserted (not the # fastest solution, but plotting is not a matter of time, and it # suffices it's current purpose if tracer < len(m) - 1: pass else: if confidence: tmp += ' </table>\n' tmp += ' <tr class="empty"><td colspan="4" class="empty">' tmp += '<hr class="empty" /></td></tr>\n' # format the whole string tmp_str += bas.format( colors[abs(int(l[0]))], tmp, loan_line, l[1] ) if not title: title = "LexStat - Automatic Cognate Judgments" if not shorttitle: shorttitle = "LexStat" # check for json-attribute if keywords['json']: keywords['json'] = 'var myjson = ' + json.dumps(keywords['json'], indent=1) html = html.format( shorttitle=shorttitle, title=title, table=tmp_str, dataset=dataset, javascript=js, css=css, **keywords ) util.write_text_file(filename + '.html', html) return
def msa2tex( infile, template='', filename='', **keywords ): """ Convert an MSA to a tabular representation which can easily be used in LaTeX documents. """ util.setdefaults(keywords, pid_mode=1) # while alm-format can be read from the text-file without problems, # msa-format should be loaded first (once this is already provided), the # loss in speed won't matter much, since output of data is not a daily task # load msa msa = read_msa(infile) ## load templates tex = util.read_text_file(template or template_path('msa.tex')) # calculate pid score, if it is not passed as argument if 'pid_score' not in keywords: pid_score = 0 count = 0 for i, seqA in enumerate(msa['alignment']): for j, seqB in enumerate(msa['alignment']): if i < j: pid_score += pid(seqA, seqB, mode=keywords['pid_mode']) count += 1 pid_score = int(100 * pid_score / count + 0.5) else: pid_score = keywords['pid_score'] dataset = msa['dataset'] infile = msa['infile'] seq_id = msa['seq_id'] # determine the length of the longest taxon taxl = max([len(t) for t in msa['taxa']]) height = len(msa['alignment']) width = len(msa['alignment'][0]) start = r'\tabular{l' + width * 'c' + '}\n' start += r'\bf\ttfamily Taxon & \multicolumn{' + str( width) + r'}{l}{\bf\ttfamily Alignment}\\' + '\n' # check for swaps in the alignment if 'swaps' in msa: swaps = [] for s in msa['swaps']: swaps.extend(s) else: swaps = [] body = start for i, taxon in enumerate(msa['taxa']): body += r'\ttfamily ' + taxon.replace('_', r'\_') for j, char in enumerate(msa['alignment'][i]): if char != '-': cls = token2class(char, rcParams['dolgo']) elif char == '-': cls = 'X' if char == '_': char = r'\#' if cls == '_': cls = '2' if j not in swaps: body += r'&\cellcolor{col' + cls + r'}' + char else: if char != '-': body += r'&\cellcolor{col' + cls + r'}\color{white}\bf ' + char else: body += r'&\cellcolor{col' + cls + r'}\bf ' + char body += r'\\' + '\n' body += r'&' + '&'.join([r'\color{white}XXX' for i in range(width)]) + r'\\' + '\n' body += r'\endtabular' + '\n' # create the parameters etc. w = 1.5 * width + taxl * 0.25 h = 0.5 * height + 1.0 tex = tex.replace('<+WIDTH+>', '{0:2f}'.format(w)) tex = tex.replace('<+HEIGHT+>', '{0:2f}'.format(h)) # create the rput stuff tex = tex.replace('<+NEWX+>', '{0:.2f}'.format(w / 2.0)) tex = tex.replace('<+NEWY+>', '{0:.2f}'.format((h - 0.5) / 2.0)) # insert the rest tex = tex.replace('<+CONTENT+>', body) # write to file if not filename: filename = 'lingpy-{0}' util.write_text_file(filename + '.tex', tex)
def cognate_detection(self, **keywords): """ Method runs a cognate detection analysis. """ kw = dict( align_method='progressive', align_mode=rcParams['align_mode'], align_modes=rcParams['align_modes'], cluster_method=rcParams['lexstat_cluster_method'], cognate_method='sca', cognate_mode='overlap', defaults=False, factor=rcParams['align_factor'], gap_weight=rcParams['gap_weight'], gop=rcParams['align_gop'], iteration=False, lexstat_modes=rcParams['lexstat_modes'], limit=rcParams['lexstat_limit'], merge_vowels=rcParams['merge_vowels'], model=rcParams['sca'], export="html", preprocessing=False, preprocessing_method=rcParams['lexstat_preprocessing_method'], preprocessing_threshold=rcParams['lexstat_preprocessing_threshold'], rands=rcParams['lexstat_rands'], ratio=rcParams['lexstat_ratio'], ref="customid", restricted_chars=rcParams['restricted_chars'], restriction='', runs=rcParams['lexstat_runs'], scale=rcParams['align_scale'], scoring_method=rcParams['lexstat_scoring_method'], swap_check=False, threshold=rcParams['lexstat_threshold'], tree_calc=rcParams['align_tree_calc'], vscale=rcParams['lexstat_vscale'], outfile=False, sonar=True, ) # first load kw.update(keywords) if kw['defaults']: return kw # carry out lexstat cluster analysis self.lex = LexStat(self.infile, **kw) # reset filename if it is not defined kw['outfile'] = kw['outfile'] or self.lex.filename + '_lingpy' # check for traditional lexstat analysis if kw['cognate_method'] == 'lexstat': self.lex.get_scorer( method=kw['scoring_method'], modes=kw['lexstat_modes'], **kw) self.lex.cluster(method=kw['cognate_method'], mode=kw['cognate_mode'], **kw) # align the data self.alms = Alignments(self.lex, **kw) kw['scoredict'] = self.lex.cscorer \ if kw['cognate_method'] == 'lexstat' else self.lex.bscorer self.alms.align( method=kw['align_method'], mode=kw['align_mode'], modes=kw['align_modes'], **kw) if 'tsv' in kw['export']: self.alms.output( 'tsv', filename=kw['outfile'], ignore=['scorer', 'json', 'taxa', 'msa'], **kw) if 'html' in kw['export']: corrs, occs = get_correspondences(self.alms, kw['ref']) # serialize the wordlist wl = {} for concept in self.alms.concepts: entries = self.alms.get_list(concept=concept, flat=True) cogids = [self.alms[idx, kw['ref']] for idx in entries] words = [self.alms[idx, 'ipa'] for idx in entries] alms = [self.alms[idx, 'alignment'] for idx in entries] langs = [self.alms[idx, 'doculect'] for idx in entries] checkalm = lambda x: x if type(x) == str else ' '.join(x) wl[concept] = [list(k) for k in sorted( zip( langs, [str(x) for x in entries], words, [str(x) for x in cogids], [checkalm(x) for x in alms], ), key=lambda x: int(x[3]))] # make simple gloss id for internal use as id gloss2id = list( zip( self.alms.concepts, [str(x) for x in range(1, len(self.alms.concepts) + 1)])) id2gloss = dict([[b, a] for a, b in gloss2id]) gloss2id = dict(gloss2id) txt = '' txt += 'CORRS = ' + json.dumps(corrs) + ';\n' txt += 'LANGS = ' + json.dumps(self.alms.taxa) + ';\n' txt += 'OCCS = ' + json.dumps(occs) + ';\n' txt += 'WLS = ' + json.dumps(wl) + ';\n' txt += 'GlossId = ' + json.dumps(gloss2id) + ';\n' txt += 'IdGloss = ' + json.dumps(id2gloss) + ';\n' txt += 'FILE = "' + kw['outfile'] + '.tsv";\n' tpath = partial(util.data_path, 'templates') tname = 'jcov.{0}.html'.format( 'remote' if 'remote' in kw['export'] else 'direct') content = util.read_text_file(tpath(tname)) util.write_text_file( kw['outfile'] + '.html', content.format( CORRS=txt, JCOV=util.read_text_file(tpath('jcov.js')), 
STYLE=util.read_text_file(tpath('jcov.css')), VENDOR=util.read_text_file(tpath('jcov.vendor.js')), DIGHL=util.read_text_file(tpath('jcov.dighl.js'))))
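A sketch of how this method is usually driven; the `Workflow` class name and the input file are assumptions about the surrounding class, which only needs an `infile` attribute pointing to a wordlist file.

>>> wf = Workflow('polynesian.tsv')
>>> wf.cognate_detection(cognate_method='sca', threshold=0.45,
...     export='tsv,html', outfile='polynesian_cognates')

With ``export='tsv,html'`` both the aligned wordlist and the interactive HTML viewer are written, using ``polynesian_cognates`` as the base name.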
def psa2html(infile, **kw): """ Function converts a PSA-file into colored html-format. """ util.setdefaults( kw, template=False, css=False, comment='#', filename=infile[:-4]+'.html', compact=True) template = util.read_text_file(kw['template'] or template_path('psa.html')) css = util.read_text_file(kw['css'] or template_path('psa.css')) data = [] for line in util.read_text_file(infile, lines=True): if not line.startswith(kw['comment']): data.append(line) seq_ids = [] pairs = [] taxa = [] alignments = [] del data[0] i = 0 while i <= len(data) - 3: try: seq_ids.append(data[i]) datA = data[i + 1].split('\t') datB = data[i + 2].split('\t') taxonA = datA[0].strip('.') taxonB = datB[0].strip('.') almA = datA[1:] almB = datB[1:] taxa.append((taxonA, taxonB)) pairs.append( ( '.'.join([k for k in almA if k != '-']), '.'.join([k for k in almB if k != '-']) ) ) alignments.append( ( [str(a) for a in almA], [str(b) for b in almB], 0) ) assert len(alignments[-1][0]) == len(alignments[-1][1]) i += 4 except AssertionError: log.warning("Line {0} of the data is probably miscoded.".format(i + 1)) i += 1 def get_classes(alm): classes = [] residue = '<div class="residue {1}">{0}</div>' for j, char in enumerate(alm): if char == '-': d = 'dolgo_GAP' else: d = 'dolgo_' + token2class(char, rcParams['dolgo']) # bad check for three classes named differently if d == 'dolgo__': d = 'dolgo_X' elif d == 'dolgo_1': d = 'dolgo_TONE' elif d == 'dolgo_0': d = 'dolgo_ERROR' classes += [residue.format(char, d)] return ''.join(classes) out = '<table>\n' # codecs.open(kw['filename'], 'w', 'utf-8') for i, (a, b, c) in enumerate(alignments): clsA = get_classes(a) clsB = get_classes(b) ids = int(100 * pid(a, b) + 0.5) out += '<tr class="head">' out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format( i + 1, seq_ids[i], ids ) out += '<tr class="psa">' out += '<td class="taxon">{0}</td>'.format(taxa[i][0]) out += '<td class="psa">{0}</td>'.format(clsA) out += '</tr>' out += '<tr class="psa">' out += '<td class="taxon">{0}</td>'.format(taxa[i][1]) out += '<td class="psa">{0}</td>'.format(clsB) out += '</tr>' out += '<tr><td colspan=2></td></tr>' out += '</table>' html = template.format(alignments=out, css=css) if kw['compact']: html = html.replace('\n', ' ') html = re.sub(r'\s+', r' ', html) html = html.replace('> ', '>') html = html.replace(' >', '>') util.write_text_file(kw['filename'], html)
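Finally, a minimal sketch for the PSA converter; the input file is illustrative and would normally come from a pairwise alignment run.

>>> psa2html('harry.psa')
>>> psa2html('harry.psa', filename='harry_pairwise.html')

The first call derives the output name from the input (``harry.html``), the second writes to the explicitly given file.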