def test_msa2str(self):
    """Check that msa2str renders all supported MSA representations identically."""
    aranger = '{body}{meta}'

    # load the same alignment three different ways
    msa_obj = lingpy.MSA(test_data('harry.msa'))                  # classic object
    msa_dict = lingpy.read.qlc.read_msa(test_data('harry.msa'))   # plain dict
    msa_ids = lingpy.read.qlc.read_msa(
        test_data('harry_with_ids.msa'), ids=True, header=False)

    # align dataset/seq_id metadata so the rendered strings can match
    msa_ids['seq_id'] = 'test'
    msa_ids['dataset'] = 'file'

    # identical body/meta arrangement must yield identical strings
    rendered_obj = msa2str(msa_obj, _arange=aranger)
    rendered_dict = msa2str(msa_dict, _arange=aranger)
    rendered_ids = msa2str(msa_ids, _arange=aranger, wordlist=False)
    assert rendered_obj == rendered_dict == rendered_ids

    # merge=True and merge=False must agree up to the COLUMNID section
    with_merge = msa2str(msa_ids, _arange=aranger, wordlist=True, merge=True)
    without_merge = msa2str(msa_ids, _arange=aranger, wordlist=True, merge=False)
    flat_merge = with_merge.replace('\t', '')
    flat_plain = without_merge.replace('\t', '')
    cut = flat_merge.index('COLUMNID')
    assert with_merge != without_merge and flat_merge[:cut] == flat_plain[:cut]

    # attaching a consensus row keeps both variants in sync
    msa_dict['consensus'] = lingpy.align.sca.get_consensus(
        lingpy.align.sca.MSA(msa_dict), gaps=True)
    msa_ids['consensus'] = lingpy.align.sca.get_consensus(
        lingpy.align.sca.MSA(msa_ids), gaps=True)
    assert msa2str(msa_dict) == msa2str(msa_ids, wordlist=False)
def test_msa2str(self):
    """msa2str output must not depend on how the MSA data was loaded."""
    aranger = '{body}{meta}'

    # three input variants: MSA object, plain dict, and a dict read with IDs
    msa_a = MSA(test_data('harry.msa'))
    msa_b = qlc.read_msa(test_data('harry.msa'))
    msa_c = qlc.read_msa(
        test_data('harry_with_ids.msa'), ids=True, header=False)

    # normalise metadata; otherwise the outputs cannot be compared
    msa_c['seq_id'] = 'test'
    msa_c['dataset'] = 'file'

    # same arrangement string -> same rendered output for all three
    outputs = [
        msa2str(msa_a, _arange=aranger),
        msa2str(msa_b, _arange=aranger),
        msa2str(msa_c, _arange=aranger, wordlist=False),
    ]
    assert outputs[0] == outputs[1] == outputs[2]

    # merged vs. unmerged wordlist output differs, but only after 'COLUMNID'
    merged = msa2str(msa_c, _arange=aranger, wordlist=True, merge=True)
    unmerged = msa2str(msa_c, _arange=aranger, wordlist=True, merge=False)
    merged_no_tabs = merged.replace('\t', '')
    unmerged_no_tabs = unmerged.replace('\t', '')
    boundary = merged_no_tabs.index('COLUMNID')
    assert merged != unmerged
    assert merged_no_tabs[:boundary] == unmerged_no_tabs[:boundary]

    # a consensus row must not break the dict/ids equivalence
    msa_b['consensus'] = get_consensus(MSA(msa_b), gaps=True)
    msa_c['consensus'] = get_consensus(MSA(msa_c), gaps=True)
    assert msa2str(msa_b) == msa2str(msa_c, wordlist=False)
def test_msa2str(self):
    """Verify msa2str equivalence across loading paths, merging, and consensus."""
    aranger = '{body}{meta}'

    # the same file loaded as an object and as a dict, plus an IDs variant
    via_object = lingpy.MSA(test_data('harry.msa'))
    via_reader = lingpy.read.qlc.read_msa(test_data('harry.msa'))
    via_ids = lingpy.read.qlc.read_msa(
        test_data('harry_with_ids.msa'), ids=True, header=False)

    # adjust dataset and seq_id, otherwise the outputs cannot be identical
    via_ids['seq_id'] = 'test'
    via_ids['dataset'] = 'file'

    # all three renderings with the same arrangement must coincide
    first = msa2str(via_object, _arange=aranger)
    second = msa2str(via_reader, _arange=aranger)
    third = msa2str(via_ids, _arange=aranger, wordlist=False)
    assert first == second == third

    # the merge flag changes the output, but only past the 'COLUMNID' marker
    merged_out = msa2str(via_ids, _arange=aranger, wordlist=True, merge=True)
    plain_out = msa2str(via_ids, _arange=aranger, wordlist=True, merge=False)
    merged_stripped = merged_out.replace('\t', '')
    plain_stripped = plain_out.replace('\t', '')
    marker = merged_stripped.index('COLUMNID')
    assert merged_out != plain_out
    assert merged_stripped[:marker] == plain_stripped[:marker]

    # consensus rows added to both dicts keep the renderings equal
    via_reader['consensus'] = lingpy.align.sca.get_consensus(
        lingpy.align.sca.MSA(via_reader), gaps=True)
    via_ids['consensus'] = lingpy.align.sca.get_consensus(
        lingpy.align.sca.MSA(via_ids), gaps=True)
    assert msa2str(via_reader) == msa2str(via_ids, wordlist=False)
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : sequence of str
        Column names, written after the leading ID column.
    data : dict
        Maps row keys to row value sequences (one value per header column).
    filename : str
        Output path without extension; falls back to rcParams['filename'].
    formatter : str
        Header column name (or a "colA,colB" pair) used to sort rows and to
        insert '#' group separators whenever the column value changes.
    keywords : dict
        Recognised keys include 'ignore' (list of sections to skip, or
        'all'), 'fileformat', 'prettify', 'meta', and 'stamp'.

    Returns
    -------
    None — the result is written to ``filename + '.' + fileformat``.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, collecting per-reference alignments
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # anything else goes to the @json line, if it serializes
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    # NOTE(review): `taxa` is never assigned above, so this branch is
    # currently dead; kept for compatibility with the original flow.
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                out += msa2str(v, wordlist=True)
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'
    if trees:
        out += '\n# TREES\n' + trees
    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(
            data.keys(), key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        line = data[key]
        # NOTE(review): when idx is False it compares equal to 0, so this
        # check is truthy for any non-empty line and groups on column 0;
        # behavior preserved from the original — confirm whether intended.
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # the key starts the row, followed by one tab-separated cell per value
        out += str(key)
        for value in line:
            if isinstance(value, list):
                # BUG FIX: was a bare `except:` that swallowed every error;
                # ' '.join only raises TypeError for non-string items
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([str(v) for v in value])
            elif isinstance(value, int):
                out += '\t' + str(value)
            elif isinstance(value, float):
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return
def wl2qlc(
        header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    Parameters
    ----------
    header : sequence of str
        Column names, written after the leading ID column.
    data : dict
        Maps row keys to row value sequences (one value per header column).
    filename : str
        Output path without extension; falls back to rcParams['filename'].
    formatter : str
        Header column name (or a "colA,colB" pair) used to sort rows and to
        insert '#' group separators whenever the column value changes.
    keywords : dict
        Recognised keys include 'ignore' (list of sections to skip, or
        'all'), 'fileformat', 'prettify', 'meta', and 'stamp'.

    Returns
    -------
    None — the result is written to ``filename + '.' + fileformat``.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # create output string
    out = '# Wordlist\n' if keywords['prettify'] else ''

    # write meta to file
    meta = keywords.get("meta", {})

    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = {}
    distances = ''
    taxa = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs (text_type keeps py2/py3 compatibility)
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in keywords['ignore']:
            # go a level deeper, collecting per-reference alignments
            for ref in v:
                if ref not in msapairs:
                    msapairs[ref] = {}
                for a, b in v[ref].items():
                    msapairs[ref][a] = b
        elif k == 'distances':
            distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in keywords['ignore']:
            trees = ''
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in keywords['ignore']:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # anything else goes to the @json line, if it serializes
            try:
                json.dumps(v)
                jsonpairs[k] = v
            except TypeError:
                pass

    if kvpairs and 'meta' not in keywords['ignore']:
        out += '\n# META\n' if keywords['prettify'] else ''
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out += '@{0}:{1}\n'.format(k, v)
    # NOTE(review): `taxa` is never assigned above, so this branch is
    # currently dead; kept for compatibility with the original flow.
    if taxa and keywords['taxa']:
        out += '\n# TAXA\n<taxa>\n' + taxa + '\n</taxa>\n'
    if jsonpairs and 'json' not in keywords['ignore']:
        out += "@json: " + json.dumps(jsonpairs) + '\n'
    if msapairs and 'msa' not in keywords['ignore']:
        for ref in msapairs:
            out += "\n# MSA reference: {0}\n".format(ref)
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out += '#\n<msa '
                    out += 'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                        k, ref, ' '.join(v['consensus']))
                else:
                    out += '#\n<msa id="{0}" ref="{1}">\n'.format(k, ref)
                out += msa2str(v, wordlist=True)
                out += "</msa>\n"
    if distances and 'distances' not in keywords['ignore']:
        out += '\n# DISTANCES\n<dst>\n'
        out += distances + '</dst>\n'
    if trees:
        out += '\n# TREES\n' + trees
    if scorer and 'scorer' not in keywords['ignore']:
        out += '\n# SCORER\n' + scorer

    out += '\n# DATA\n' if keywords['prettify'] else ''
    out += 'ID\t' + '\t'.join(header) + '\n'

    # check for gloss in header to create nice output format
    if formatter in header:
        idx = header.index(formatter)
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        formatter = None
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        idx = False
        formatter = ''
        sorted_data = sorted(data.keys())

    for key in sorted_data:
        line = data[key]
        # NOTE(review): when idx is False it compares equal to 0, so this
        # check is truthy for any non-empty line and groups on column 0;
        # behavior preserved from the original — confirm whether intended.
        if idx in range(len(line)):
            if line[idx] != formatter:
                out += '#\n' if keywords['prettify'] else ''
                formatter = line[idx]

        # the key starts the row, followed by one tab-separated cell per value
        out += text_type(key)
        for value in line:
            if isinstance(value, list):
                # BUG FIX: was a bare `except:` that swallowed every error;
                # ' '.join only raises TypeError for non-string items
                try:
                    out += '\t' + ' '.join(value)
                except TypeError:
                    out += '\t' + ' '.join([text_type(v) for v in value])
            elif isinstance(value, int):
                out += '\t' + text_type(value)
            elif isinstance(value, float):
                out += '\t{0:.4f}'.format(value)
            elif value is None:
                out += '\t'
            else:
                out += '\t{:}'.format(value)
        out += '\n'

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        out + keywords.get('stamp', ''),
        normalize="NFC")
    return