def test_wl2dst(self):
    """Check that ``wl2dst`` in 'jaccard' mode returns a list.

    The function is called once with the mode alone and once with the
    additional ``refB='glossid'`` argument.
    """
    from lingpy.basic.ops import wl2dst

    for extra in ({}, {'refB': 'glossid'}):
        matrix = wl2dst(self.wordlist, mode='jaccard', **extra)
        self.assertIsInstance(matrix, list)
def test_wl2dst(self):
    """Run ``wl2dst`` over the fixture wordlist in several modes, then on a
    small hand-built wordlist whose first and third doculects are expected
    to end up at distance 1.
    """
    # 'jaccard' mode must hand back a list, with and without refB.
    for extra in ({}, {'refB': 'glossid'}):
        matrix = wl2dst(self.wordlist, mode='jaccard', **extra)
        self.assertIsInstance(matrix, list)

    # These calls only have to complete; their results are discarded.
    for options in (
            {'mode': 'swadesh'},
            {'mode': 'shared'},
            {'mode': 'swadesh', 'ignore_missing': True}):
        wl2dst(self.wordlist, **options)

    # trigger zero-division-warning in wl2dst
    table = [
        ['doculect', 'concept', 'counterpart', 'cogid'],
        ['l1', 'hand', 'hand', '1'],
        ['l2 - a (taxon) name)', 'hand', 'hand', '2'],
        ['l3', 'foot', 'foot', '3'],
    ]
    tmp = Wordlist(dict(enumerate(table)))
    dst = wl2dst(tmp)
    assert dst[0][2] == 1
def _output(self, fileformat, **keywords):
    """
    Internal function that eases its modification by daughter classes.

    Dispatches on ``fileformat`` and passes the wordlist data to the
    matching writer helper.

    Parameters
    ----------
    fileformat : str
        One of the formats handled below: 'triple' / 'triples' /
        'triples.tsv', 'paps.nex' / 'paps.csv', 'taxa', 'csv' / 'qlc' /
        'tsv', 'dst', 'tre' / 'nwk', 'cluster' / 'groups', 'starling' /
        'star.csv', 'multistate.nex' or 'separated'.
    **keywords
        Output options. Missing options are filled in via
        ``util.setdefaults`` (see the call below for the defaults);
        ``keywords['stamp']`` is always overwritten with ``self._stamp``
        (or '' if that attribute does not exist).

    Returns
    -------
    Whatever the selected writer helper returns. The 'separated' format
    and any unrecognised ``fileformat`` fall through and return None.

    Raises
    ------
    ValueError
        For the 'csv' / 'qlc' / 'tsv' formats with a subset requested, if
        ``cols`` is not a list, tuple or bool, or ``rows`` is not a dict
        or bool.
    AssertionError
        For the 'taxa' format, if the object has no ``taxa`` attribute.

    Notes
    -----
    NOTE(review): when ``subset`` and ``rows`` are given, the values of
    ``rows`` are concatenated into an expression that is run through
    ``eval``. Only pass trusted filter strings.
    """
    # check for stamp attribute
    keywords["stamp"] = getattr(self, '_stamp', '')

    # add the default parameters, they will be checked against the keywords
    util.setdefaults(
        keywords,
        cols=False,
        distances=False,
        entries=("concept", "counterpart"),
        entry='concept',
        fileformat=fileformat,
        filename=rcParams['filename'],
        formatter='concept',
        modify_ref=False,
        meta=self._meta,
        missing=0,
        prettify='false',
        ignore='all',
        ref='cogid',
        rows=False,
        subset=False,  # setup a subset of the data,
        taxa='taxa',
        threshold=0.6,  # threshold for flat clustering
        tree_calc='neighbor')

    if fileformat in ['triple', 'triples', 'triples.tsv']:
        return tsv2triple(self, keywords['filename'] + '.' + fileformat)

    if fileformat in ['paps.nex', 'paps.csv']:
        paps = self.get_paps(
            ref=keywords['ref'],
            entry=keywords['entry'],
            missing=keywords['missing'])
        kw = dict(filename=keywords['filename'] + '.paps')
        if fileformat == 'paps.nex':
            # only the nexus writer is given the missing-data marker
            kw['missing'] = keywords['missing']
            return pap2nex(self.cols, paps, **kw)
        return pap2csv(self.cols, paps, **kw)

    # simple printing of taxa
    if fileformat == 'taxa':
        assert hasattr(self, 'taxa')
        return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

    # csv-output
    if fileformat in ['csv', 'qlc', 'tsv']:
        # get the header line, ordered by column index
        header = sorted(
            [s for s in set(self._alias.values()) if s in self._header],
            key=lambda x: self._header[x])
        header = [h.upper() for h in header]

        self._meta.setdefault('taxa', self.cols)

        # get the data, in case a subset is chosen
        if not keywords['subset']:
            # write stuff to file
            return wl2qlc(header, self._data, **keywords)

        cols, rows = keywords['cols'], keywords['rows']

        if not isinstance(cols, (list, tuple, bool)):
            raise ValueError("[i] Argument 'cols' should be list or tuple.")
        if not isinstance(rows, (dict, bool)):
            raise ValueError("[i] Argument 'rows' should be a dictionary.")

        # check for chosen header
        if cols:
            # get indices for header
            indices = [self._header[x] for x in cols]
            header = [c.upper() for c in cols]
        else:
            indices = list(range(len(self.header)))

        row_filter = None
        if rows:
            stmts = []
            for key, value in rows.items():
                if key == 'ID':
                    stmts += ["key " + value]
                else:
                    idx = self._header[key]
                    stmts += ["line[{0}] ".format(idx) + value]
            # The combined condition is the same for every row, so build
            # it once instead of re-joining inside the loop.
            row_filter = " and ".join(stmts)

        log.debug("calculated what should be excluded")

        # get the data
        out = {}
        # The loop variables MUST stay named `key` and `line`: the filter
        # expression built above refers to exactly these names.
        for key, line in self._data.items():
            log.debug(key)

            if rows:
                # NOTE(review): eval() executes caller-supplied text; the
                # `rows` filter must come from a trusted source.
                if eval(row_filter):
                    out[key] = [line[i] for i in indices]
            else:
                out[key] = [line[i] for i in indices]

        log.debug("passing data to wl2qlc")
        return wl2qlc(header, out, **keywords)

    # output dst-format (phylip)
    if fileformat == 'dst':
        # check for distances as keyword
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self, **keywords)

        out = matrix2dst(
            self._meta['distances'],
            self.taxa,
            stamp=keywords['stamp'],
            taxlen=keywords.get('taxlen', 0))
        return _write_file(keywords['filename'], out, fileformat)

    # output tre-format (newick)
    if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
        if 'tree' not in self._meta:
            # check for distances
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)
            # we look up a function to calculate a tree in the cluster module:
            tree = getattr(cluster, keywords['tree_calc'])(
                self._meta['distances'],
                self.cols,
                distances=keywords['distances'])
        else:
            tree = self._meta['tree']

        return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

    if fileformat in ['cluster', 'groups']:
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self)
        # check for keywords
        if 'groups' not in self._meta:
            self._meta['groups'] = cluster.matrix2groups(
                keywords['threshold'], self._meta['distances'], self.taxa)
        lines = [
            '{0}\t{1}'.format(taxon, group)
            for taxon, group in sorted(
                self._meta['groups'].items(), key=lambda x: x[0])]
        return _write_file(keywords['filename'], lines, fileformat)

    if fileformat in ['starling', 'star.csv']:
        def dash_if_zero(value):
            # data-check: cells equal to 0 are written as '-'
            return '-' if value == 0 else value

        lines = []
        if 'cognates' not in keywords:
            lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
            for i, concept in enumerate(self.concepts):
                for line in self.get_list(row=concept, entry=keywords['entry']):
                    lines.append(
                        str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                            [dash_if_zero(t) for t in line]))
        else:
            lines.append(
                'ID\tConcept\t' + '\t'.join(
                    ['{0}\t COG'.format(t) for t in self.taxa]))
            for i, concept in enumerate(self.concepts):
                cogs = self.get_list(row=concept, entry=keywords['cognates'])
                for j, line in enumerate(
                        self.get_list(row=concept, entry=keywords['entry'])):
                    # pair every cell with the cognate value at the same position
                    part = '\t'.join(
                        '{0}\t{1}'.format(dash_if_zero(a), b)
                        for a, b in zip(line, cogs[j]))
                    lines.append(util.tabjoin(i + 1, concept, part))

        return _write_file(
            keywords['filename'],
            lines,
            'starling_' + keywords['entry'] + '.csv')

    if fileformat == 'multistate.nex':
        if not keywords['filename'].endswith('.multistate.nex'):
            keywords['filename'] += '.multistate.nex'

        matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
        return multistate2nex(self.taxa, matrix, keywords['filename'])

    if fileformat == 'separated':
        # one file per entry of self.cols, inside a directory named after `filename`
        if not os.path.isdir(keywords['filename']):
            os.mkdir(keywords['filename'])

        # the mere presence of 'ignore_keys' drops the ID column
        skip_ids = 'ignore_keys' in keywords
        for taxon in self.cols:
            lines = [''] if skip_ids else ['ID\t']
            lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
            for key in self.get_list(col=taxon, flat=True):
                line = [] if skip_ids else [key]
                for entry in keywords['entries']:
                    tmp = self[key, entry]
                    if isinstance(tmp, list):
                        # list-valued cells are flattened to space-separated text
                        tmp = ' '.join([str(x) for x in tmp])
                    line += [tmp]
                lines.append('\t'.join('{0}'.format(x) for x in line))
            _write_file(
                '{0}/{1}'.format(keywords['filename'], taxon), lines, 'tsv')