Python wl2dstの例、lingpy.basic.ops.wl2dst Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_ops.py プロジェクト: anukat2015/lingpy

    def test_wl2dst(self):
        """wl2dst in 'jaccard' mode returns a list, without and with ``refB``."""
        from lingpy.basic.ops import wl2dst

        # Same two calls as a data-driven loop: first with no extra option,
        # then with refB='glossid'.
        for extra in ({}, {'refB': 'glossid'}):
            matrix = wl2dst(self.wordlist, mode='jaccard', **extra)
            self.assertIsInstance(matrix, list)

コード例 #2

0

ファイルを表示

ファイル: test_ops.py プロジェクト: LinguList/lingpy

    def test_wl2dst(self):
        """Run wl2dst in several modes and on a small hand-built wordlist."""
        # jaccard mode has to hand back a list, without and with refB
        for extra in ({}, {'refB': 'glossid'}):
            self.assertIsInstance(
                wl2dst(self.wordlist, mode='jaccard', **extra), list)

        # these calls only have to complete; their results are discarded
        for options in (
                {'mode': 'swadesh'},
                {'mode': 'shared'},
                {'mode': 'swadesh', 'ignore_missing': True}):
            wl2dst(self.wordlist, **options)

        # trigger zero-division-warning in wl2dst
        header = ['doculect', 'concept', 'counterpart', 'cogid']
        rows = [
            ['l1', 'hand', 'hand', '1'],
            ['l2 - a (taxon) name)', 'hand', 'hand', '2'],
            ['l3', 'foot', 'foot', '3'],
        ]
        # key 0 carries the header, keys 1..3 the data rows (same order as before)
        data = {0: header}
        for number, row in enumerate(rows, 1):
            data[number] = row
        distances = wl2dst(Wordlist(data))
        assert distances[0][2] == 1

コード例 #3

0

ファイルを表示

ファイル: test_ops.py プロジェクト: vermillionbee/lingpy

    def test_wl2dst(self):
        """Check wl2dst return types, its modes, and a three-row toy wordlist."""
        # both jaccard variants (plain, and with refB='glossid') must yield a list
        jaccard_variants = [dict(mode='jaccard'),
                            dict(mode='jaccard', refB='glossid')]
        for params in jaccard_variants:
            outcome = wl2dst(self.wordlist, **params)
            self.assertIsInstance(outcome, list)

        # smoke calls: only required to run through, return values are ignored
        smoke_variants = [dict(mode='swadesh'),
                          dict(mode='shared'),
                          dict(mode='swadesh', ignore_missing=True)]
        for params in smoke_variants:
            wl2dst(self.wordlist, **params)

        # trigger zero-division-warning in wl2dst
        table = [
            ['doculect', 'concept', 'counterpart', 'cogid'],
            ['l1', 'hand', 'hand', '1'],
            ['l2 - a (taxon) name)', 'hand', 'hand', '2'],
            ['l3', 'foot', 'foot', '3'],
        ]
        # row index doubles as the wordlist key; index 0 is the header row
        small = Wordlist(dict(enumerate(table)))
        matrix = wl2dst(small)
        assert matrix[0][2] == 1

コード例 #4

0

ファイルを表示

ファイル: wordlist.py プロジェクト: kadster/lingpy

    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.

        Dispatches on ``fileformat`` and hands the wordlist data to the
        matching export helper.

        Parameters
        ----------
        fileformat : str
            Recognised values are 'triple', 'triples', 'triples.tsv',
            'paps.nex', 'paps.csv', 'taxa', 'csv', 'qlc', 'tsv', 'dst',
            'tre', 'nwk', 'cluster', 'groups', 'starling', 'star.csv',
            'multistate.nex' and 'separated'.
        **keywords
            Export options. ``stamp`` is always overwritten from
            ``self._stamp``; the remaining defaults are listed in the
            ``util.setdefaults`` call below. Keys that are read without a
            default being registered there: ``taxlen`` ('dst'), ``cognates``
            ('starling') and ``ignore_keys`` ('separated', presence only).

        Returns
        -------
        The return value of the helper used by the selected branch. The
        'separated' branch has no return statement, and an unrecognised
        ``fileformat`` matches no branch, so both yield None.

        Raises
        ------
        ValueError
            In the 'csv'/'qlc'/'tsv' subset path, if ``cols`` is not a list,
            tuple or bool, or ``rows`` is not a dict or bool.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        # (presumably setdefaults only fills keys the caller did not pass --
        # TODO confirm against util.setdefaults)
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        # triple output: the fileformat string itself becomes the file suffix
        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        # paps output: both variants share the same paps data and file stem;
        # only the nexus writer additionally receives the 'missing' value
        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            # NOTE(review): this assert is skipped when Python runs with -O,
            # and it checks for `taxa` while `self.cols` is what gets written.
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line: the alias targets that are real columns,
            # ordered by their index in self._header, then upper-cased
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            # side effect: registers 'taxa' in the metadata if not yet present
            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            # bool is accepted for both because the defaults are False
            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                # no column selection: keep every column position
                # NOTE(review): reads self.header (no underscore), unlike the
                # self._header lookups above -- confirm this is intended
                indices = [r for r in range(len(self.header))]

            if rows:
                # Build one Python expression fragment per row condition.
                # Each value is a caller-supplied string such as a comparison
                # suffix; it is appended to either the name `key` (for 'ID')
                # or to `line[<column index>]`.
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    # NOTE(review): eval() executes the strings supplied in
                    # `rows`; never pass untrusted input here. The expressions
                    # refer to the loop variables `key` and `line` by name, so
                    # those two names must not be renamed.
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            # (a matrix already stored in self._meta is reused; otherwise it
            # is computed with all keywords forwarded and cached in self._meta)
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                # (unlike the 'dst' branch, wl2dst gets no keywords here)
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                # (the computed tree is not stored back into self._meta)
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        # flat clustering: one "taxon<TAB>group" line per taxon, sorted by taxon
        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            # groups are cached in self._meta once computed
            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # make lambda inline for data-check
            # (maps the value 0 to '-' and returns anything else unchanged)
            l = lambda x: ['-' if x == 0 else x][0]

            lines = []
            if 'cognates' not in keywords:
                # plain table: ID, concept, then one column per taxon
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                # with cognates: every taxon column is followed by a ' COG'
                # column taken from the entry named by keywords['cognates']
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        # pair each cell with the cognate value at the same
                        # row (j) and position
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            # append the suffix only when the filename does not already end in it
            # (this mutates keywords['filename'])
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            # keywords['filename'] is used as a directory here; it is created
            # if missing (os.mkdir creates a single level only)
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            # one output per element of self.cols
            for l in self.cols:
                # header row; the leading ID column is dropped when the key
                # 'ignore_keys' is present in keywords (its value is not read)
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        # list-valued cells are flattened to a space-joined string
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                # no return in this branch: the method falls through and
                # returns None after all files have been handed to _write_file
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')

コード例 #5

0

ファイルを表示

ファイル: wordlist.py プロジェクト: LinguList/lingpy

    def _output(self, fileformat, **keywords):
        """
        Internal function that eases its modification by daughter classes.

        Dispatches on ``fileformat`` and hands the wordlist data to the
        matching export helper.

        Parameters
        ----------
        fileformat : str
            Recognised values are 'triple', 'triples', 'triples.tsv',
            'paps.nex', 'paps.csv', 'taxa', 'csv', 'qlc', 'tsv', 'dst',
            'tre', 'nwk', 'cluster', 'groups', 'starling', 'star.csv',
            'multistate.nex' and 'separated'.
        **keywords
            Export options. ``stamp`` is always overwritten from
            ``self._stamp``; the remaining defaults are listed in the
            ``util.setdefaults`` call below. Keys that are read without a
            default being registered there: ``taxlen`` ('dst'), ``cognates``
            ('starling') and ``ignore_keys`` ('separated', presence only).

        Returns
        -------
        The return value of the helper used by the selected branch. The
        'separated' branch has no return statement, and an unrecognised
        ``fileformat`` matches no branch, so both yield None.

        Raises
        ------
        ValueError
            In the 'csv'/'qlc'/'tsv' subset path, if ``cols`` is not a list,
            tuple or bool, or ``rows`` is not a dict or bool.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        # (presumably setdefaults only fills keys the caller did not pass --
        # TODO confirm against util.setdefaults)
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # setup a subset of the data,
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        # triple output: the fileformat string itself becomes the file suffix
        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        # paps output: both variants share the same paps data and file stem;
        # only the nexus writer additionally receives the 'missing' value
        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            # NOTE(review): this assert is skipped when Python runs with -O,
            # and it checks for `taxa` while `self.cols` is what gets written.
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line: the alias targets that are real columns,
            # ordered by their index in self._header, then upper-cased
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            # side effect: registers 'taxa' in the metadata if not yet present
            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            # bool is accepted for both because the defaults are False
            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                # no column selection: keep every column position
                # NOTE(review): reads self.header (no underscore), unlike the
                # self._header lookups above -- confirm this is intended
                indices = [r for r in range(len(self.header))]

            if rows:
                # Build one Python expression fragment per row condition.
                # Each value is a caller-supplied string such as a comparison
                # suffix; it is appended to either the name `key` (for 'ID')
                # or to `line[<column index>]`.
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    # NOTE(review): eval() executes the strings supplied in
                    # `rows`; never pass untrusted input here. The expressions
                    # refer to the loop variables `key` and `line` by name, so
                    # those two names must not be renamed.
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            # (a matrix already stored in self._meta is reused; otherwise it
            # is computed with all keywords forwarded and cached in self._meta)
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
            if 'tree' not in self._meta:
                # check for distances
                # (unlike the 'dst' branch, wl2dst gets no keywords here)
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                # (the computed tree is not stored back into self._meta)
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        # flat clustering: one "taxon<TAB>group" line per taxon, sorted by taxon
        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            # groups are cached in self._meta once computed
            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # make lambda inline for data-check
            # (maps the value 0 to '-' and returns anything else unchanged)
            l = lambda x: ['-' if x == 0 else x][0]

            lines = []
            if 'cognates' not in keywords:
                # plain table: ID, concept, then one column per taxon
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                # with cognates: every taxon column is followed by a ' COG'
                # column taken from the entry named by keywords['cognates']
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        # pair each cell with the cognate value at the same
                        # row (j) and position
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            # append the suffix only when the filename does not already end in it
            # (this mutates keywords['filename'])
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            # keywords['filename'] is used as a directory here; it is created
            # if missing (os.mkdir creates a single level only)
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            # one output per element of self.cols
            for l in self.cols:
                # header row; the leading ID column is dropped when the key
                # 'ignore_keys' is present in keywords (its value is not read)
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        # list-valued cells are flattened to a space-joined string
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                # no return in this branch: the method falls through and
                # returns None after all files have been handed to _write_file
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')