Code example #1
File: dataset.py  Project: basagashka/pylexibank
# imports needed by this snippet (get_badge lives in pylexibank.util)
from collections import Counter, defaultdict
from pylexibank.util import get_badge

    def report(self, tr_analysis, log=None):
        #
        # FIXME: write only summary into README.md
        # in case of multiple cldf datasets:
        # - separate lexemes.md and transcriptions.md
        #
        if not list(self.cldf_dir.glob('*.csv')):
            return
        lines = [
            '# %s\n' % self.metadata.title,
            'Cite the source dataset as\n',
            '> %s\n' % self.metadata.citation,
        ]

        if self.metadata.license:
            lines.extend([
                'This dataset is licensed under a %s license' %
                self.metadata.license, ''
            ])

        if self.metadata.url:
            lines.extend(['Available online at %s' % self.metadata.url, ''])

        if self.metadata.related:
            lines.extend(['See also %s' % self.metadata.related, ''])

        if self.metadata.conceptlist:
            lines.append('Conceptlists in Concepticon:')
            lines.extend([
                '- [{0}](http://concepticon.clld.org/contributions/{0})'.
                format(cl) for cl in self.metadata.conceptlist
            ])
            lines.append('')

        # add NOTES.md
        if self.dir.joinpath('NOTES.md').exists():
            lines.extend(['## Notes', ''])
            lines.extend(self.dir.joinpath('NOTES.md').read_text().split("\n"))
            lines.extend(['', ''])  # some blank lines

        synonyms = defaultdict(Counter)
        totals = {
            'languages': Counter(),     # forms per Glottocode
            'concepts': Counter(),      # forms per Concepticon ID
            'sources': Counter(),       # forms citing at least one source
            'cognate_sets': Counter(),  # cognates per cognate set
            'lexemes': 0,               # total number of forms
            'lids': Counter(),          # forms per local language ID
            'cids': Counter(),          # forms per local parameter ID
            'sids': Counter(),          # citations per source ID
        }

        missing_source = []
        missing_lang = []

        param2concepticon = {
            r['ID']: r['Concepticon_ID']
            for r in self.cldf['ParameterTable']
        }
        lang2glottolog = {
            r['ID']: r['Glottocode']
            for r in self.cldf['LanguageTable']
        }

        for row in self.cldf['FormTable']:
            if row['Source']:
                totals['sources'].update(['y'])
                totals['sids'].update(row['Source'])
            else:
                missing_source.append(row)
            totals['concepts'].update([param2concepticon[row['Parameter_ID']]])
            totals['languages'].update([lang2glottolog[row['Language_ID']]])
            totals['lexemes'] += 1
            totals['lids'].update([row['Language_ID']])
            totals['cids'].update([row['Parameter_ID']])
            synonyms[row['Language_ID']].update([row['Parameter_ID']])

        for row in self.cldf['CognateTable']:
            totals['cognate_sets'].update([row['Cognateset_ID']])

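        # Synonymy index: mean number of forms per concept within each
        # language, averaged over all languages; 1.0 means no synonymy.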
        sindex = sum(
            sum(counts.values()) / float(len(counts))
            for counts in synonyms.values())
        langs = set(synonyms.keys())
        if langs:
            sindex /= float(len(langs))
        else:
            sindex = 0
        totals['SI'] = sindex

        stats = tr_analysis['stats']
        lsegments = len(stats['segments'])
        lbipapyerr = len(stats['bipa_errors'])
        lsclasserr = len(stats['sclass_errors'])

        def ratio(prop):
            # proportion of lexemes whose `prop` entry is non-empty, i.e.
            # linked to Glottolog/Concepticon or citing at least one source
            if float(totals['lexemes']) == 0:
                return 0
            return sum(v for k, v in totals[prop].items() if k) / float(
                totals['lexemes'])

        num_cognates = len(totals['cognate_sets'])  # distinct cognate sets
        # Cognate diversity, following List et al. (2017): the difference
        # between cognate sets and meanings divided by the difference between
        # words and meanings; 0 means one cognate set per meaning, 1 means
        # every word forms its own cognate set.
        try:
            cog_diversity = (num_cognates - len(totals['cids'])) \
                / (totals['lexemes'] - len(totals['cids']))
        except ZeroDivisionError:
            cog_diversity = 0.0  # no lexemes.

        badges = [
            self.build_status_badge(),
            get_badge(ratio('languages'), 'Glottolog'),
            get_badge(ratio('concepts'), 'Concepticon'),
            get_badge(ratio('sources'), 'Source'),
        ]
        if lsegments:
            badges.extend([
                get_badge((lsegments - lbipapyerr) / lsegments, 'BIPA'),
                get_badge((lsegments - lsclasserr) / lsegments,
                          'CLTS SoundClass'),
            ])
        lines.extend(['## Statistics', '\n', '\n'.join(badges), ''])
        stats_lines = [
            '- **Varieties:** {0:,}'.format(len(totals['lids'])),
            '- **Concepts:** {0:,}'.format(len(totals['cids'])),
            '- **Lexemes:** {0:,}'.format(totals['lexemes']),
            '- **Sources:** {0:,}'.format(len(totals['sids'])),
            '- **Synonymy:** {:0.2f}'.format(totals['SI']),
        ]
        if num_cognates:
            stats_lines.extend([
                '- **Cognacy:** {0:,} cognates in {1:,} cognate sets ({2:,} singletons)'
                .format(
                    sum(v for k, v in totals['cognate_sets'].items()),
                    num_cognates,
                    len([
                        k for k, v in totals['cognate_sets'].items() if v == 1
                    ])),
                '- **Cognate Diversity:** {:0.2f}'.format(cog_diversity)
            ])
        if stats['segments']:
            stats_lines.extend([
                '- **Invalid lexemes:** {0:,}'.format(
                    stats['invalid_words_count']),
                '- **Tokens:** {0:,}'.format(sum(stats['segments'].values())),
                '- **Segments:** {0:,} ({1} BIPA errors, {2} CLTS sound class errors, '
                '{3} CLTS modified)'.format(lsegments, lbipapyerr, lsclasserr,
                                            len(stats['replacements'])),
                '- **Inventory size (avg):** {:0.2f}'.format(
                    stats['inventory_size']),
            ])

        if log:
            log.info('\n'.join(['Summary for dataset {}'.format(self.id)] +
                               stats_lines))
        lines.extend(stats_lines)

        totals['languages'] = len(totals['lids'])
        totals['concepts'] = len(totals['cids'])
        # True only if some cognate set has more than one member
        totals['cognate_sets'] = any(
            v > 1 for v in totals['cognate_sets'].values())

        bookkeeping_languoids = []
        for lang in self.cldf['LanguageTable']:
            gl_lang = self.glottolog.cached_languoids.get(
                lang.get('Glottocode'))
            if gl_lang and gl_lang.category == 'Bookkeeping':
                bookkeeping_languoids.append(lang)

        # improvements section
        if missing_lang or missing_source or bookkeeping_languoids:
            lines.extend([
                '\n## Possible Improvements:\n',
            ])

            if missing_lang:
                lines.append(
                    "- Languages missing glottocodes: %d/%d (%.2f%%)" %
                    (len(missing_lang), totals['languages'],
                     (len(missing_lang) / totals['languages']) * 100))

            if bookkeeping_languoids:
                lines.append(
                    "- Languages linked to [bookkeeping languoids in Glottolog]"
                    "(http://glottolog.org/glottolog/glottologinformation"
                    "#bookkeepinglanguoids):")
            for lang in bookkeeping_languoids:
                lines.append(
                    '  - {0} [{1}](http://glottolog.org/resource/languoid/id/{1})'
                    .format(lang.get('Name', lang.get('ID')),
                            lang['Glottocode']))
            lines.append('\n')

        if missing_source:
            lines.append("- Entries missing sources: %d/%d (%.2f%%)" %
                         (len(missing_source), totals['lexemes'],
                          (len(missing_source) / totals['lexemes']) * 100))

        return lines
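
For illustration, a self-contained sketch of the synonymy index computed in the report above (the toy language and concept IDs are made up, not from a real dataset):

from collections import Counter, defaultdict

synonyms = defaultdict(Counter)
# toy data: lang1 has two forms for 'hand' and one for 'foot';
# lang2 has a single form for 'hand'
for lid, pid in [('lang1', 'hand'), ('lang1', 'hand'),
                 ('lang1', 'foot'), ('lang2', 'hand')]:
    synonyms[lid].update([pid])

# per-language mean forms per concept: lang1 -> (2 + 1) / 2 = 1.5, lang2 -> 1.0
sindex = sum(
    sum(c.values()) / float(len(c)) for c in synonyms.values())
sindex /= float(len(synonyms))
print(sindex)  # 1.25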
Code example #2
File: test_util.py  Project: lexibank/pylexibank
def test_get_badge():
    for r in util.pb(list(range(10))):
        util.get_badge((r / 10.0) + 0.5, 'name')
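
The test sweeps ratios from 0.5 to 1.4, including values above 1.0. For readers without the library at hand, a minimal stand-in sketch of what a get_badge-style helper produces; the thresholds, colors, and shields.io URL format here are assumptions, not pylexibank's exact values:

def make_badge(ratio, name):
    # hypothetical stand-in for pylexibank.util.get_badge: render a coverage
    # ratio as a Markdown badge (thresholds and colors are assumed)
    ratio = min(ratio, 1.0)
    if ratio < 0.7:
        color = 'red'
    elif ratio < 0.9:
        color = 'yellowgreen'
    else:
        color = 'brightgreen'
    return '![{0}: {1:.0f}%](https://img.shields.io/badge/{0}-{1:.0f}%25-{2}.svg)'.format(
        name, 100 * ratio, color)

print(make_badge(0.95, 'Glottolog'))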
Code example #3
# imports needed by this snippet (get_badge lives in pylexibank.util)
import collections

from pylexibank.util import get_badge

def cldf_report(cldf_spec, tr_analysis, badges, log, glottolog):
    lines = []
    if not list(cldf_spec.dir.glob('*.csv')):
        return lines

    if cldf_spec.module != 'Wordlist':
        return lines  # pragma: no cover

    cldf = cldf_spec.get_dataset()

    synonyms = collections.defaultdict(collections.Counter)
    totals = {
        'languages': collections.Counter(),
        'concepts': collections.Counter(),
        'sources': collections.Counter(),
        'cognate_sets': collections.Counter(),
        'lexemes': 0,
        'lids': collections.Counter(),
        'cids': collections.Counter(),
        'sids': collections.Counter(),
    }

    missing_source = []
    missing_lang = []

    param2concepticon = {
        r['ID']: r['Concepticon_ID']
        for r in cldf['ParameterTable']
    }
    lang2glottolog = {r['ID']: r['Glottocode'] for r in cldf['LanguageTable']}

    for row in cldf['FormTable']:
        if row['Source']:
            totals['sources'].update(['y'])
            totals['sids'].update(row['Source'])
        else:
            missing_source.append(row)
        totals['concepts'].update([param2concepticon[row['Parameter_ID']]])
        totals['languages'].update([lang2glottolog[row['Language_ID']]])
        totals['lexemes'] += 1
        totals['lids'].update([row['Language_ID']])
        totals['cids'].update([row['Parameter_ID']])
        synonyms[row['Language_ID']].update([row['Parameter_ID']])

    for row in cldf.get('CognateTable') or []:
        totals['cognate_sets'].update([row['Cognateset_ID']])

    sindex = sum(
        sum(counts.values()) / float(len(counts))
        for counts in synonyms.values())
    langs = set(synonyms.keys())
    if langs:
        sindex /= float(len(langs))
    else:
        sindex = 0  # pragma: no cover
    totals['SI'] = sindex

    if tr_analysis:
        stats = tr_analysis['stats']
    else:
        stats = collections.defaultdict(list)
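    # without a transcription analysis, stats falls back to empty lists,
    # so all segment and error counts below come out as zero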

    lsegments = len(stats['segments'])
    lbipapyerr = len(stats['bipa_errors'])
    lsclasserr = len(stats['sclass_errors'])

    def ratio(prop):
        if float(totals['lexemes']) == 0:
            return 0  # pragma: no cover
        return sum(v for k, v in totals[prop].items() if k) / float(
            totals['lexemes'])

    num_cognates = len(totals['cognate_sets'])  # distinct cognate sets
    # Cognate diversity, following List et al. (2017): the difference between
    # cognate sets and meanings divided by the difference between words and
    # meanings; 0 means one cognate set per meaning, 1 means every word forms
    # its own cognate set.
    try:
        cog_diversity = (num_cognates - len(totals['cids'])) \
            / (totals['lexemes'] - len(totals['cids']))
    except ZeroDivisionError:
        cog_diversity = 0.0  # no lexemes.

    badges = badges[:]
    badges.extend([
        get_badge(ratio('languages'), 'Glottolog'),
        get_badge(ratio('concepts'), 'Concepticon'),
        get_badge(ratio('sources'), 'Source'),
    ])
    if lsegments:
        badges.extend([
            get_badge((lsegments - lbipapyerr) / lsegments, 'BIPA'),
            get_badge((lsegments - lsclasserr) / lsegments, 'CLTS SoundClass'),
        ])
    lines.extend(['## Statistics', '\n', '\n'.join(badges), ''])
    stats_lines = [
        '- **Varieties:** {0:,}'.format(len(totals['lids'])),
        '- **Concepts:** {0:,}'.format(len(totals['cids'])),
        '- **Lexemes:** {0:,}'.format(totals['lexemes']),
        '- **Sources:** {0:,}'.format(len(totals['sids'])),
        '- **Synonymy:** {:0.2f}'.format(totals['SI']),
    ]
    if num_cognates:
        stats_lines.extend([
            '- **Cognacy:** {0:,} cognates in {1:,} cognate sets ({2:,} singletons)'
            .format(
                sum(v for k, v in totals['cognate_sets'].items()),
                num_cognates,
                len([k for k, v in totals['cognate_sets'].items() if v == 1])),
            '- **Cognate Diversity:** {:0.2f}'.format(cog_diversity)
        ])
    if stats['segments']:
        stats_lines.extend([
            '- **Invalid lexemes:** {0:,}'.format(
                stats['invalid_words_count']),
            '- **Tokens:** {0:,}'.format(sum(stats['segments'].values())),
            '- **Segments:** {0:,} ({1} BIPA errors, {2} CLTS sound class errors, '
            '{3} CLTS modified)'.format(lsegments, lbipapyerr, lsclasserr,
                                        len(stats['replacements'])),
            '- **Inventory size (avg):** {:0.2f}'.format(
                stats['inventory_size']),
        ])

    if log:
        log.info('\n'.join(
            ['Summary for dataset {}'.format(cldf_spec.metadata_path)] +
            stats_lines))
    lines.extend(stats_lines)

    totals['languages'] = len(totals['lids'])
    totals['concepts'] = len(totals['cids'])
    # True only if some cognate set has more than one member
    totals['cognate_sets'] = any(
        v > 1 for v in totals['cognate_sets'].values())

    bookkeeping_languoids_in_gl = set()
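    # 'Bookkeeping' is Glottolog's category for spurious or retired
    # languoids; links to them usually point at outdated Glottocodes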
    if glottolog:
        for l in glottolog.api.languoids():
            if l.category == 'Bookkeeping':
                bookkeeping_languoids_in_gl.add(l.id)  # pragma: no cover

    bookkeeping_languoids = []
    for lang in cldf['LanguageTable']:
        if lang.get('Glottocode') in bookkeeping_languoids_in_gl:
            bookkeeping_languoids.append(lang)  # pragma: no cover

    # improvements section
    if missing_lang or missing_source or bookkeeping_languoids:
        lines.extend([
            '\n## Possible Improvements:\n',
        ])

        if missing_lang:  # pragma: no cover
            lines.append("- Languages missing glottocodes: %d/%d (%.2f%%)" %
                         (len(missing_lang), totals['languages'],
                          (len(missing_lang) / totals['languages']) * 100))

        if bookkeeping_languoids:  # pragma: no cover
            lines.append(
                "- Languages linked to [bookkeeping languoids in Glottolog]"
                "(http://glottolog.org/glottolog/glottologinformation"
                "#bookkeepinglanguoids):")
        for lang in bookkeeping_languoids:  # pragma: no cover
            lines.append(
                '  - {0} [{1}](http://glottolog.org/resource/languoid/id/{1})'.
                format(lang.get('Name', lang.get('ID')), lang['Glottocode']))
        lines.append('\n')

    if missing_source:
        lines.append("- Entries missing sources: %d/%d (%.2f%%)" %
                     (len(missing_source), totals['lexemes'],
                      (len(missing_source) / totals['lexemes']) * 100))

    return lines
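
As a quick sanity check of the cognate-diversity formula above (toy numbers, not from any real dataset):

# 10 words covering 4 meanings, grouped into 6 cognate sets
num_cognates, num_meanings, num_words = 6, 4, 10
cog_diversity = (num_cognates - num_meanings) / (num_words - num_meanings)
print(round(cog_diversity, 2))  # 0.33; closer to 0 = more cognacy, 1 = none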