コード例 #1
0
ファイル: api.py プロジェクト: chirila/concepticon-data
 def conceptsets(self):
     """
     Build the concept sets from the rows of `concepticon.tsv`.

     Each row is passed through `lowercase` and used as keyword arguments for
     a `Conceptset` bound to this API object.

     :returns: `dict` mapping ConceptSet IDs to `Conceptset` instances.
     """
     rows = read_dicts(self.data_path('concepticon.tsv'))
     instances = (Conceptset(api=self, **lowercase(row)) for row in rows)
     return to_dict(instances)
コード例 #2
0
ファイル: api.py プロジェクト: chirila/concepticon-data
    def conceptlists(self):
        """
        Build the concept lists from the rows of `conceptlists.tsv`.

        Each row is passed through `lowercase` and used as keyword arguments for
        a `Conceptlist` bound to this API object.

        :returns: `dict` mapping ConceptList IDs to `Conceptlist` instances.

        .. note:: Individual concepts can be accessed via `Conceptlist.concepts`.
        """
        rows = read_dicts(self.data_path('conceptlists.tsv'))
        instances = (Conceptlist(api=self, **lowercase(row)) for row in rows)
        return to_dict(instances)
コード例 #3
0
ファイル: api.py プロジェクト: concepticon/pyconcepticon
    def check(self, *clids):
        """
        Run consistency checks over the data and report the problems found.

        Errors are collected while checking, printed as ``ERROR:<name>[:<line>]: <msg>``
        lines at the end, and summarised in the return value. A missing PDF is only
        reported through `warnings.warn` and does not affect the result.

        If a row of `self.conceptlists_dicts` cannot be turned into a `Conceptlist`
        (`ValueError`), the errors collected so far are printed and the method returns
        early without running the remaining checks.

        :param clids: IDs of the concept lists to check. If none are given, all concept
            lists are checked and BibTeX records that are never cited are reported too.
        :returns: `True` if no error was recorded, `False` otherwise.
        :raises AssertionError: if `self.retirements` is falsy, if a ``map-*.tsv`` file
            in the mappings directory is named for a language that is not among the ISO
            codes of the column types, or if a concept set ID is the non-canonical
            member of more than one `sameas` group.
        """
        errors = []
        # NOTE(review): presumably evaluated to make sure the retirements data loads -
        # confirm. Like every assert, this is skipped when Python runs with -O.
        assert self.retirements
        print('testing {0} concept lists'.format(
            len(clids) if clids else len(self.conceptlists)))

        def _msg(type_, msg, name, line):  # pragma: no cover
            # Render "TYPE:name[:line]: msg"; a falsy line (the default 0) is left out.
            if line:
                line = ':%s' % line
            return '%s:%s%s: %s' % (type_.upper(), name, line or '', msg)

        def error(msg, name, line=0):  # pragma: no cover
            # Errors are only collected here; `report` prints them at the end.
            errors.append((msg, name, line))

        def warning(msg, name, line=0):  # pragma: no cover
            warnings.warn(_msg('warning', msg, name, line), Warning)

        def report():
            # Print every collected error and tell the caller whether the check passed.
            for msg, name, line in errors:
                print(_msg('error', msg, name, line))
            return not errors

        # Structural check: every selected row must be instantiable as a Conceptlist.
        for i, d in enumerate(self.conceptlists_dicts, start=1):
            if (not clids) or d['ID'] in clids:
                try:
                    Conceptlist(api=self, **lowercase(d))
                except ValueError as e:  # pragma: no cover
                    error(str(e), 'conceptlists.tsv', i)

        if errors:  # pragma: no cover
            return report()  # Exit early in case of structural errors.

        REF_WITHOUT_LABEL_PATTERN = re.compile(
            r'[^]]\(:(ref|bib):[A-Za-z0-9\-]+\)')
        REF_WITHOUT_LINK_PATTERN = re.compile(r'[^(]:(ref|bib):[A-Za-z0-9-]+')

        # Make sure all language-specific mappings are well specified
        iso_langs = [
            lang.iso2 for lang in self.vocabularies['COLUMN_TYPES'].values()
            if isinstance(lang, Languoid) and lang.iso2
        ]
        if len(iso_langs) != len(set(iso_langs)):
            error(
                'Duplicate ISO codes: {}'.format(
                    collections.Counter(iso_langs).most_common(1)),
                'concepticon.json')
        # The second "-"-separated part of each mapping file stem must be a known ISO code.
        assert set(
            p.stem.split('-')[1] for p in self.path('mappings').glob('map-*.tsv')
        ).issubset(iso_langs)

        # We collect all cite keys used to refer to references.
        all_refs = set()
        refs_in_bib = set(self.bibliography)
        for meta in self.metadata.values():
            cnames_schema = set(var['name']
                                for var in meta.meta['tableSchema']['columns'])
            # Column names of the data are taken from the first row only.
            cnames_tsv = set(list(meta.values.values())[0])
            if cnames_tsv - cnames_schema:  # pragma: no cover
                error(
                    'column names in {0} but not in json-specs'.format(
                        meta.id), 'name')
            for i, (key, value) in enumerate(meta.values.items()):
                # i + 2 turns the 0-based row index into the line number reported.
                if set(value.keys()) != cnames_schema:  # pragma: no cover
                    error(
                        'meta data {0} contains irregular number of columns in line {1}'
                        .format(meta.id, i + 2), 'name')
                if key not in self.conceptsets:  # pragma: no cover
                    error(
                        'meta data {0} references invalid CONCEPTICON_ID {2} in line {1}'
                        .format(meta.id, i + 2, key), 'name')
            for ref in split(meta.meta.get('dc:references') or ''):
                if ref not in refs_in_bib:  # pragma: no cover
                    error('cited bibtex record not in bib: {0}'.format(ref),
                          'name')
                all_refs.add(ref)

        # Make sure only records in the BibTeX file references.bib are referenced by
        # concept lists.
        for i, cl in enumerate(self.conceptlists.values()):
            if clids and cl.id not in clids:
                continue  # pragma: no cover
            fl = ('conceptlists.tsv', i + 2)
            for ref in re.findall(BIB_PATTERN, cl.note) + cl.refs:
                if ref not in refs_in_bib:
                    error('cited bibtex record not in bib: {0}'.format(ref),
                          *fl)
                else:
                    all_refs.add(ref)

            for m in REF_WITHOUT_LABEL_PATTERN.finditer(cl.note):
                error(
                    'link without label: {0}'.format(
                        m.string[m.start():m.end()]), *fl)

            for m in REF_WITHOUT_LINK_PATTERN.finditer(
                    cl.note):  # pragma: no cover
                error(
                    'reference not in link: {0}'.format(
                        m.string[m.start():m.end()]), *fl)

            for m in REF_PATTERN.finditer(cl.note):
                if m.group('id') not in self.conceptlists:  # pragma: no cover
                    error('invalid conceptlist ref: {0}'.format(m.group('id')),
                          *fl)

            # make also sure that all sources are accompanied by a PDF, but only write a
            # warning if this is not the case
            for ref in cl.pdf:
                if ref not in self.sources:  # pragma: no cover
                    warning('no PDF found for {0}'.format(ref),
                            'conceptlists.tsv')
        # This key is always counted as used, so it is never reported as unused below.
        # NOTE(review): the reason for the exemption is not visible here - confirm.
        all_refs.add('List2016a')

        if not clids:
            # Only report unused references if we check all concept lists!
            for ref in refs_in_bib - all_refs:  # pragma: no cover
                error('unused bibtex record: {0}'.format(ref),
                      'references.bib')

        # Valid values for the two columns that refer to concept sets.
        ref_cols = {
            'concepticon_id': set(self.conceptsets.keys()),
            'concepticon_gloss':
            set(cs.gloss for cs in self.conceptsets.values()),
        }

        # Both ends of every relation must name an existing concept set ID and gloss.
        for i, rel in enumerate(self.relations.raw):
            for attr, type_ in [
                ('SOURCE', 'concepticon_id'),
                ('TARGET', 'concepticon_id'),
                ('SOURCE_GLOSS', 'concepticon_gloss'),
                ('TARGET_GLOSS', 'concepticon_gloss'),
            ]:
                if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                    error('invalid {0}: {1}'.format(attr, rel[attr]),
                          'conceptrelations', i + 2)

        # Every list file on disk must be registered in conceptlists.tsv.
        for fname in self.data_path('conceptlists').glob('*.tsv'):
            if clids and fname.stem not in clids:
                continue  # pragma: no cover
            if fname.stem not in self.conceptlists:  # pragma: no cover
                error(
                    'conceptlist missing in conceptlists.tsv: {0}'.format(
                        fname.name), '')

        # IDs of lists whose concepts raised TypeError; skipped in the final pass.
        broken_cls = set()

        for cl in self.conceptlists.values():
            if clids and cl.id not in clids:
                continue  # pragma: no cover
            #
            # Check consistency between the csvw metadata and the column names in the list.
            #
            missing_in_md, missing_in_list = [], []
            cols_in_md = []
            for col in cl.metadata.tableSchema.columns:
                # all names or aliases csvw will recognize for this column
                cnames = []
                if col.name in cols_in_md:  # pragma: no cover
                    error(
                        'Duplicate name or title in table schema: {0}'.format(
                            col.name), cl.id)
                cnames.append(col.name)
                if col.titles:
                    c = col.titles.getfirst()
                    if c in cols_in_md:  # pragma: no cover
                        error(
                            'Duplicate name or title in table schema: {0}'.
                            format(c), cl.id)
                    cnames.append(c)
                cols_in_md.extend(cnames)
                if not any(name in cl.cols_in_list for name in cnames):
                    # Neither name nor title of the column is in the actual list header.
                    missing_in_list.append(col.name)
            for col in cl.cols_in_list:
                if col not in cols_in_md:
                    missing_in_md.append(col)

            for col in missing_in_list:
                error(
                    'Column in metadata but missing in list: {0}'.format(col),
                    cl.id)
            for col in missing_in_md:
                error(
                    'Column in list but missing in metadata: {0}'.format(col),
                    cl.id)

            try:
                # Now check individual concepts:
                for i, concept in enumerate(cl.concepts.values()):
                    if not concept.id.startswith(cl.id):  # pragma: no cover
                        error(
                            'concept ID does not match concept list ID pattern %s'
                            % concept.id, cl.id)

                    if concept.concepticon_id:
                        cs = self.conceptsets.get(concept.concepticon_id)
                        if not cs:  # pragma: no cover
                            error(
                                'invalid conceptset ID %s' %
                                concept.concepticon_id, cl.id)
                        elif cs.gloss != concept.concepticon_gloss:  # pragma: no cover
                            error(
                                'wrong conceptset GLOSS for ID {0}: {1} -> {2}'
                                .format(cs.id, concept.concepticon_gloss,
                                        cs.gloss), cl.id)

                    # The presence of the source language columns is only checked once,
                    # on the first concept of the list.
                    if i == 0:  # pragma: no cover
                        for lg in cl.source_language:
                            if lg.lower() not in concept.cols:
                                error(
                                    'missing source language col %s' %
                                    lg.upper(), cl.id)

                    for lg in cl.source_language:  # pragma: no cover
                        if not (concept.attributes.get(lg.lower())
                                or getattr(concept, lg.lower(), None) or
                                (lg.lower() == 'english'
                                 and not concept.gloss)):
                            error(
                                'missing source language translation %s' % lg,
                                cl.id, i + 2)
                    for attr, values in ref_cols.items():
                        val = getattr(concept, attr)
                        if val:
                            # check that there are not leading and trailing spaces
                            # (while computationally expensive, this helps catch really
                            # hard to find typos)
                            if val != val.strip():  # pragma: no cover
                                error(
                                    "leading or trailing spaces in value for %s: '%s'"
                                    % (attr, val), cl.id, i + 2)

                            if val not in values:  # pragma: no cover
                                error('invalid value for %s: %s' % (attr, val),
                                      cl.id, i + 2)
            except TypeError as e:  # pragma: no cover
                broken_cls.add(cl.id)
                error(str(e), cl.id)

        # Group concept sets connected by "sameas" relations and flag duplicate glosses.
        sameas = {}
        glosses = set()
        for cs in self.conceptsets.values():
            if cs.gloss in glosses:  # pragma: no cover
                error('duplicate conceptset gloss: {0}'.format(cs.gloss),
                      cs.id)
            glosses.add(cs.gloss)
            for target, rel in cs.relations.items():
                if rel == 'sameas':
                    for group in sameas.values():
                        if target in group:  # pragma: no cover
                            group.add(cs.id)
                            break
                    else:
                        # No existing group contains the target: start a new one.
                        sameas[cs.gloss] = {cs.id, target}

        # Within each group the numerically smallest ID is kept; all others map to it.
        deprecated = {}
        for s in sameas.values():
            csids = sorted(s, key=int)
            for csid in csids[1:]:
                assert csid not in deprecated
                deprecated[csid] = csids[0]

        # NOTE: this final pass covers all concept lists; it is not restricted to `clids`.
        for cl in self.conceptlists.values():
            if cl.id in broken_cls:
                continue  # pragma: no cover
            for concept in cl.concepts.values():
                if concept.concepticon_id in deprecated:  # pragma: no cover
                    error(
                        'deprecated concept set {0} linked for {1}'.format(
                            concept.concepticon_id, concept.id), cl.id)

        return report()