Ejemplo n.º 1
0
class Conceptlist(Bag):
    """
    A concept list: one record of list-level metadata plus the TSV file of its concepts.

    ``_api`` is either a ``Path`` pointing directly at the TSV file (this is what
    :meth:`from_file` passes) or an object whose ``data_path`` method is used to
    locate ``conceptlists/<id>.tsv``.

    The remaining attributes are the list-level metadata fields; several of them are
    normalized on assignment by the converters given below.
    """
    _api = attr.ib()
    id = attr.ib()
    author = attr.ib()
    # NOTE: attrs renamed the `convert` keyword to `converter` (deprecated in 17.4.0,
    # removed in 19.2.0), so `converter` is the spelling that works on current attrs.
    year = attr.ib(converter=int)
    list_suffix = attr.ib()
    items = attr.ib(converter=int)
    tags = attr.ib(converter=split_ids, validator=valid_key)
    # Lower-cased before being passed to `split`.
    source_language = attr.ib(converter=lambda v: split(v.lower()))
    target_language = attr.ib()
    url = attr.ib()
    refs = attr.ib(converter=split_ids)
    pdf = attr.ib(converter=split_ids)
    note = attr.ib()
    pages = attr.ib()
    alias = attr.ib(converter=split)

    @property
    def path(self):
        """Return the path of the TSV file holding the concepts of this list."""
        if isinstance(self._api, Path):
            # Stand-alone list: `_api` already is the file.
            return self._api
        return self._api.data_path('conceptlists', self.id + '.tsv')

    @cached_property()
    def attributes(self):
        """
        Return the header columns of the TSV file which are not public `Concept` fields.

        Column names are compared lower-cased, but returned as spelled in the file.
        An empty list is returned if the file does not exist.
        """
        header = []
        if self.path.exists():
            with self.path.open(encoding='utf8') as fp:
                # Only the first line (the header row) is needed.
                header = fp.readline().strip().split('\t')
        return [h for h in header if h.lower() not in Concept.public_fields()]

    @cached_property()
    def concepts(self):
        """
        Read the concepts of this list from its TSV file.

        For each row, columns whose lower-cased name is a public `Concept` field are
        passed to `Concept` as keyword arguments; all other named columns are collected
        in the `attributes` dict. Columns with an empty name are dropped.

        :return: the result of `to_dict` applied to the list of `Concept` objects \
        (presumably a mapping keyed by concept ID - confirm against `to_dict`).
        """
        res = []
        if self.path.exists():
            for item in read_dicts(self.path):
                kw, attributes = {}, {}
                for k, v in item.items():
                    if k:
                        kl = k.lower()
                        setitem(
                            kw if kl in Concept.public_fields() else
                            attributes, kl, v)
                res.append(Concept(list=self, attributes=attributes, **kw))
        return to_dict(res)

    @classmethod
    def from_file(cls, path, **keywords):
        """
        Load a concept list from a file outside the Concepticon collection.

        :param path: location of the TSV file; its stem becomes the list ID.
        :param keywords: values for the public `Conceptlist` fields. Missing fields \
        default to `''`, `items` defaults to the number of rows read from the file and \
        `year` defaults to `0`.
        :return: a `Conceptlist` whose `_api` is the file path itself.
        :raises AssertionError: if `path` does not exist.
        """
        path = Path(path)
        assert path.exists()
        attrs = {f: keywords.get(f, '') for f in Conceptlist.public_fields()}
        attrs.update(id=path.stem,
                     items=keywords.get('items', len(read_dicts(path))),
                     year=keywords.get('year', 0))
        return cls(api=path, **attrs)
Ejemplo n.º 2
0
class Conceptlist(Bag):
    """
    A concept list: list-level metadata plus a TSV file of concepts, read through a
    CSVW table description.

    ``_api`` is either a ``Path`` pointing directly at the TSV file (as passed by
    :meth:`from_file`) or an API object providing ``data_path`` (and, when it has a
    ``repos`` attribute, ``path``) to locate files of the data repository.
    """
    _api = attr.ib()
    id = attr.ib(validator=valid_conceptlist_id)
    author = attr.ib()
    year = attr.ib(converter=partial(valid_int, 'YEAR'))
    list_suffix = attr.ib()
    items = attr.ib(converter=partial(valid_int, 'ITEMS'))
    tags = attr.ib(converter=split_ids, validator=valid_key)
    # Lower-cased before being passed to `split`.
    source_language = attr.ib(converter=lambda v: split(v.lower()))
    target_language = attr.ib()
    url = attr.ib()
    refs = attr.ib(converter=split_ids)
    pdf = attr.ib(converter=split_ids)
    note = attr.ib()
    pages = attr.ib()
    # `None` is mapped to an empty list rather than being passed to `split`.
    alias = attr.ib(converter=lambda s: [] if s is None else split(s))
    # Set to True by `from_file`; selects the "local" fallback metadata in `tg`.
    local = attr.ib(default=False)

    @lazyproperty
    def tg(self):
        """
        Return the `TableGroup` describing the TSV file of this list.

        The metadata file next to the TSV file is used if it exists. Otherwise, with an
        API object that has a `repos` attribute, the `local` metadata (for local lists,
        if present) or else the `default` metadata from `concepticondata/conceptlists`
        is used; without such an API object the `conceptlist-metadata.json` file next
        to this module is used. The URL of the first table is always pointed at
        `<id>.tsv`.
        """
        list_path = self.path
        md_path = list_path.parent.joinpath(list_path.name + MD_SUFFIX)
        if not md_path.exists():
            if not hasattr(self._api, 'repos'):
                md_path = Path(__file__).parent / 'conceptlist-metadata.json'
            else:
                data_dir = self._api.path('concepticondata')
                if self.local:
                    md_path = data_dir.joinpath('conceptlists', 'local' + MD_SUFFIX)
                if not md_path.exists():
                    md_path = data_dir.joinpath('conceptlists', 'default' + MD_SUFFIX)

        group = TableGroup.from_file(md_path)
        api = self._api
        if isinstance(api, Path):
            # Make the table group believe it was loaded from next to the TSV file.
            group._fname = api.parent.joinpath(api.name + MD_SUFFIX)
        group.tables[0].url = Link('{0}.tsv'.format(self.id))
        return group

    @lazyproperty
    def metadata(self):
        """Return the first table of the table group `tg`."""
        first_table = self.tg.tables[0]
        return first_table

    @property
    def path(self):
        """Return the path of the TSV file holding the concepts of this list."""
        api = self._api
        if not isinstance(api, Path):
            return api.data_path('conceptlists', self.id + '.tsv')
        return api

    @lazyproperty
    def cols_in_list(self):
        """Return the column names of the TSV file, taken from its first data row."""
        rows = reader(self.path, dicts=True, delimiter='\t')
        first_row = next(rows)
        return list(first_row.keys())

    @lazyproperty
    def attributes(self):
        """
        Return the names of the schema columns which are not public `Concept` fields.

        Names are compared lower-cased, but returned as spelled in the table schema.
        """
        names = []
        for column in self.metadata.tableSchema.columns:
            if column.name.lower() in Concept.public_fields():
                continue
            names.append(column.name)
        return names

    @lazyproperty
    def concepts(self):
        """
        Read the concepts of this list by iterating over the `metadata` table.

        For each row, columns whose lower-cased name is a public `Concept` field are
        passed to `Concept` as keyword arguments; all other named columns end up in the
        `attributes` dict. Columns with an empty name are dropped. If the TSV file does
        not exist, no rows are read.

        :return: the result of `to_dict` applied to the list of `Concept` objects.
        """
        collected = []
        if self.path.exists():
            for row in self.metadata:
                fields, extras = {}, {}
                for key, value in row.items():
                    if not key:
                        continue
                    lowered = key.lower()
                    if lowered in Concept.public_fields():
                        target = fields
                    else:
                        target = extras
                    setitem(target, lowered, value)
                collected.append(Concept(list=self, attributes=extras, **fields))
        return to_dict(collected)

    @classmethod
    def from_file(cls, path, **keywords):
        """
        Load a concept list from a file outside the Concepticon collection.

        Fields not given in `keywords` default to `''`; `items` defaults to the number
        of rows read from the file, `year` to `0`, and `local` is always `True`.

        @todo: add a uniqueness check here, see function read_dicts
        """
        path = Path(path)
        assert path.exists()
        init_kwargs = {}
        for field in Conceptlist.public_fields():
            init_kwargs[field] = keywords.get(field, '')
        init_kwargs['id'] = path.stem
        # The rows are always read, even if `items` was passed explicitly.
        row_count = len(read_dicts(path))
        init_kwargs['items'] = keywords.get('items', row_count)
        init_kwargs['year'] = keywords.get('year', 0)
        init_kwargs['local'] = True
        return cls(api=path, **init_kwargs)
Ejemplo n.º 3
0
def check(api=None):
    """
    Run the integrity checks on the Concepticon data and report problems.

    Problems are reported through the module-level `error` and `warning` helpers.

    :param api: the API object to check. If falsy, one is created for `REPOS_PATH`; \
    if that path does not exist, nothing is checked and `None` is returned.
    :return: the module-level `SUCCESS` flag (presumably cleared by `error` - confirm).
    """
    if not api:
        if not REPOS_PATH.exists():
            return  # pragma: no cover
        api = Concepticon(REPOS_PATH)

    # Every cite key that is used somewhere ends up in this set.
    cited = set()
    for meta in api.metadata.values():
        schema_cols = {col['name'] for col in meta.meta['tableSchema']['columns']}
        first_row = list(meta.values.values())[0]
        if not set(first_row).issubset(schema_cols):  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        # Rows are reported by their line number in the file (header is line 1).
        for lineno, row in enumerate(meta.values.values(), start=2):
            if set(row.keys()) != schema_cols:  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(meta.id, lineno), 'name')
        cited.update(split(meta.meta.get('dc:references') or ''))

    # Concept lists may only reference records of the BibTeX file references.bib.
    for lineno, cl in enumerate(api.conceptlists.values(), start=2):
        for key in cl.refs:
            if key not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(key), 'conceptlists.tsv', lineno)
            cited.add(key)
        cited.update(re.findall(BIB_PATTERN, cl.note))

        # A source without an accompanying PDF is only worth a warning.
        for key in cl.pdf:
            if key not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(key), 'conceptlists.tsv')
    cited.add('List2016a')

    for key in api.bibliography:
        if key in cited:
            continue
        error('unused bibtex record: {0}'.format(key), 'references.bib')  # pragma: no cover

    # Valid values for the columns referencing concept sets.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in api.conceptsets.values()},
    }

    relation_columns = (
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    )
    for index, rel in enumerate(api.relations.raw):
        for column, kind in relation_columns:
            value = rel[column]
            if value not in ref_cols[kind]:  # pragma: no cover
                error('invalid {0}: {1}'.format(column, value), 'conceptrelations', index + 2)

    # Every TSV file in the conceptlists directory must be registered.
    for tsv in api.data_path('conceptlists').glob('*.tsv'):
        if tsv.stem in api.conceptlists:
            continue
        error(  # pragma: no cover
            'conceptlist missing in conceptlists.tsv: {0}'.format(tsv.name), '')

    for cl in api.conceptlists.values():
        for index, concept in enumerate(cl.concepts.values()):
            lineno = index + 2
            if index == 0:  # pragma: no cover
                # The presence of the language columns is only checked once per list.
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            for lg in cl.source_language:  # pragma: no cover
                lang = lg.lower()
                if concept.attributes.get(lang):
                    continue
                if getattr(concept, lang, None):
                    continue
                if lang == 'english' and not concept.gloss:
                    continue
                error('missing source language translation %s' % lg, cl.id, lineno)

            for field, allowed in ref_cols.items():
                val = getattr(concept, field)
                if not val:
                    continue
                if val not in allowed:  # pragma: no cover
                    error('invalid value for %s: %s' % (field, val), cl.id, lineno)

    # Group concept sets linked by a "sameas" relation; glosses must be unique.
    sameas = {}
    seen_glosses = set()
    for cs in api.conceptsets.values():
        if cs.gloss in seen_glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        seen_glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel != 'sameas':
                continue
            known = next((grp for grp in sameas.values() if target in grp), None)
            if known is None:
                sameas[cs.gloss] = {cs.id, target}
            else:  # pragma: no cover
                known.add(cs.id)

    # Within each group, all but the numerically smallest ID count as deprecated.
    deprecated = {}
    for members in sameas.values():
        ordered = sorted(members, key=int)
        for duplicate in ordered[1:]:
            assert duplicate not in deprecated
            deprecated[duplicate] = ordered[0]

    for cl in api.conceptlists.values():
        for concept in cl.concepts.values():
            if concept.concepticon_id not in deprecated:
                continue
            error('deprecated concept set {0} linked for {1}'.format(  # pragma: no cover
                concept.concepticon_id, concept.id), cl.id)

    return SUCCESS
Ejemplo n.º 4
0
    def check(self, *clids):
        """
        Run the consistency checks on the data and print all errors found.

        Errors are collected in a local list and printed at the end - or right after
        the initial structural check of the concept list records, if that already
        fails. Warnings are emitted via `warnings.warn`.

        :param clids: IDs of concept lists. If given, the per-list checks are \
        restricted to these lists and unused bibliography records are not reported.
        :return: `True` if no error was recorded, `False` otherwise.
        :raises AssertionError: if `self.retirements` is falsy, if a `map-*.tsv` file \
        in the mappings directory is named for a code not in the known ISO codes, or \
        if a concept set ID would be deprecated twice.
        """
        errors = []
        assert self.retirements
        print('testing {0} concept lists'.format(
            len(clids) if clids else len(self.conceptlists)))

        # Format a message as "<TYPE>:<name>[:<line>]: <msg>".
        def _msg(type_, msg, name, line):  # pragma: no cover
            if line:
                line = ':%s' % line
            return '%s:%s%s: %s' % (type_.upper(), name, line or '', msg)

        # Errors are only recorded here; they are printed by `exit` below.
        def error(msg, name, line=0):  # pragma: no cover
            errors.append((msg, name, line))

        def warning(msg, name, line=0):  # pragma: no cover
            warnings.warn(_msg('warning', msg, name, line), Warning)

        # Structural check: every selected record must be usable to instantiate a
        # Conceptlist, i.e. pass its converters and validators.
        # NOTE(review): the reported line is the 1-based index of the record, while
        # the checks below use `i + 2` - confirm whether a header offset is intended.
        for i, d in enumerate(self.conceptlists_dicts, start=1):
            if (not clids) or d['ID'] in clids:
                try:
                    Conceptlist(api=self, **lowercase(d))
                except ValueError as e:  # pragma: no cover
                    error(str(e), 'conceptlists.tsv', i)

        # Print all recorded errors and compute the return value of `check`.
        # Note: this local function shadows the builtin `exit` within `check`.
        def exit():
            for msg, name, line in errors:
                print(_msg('error', msg, name, line))
            return not bool(errors)

        if errors:  # pragma: no cover
            return exit()  # Exit early in case of structural errors.

        # Matches "(:ref:...)" or "(:bib:...)" not directly preceded by "]".
        REF_WITHOUT_LABEL_PATTERN = re.compile(
            r'[^]]\(:(ref|bib):[A-Za-z0-9\-]+\)')
        # Matches ":ref:..." or ":bib:..." not directly preceded by "(".
        REF_WITHOUT_LINK_PATTERN = re.compile('[^(]:(ref|bib):[A-Za-z0-9-]+')

        # Make sure all language-specific mappings are well specified
        iso_langs = [
            lang.iso2 for lang in self.vocabularies['COLUMN_TYPES'].values()
            if isinstance(lang, Languoid) and lang.iso2
        ]
        if len(iso_langs) != len(set(iso_langs)):
            error(
                'Duplicate ISO codes: {}'.format(
                    collections.Counter(iso_langs).most_common(1)),
                'concepticon.json')
        # The second "-"-separated part of each mapping file's stem must be one of
        # the ISO codes collected above.
        assert set(p.stem.split('-')[1] for p in self.path('mappings').glob('map-*.tsv'))\
            .issubset(iso_langs)

        # We collect all cite keys used to refer to references.
        all_refs = set()
        refs_in_bib = set(ref for ref in self.bibliography)
        for meta in self.metadata.values():
            cnames_schema = set(var['name']
                                for var in meta.meta['tableSchema']['columns'])
            # Column names as found in the first row of values.
            cnames_tsv = set(list(meta.values.values())[0])
            if cnames_tsv - cnames_schema:  # pragma: no cover
                error(
                    'column names in {0} but not in json-specs'.format(
                        meta.id), 'name')
            for i, (key, value) in enumerate(meta.values.items()):
                if set(value.keys()) != cnames_schema:  # pragma: no cover
                    error(
                        'meta data {0} contains irregular number of columns in line {1}'
                        .format(meta.id, i + 2), 'name')
                if key not in self.conceptsets:  # pragma: no cover
                    error(
                        'meta data {0} references invalid CONCEPTICON_ID {2} in line {1}'
                        .format(meta.id, i + 2, key), 'name')
            for ref in split(meta.meta.get('dc:references') or ''):
                if ref not in refs_in_bib:  # pragma: no cover
                    error('cited bibtex record not in bib: {0}'.format(ref),
                          'name')
                all_refs.add(ref)

        # Make sure only records in the BibTeX file references.bib are referenced by
        # concept lists.
        for i, cl in enumerate(self.conceptlists.values()):
            if clids and cl.id not in clids:
                continue  # pragma: no cover
            # File name and line number used for all errors about this list's record.
            fl = ('conceptlists.tsv', i + 2)
            for ref in re.findall(BIB_PATTERN, cl.note) + cl.refs:
                if ref not in refs_in_bib:
                    error('cited bibtex record not in bib: {0}'.format(ref),
                          *fl)
                else:
                    all_refs.add(ref)

            for m in REF_WITHOUT_LABEL_PATTERN.finditer(cl.note):
                error(
                    'link without label: {0}'.format(
                        m.string[m.start():m.end()]), *fl)

            for m in REF_WITHOUT_LINK_PATTERN.finditer(
                    cl.note):  # pragma: no cover
                error(
                    'reference not in link: {0}'.format(
                        m.string[m.start():m.end()]), *fl)

            # References to other concept lists in the note must point to known lists.
            for m in REF_PATTERN.finditer(cl.note):
                if m.group('id') not in self.conceptlists:  # pragma: no cover
                    error('invalid conceptlist ref: {0}'.format(m.group('id')),
                          *fl)

            # make also sure that all sources are accompanied by a PDF, but only write a
            # warning if this is not the case
            for ref in cl.pdf:
                if ref not in self.sources:  # pragma: no cover
                    warning('no PDF found for {0}'.format(ref),
                            'conceptlists.tsv')
        # This key is always counted as used, so it is never reported as unused below.
        all_refs.add('List2016a')

        if not clids:
            # Only report unused references if we check all concept lists!
            for ref in refs_in_bib - all_refs:  # pragma: no cover
                error('unused bibtex record: {0}'.format(ref),
                      'references.bib')

        # Valid values for the attributes/columns referencing concept sets.
        ref_cols = {
            'concepticon_id': set(self.conceptsets.keys()),
            'concepticon_gloss':
            set(cs.gloss for cs in self.conceptsets.values()),
        }

        # Both ends of every relation must name a known concept set ID and gloss.
        for i, rel in enumerate(self.relations.raw):
            for attr, type_ in [
                ('SOURCE', 'concepticon_id'),
                ('TARGET', 'concepticon_id'),
                ('SOURCE_GLOSS', 'concepticon_gloss'),
                ('TARGET_GLOSS', 'concepticon_gloss'),
            ]:
                if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                    error('invalid {0}: {1}'.format(attr, rel[attr]),
                          'conceptrelations', i + 2)

        # Every TSV file in the conceptlists directory must be a known concept list.
        for fname in self.data_path('conceptlists').glob('*.tsv'):
            if clids and fname.stem not in clids:
                continue  # pragma: no cover
            if fname.stem not in self.conceptlists:  # pragma: no cover
                error(
                    'conceptlist missing in conceptlists.tsv: {0}'.format(
                        fname.name), '')

        # IDs of concept lists whose concepts could not be read (TypeError below).
        broken_cls = []

        for cl in self.conceptlists.values():
            if clids and cl.id not in clids:
                continue  # pragma: no cover
            #
            # Check consistency between the csvw metadata and the column names in the list.
            #
            missing_in_md, missing_in_list = [], []
            cols_in_md = []
            for col in cl.metadata.tableSchema.columns:
                cnames = [
                ]  # all names or aliases csvw will recognize for this column
                if col.name in cols_in_md:  # pragma: no cover
                    error(
                        'Duplicate name ot title in table schema: {0}'.format(
                            col.name), cl.id)
                cnames.append(col.name)
                if col.titles:
                    # Only the first title is taken into account.
                    c = col.titles.getfirst()
                    if c in cols_in_md:  # pragma: no cover
                        error(
                            'Duplicate name ot title in table schema: {0}'.
                            format(c), cl.id)
                    cnames.append(c)
                cols_in_md.extend(cnames)
                if not any(name in cl.cols_in_list for name in cnames):
                    # Neither name nor title of the column is in the actual list header.
                    missing_in_list.append(col.name)
            for col in cl.cols_in_list:
                if col not in cols_in_md:
                    missing_in_md.append(col)

            for col in missing_in_list:
                error(
                    'Column in metadata but missing in list: {0}'.format(col),
                    cl.id)
            for col in missing_in_md:
                error(
                    'Column in list but missing in metadata: {0}'.format(col),
                    cl.id)

            # A TypeError raised anywhere while reading or checking the concepts marks
            # the whole list as broken; it is then skipped in the final loop below.
            try:
                # Now check individual concepts:
                for i, concept in enumerate(cl.concepts.values()):
                    if not concept.id.startswith(cl.id):  # pragma: no cover
                        error(
                            'concept ID does not match concept list ID pattern %s'
                            % concept.id, cl.id)

                    # A linked concept set must exist and its gloss must match.
                    if concept.concepticon_id:
                        cs = self.conceptsets.get(concept.concepticon_id)
                        if not cs:  # pragma: no cover
                            error(
                                'invalid conceptset ID %s' %
                                concept.concepticon_id, cl.id)
                        elif cs.gloss != concept.concepticon_gloss:  # pragma: no cover
                            error(
                                'wrong conceptset GLOSS for ID {0}: {1} -> {2}'
                                .format(cs.id, concept.concepticon_gloss,
                                        cs.gloss), cl.id)

                    # The presence of the language columns is only checked for the
                    # first concept of each list.
                    if i == 0:  # pragma: no cover
                        for lg in cl.source_language:
                            if lg.lower() not in concept.cols:
                                error(
                                    'missing source language col %s' %
                                    lg.upper(), cl.id)

                    # A translation may come from the extra attributes or from an
                    # attribute of the concept; for "english" a concept without gloss
                    # is not reported either.
                    for lg in cl.source_language:  # pragma: no cover
                        if not (concept.attributes.get(lg.lower())
                                or getattr(concept, lg.lower(), None) or
                                (lg.lower() == 'english'
                                 and not concept.gloss)):
                            error(
                                'missing source language translation %s' % lg,
                                cl.id, i + 2)
                    for attr, values in ref_cols.items():
                        val = getattr(concept, attr)
                        if val:
                            # check that there are not leading and trailing spaces
                            # (while computationally expensive, this helps catch really
                            # hard to find typos)
                            if val != val.strip():  # pragma: no cover
                                error(
                                    "leading or trailing spaces in value for %s: '%s'"
                                    % (attr, val), cl.id, i + 2)

                            if val not in values:  # pragma: no cover
                                error('invalid value for %s: %s' % (attr, val),
                                      cl.id, i + 2)
            except TypeError as e:  # pragma: no cover
                broken_cls.append(cl.id)
                error(str(e), cl.id)

        # Group concept sets linked by a "sameas" relation, keyed by the gloss of the
        # concept set which started the group; glosses must be unique.
        sameas = {}
        glosses = set()
        for cs in self.conceptsets.values():
            if cs.gloss in glosses:  # pragma: no cover
                error('duplicate conceptset gloss: {0}'.format(cs.gloss),
                      cs.id)
            glosses.add(cs.gloss)
            for target, rel in cs.relations.items():
                if rel == 'sameas':
                    for group in sameas.values():
                        if target in group:  # pragma: no cover
                            group.add(cs.id)
                            break
                    else:
                        # No existing group contains the target: start a new one.
                        sameas[cs.gloss] = {cs.id, target}

        # Within each group, every ID but the numerically smallest one is deprecated
        # in favour of the smallest.
        deprecated = {}
        for s in sameas.values():
            csids = sorted(s, key=lambda j: int(j))
            for csid in csids[1:]:
                assert csid not in deprecated
                deprecated[csid] = csids[0]

        # NOTE(review): unlike the loops above, this one is not restricted to `clids`
        # - confirm whether that is intended.
        for cl in self.conceptlists.values():
            if cl.id in broken_cls:
                continue  # pragma: no cover
            for concept in cl.concepts.values():
                if concept.concepticon_id in deprecated:  # pragma: no cover
                    error(
                        'deprecated concept set {0} linked for {1}'.format(
                            concept.concepticon_id, concept.id), cl.id)

        return exit()
Ejemplo n.º 5
0
def test():
    """
    Run the integrity checks on the Concepticon data found at `REPOS_PATH`.

    Nothing is checked if `REPOS_PATH` does not exist. Problems are reported through
    the module-level `error` and `warning` helpers.

    :raises ValueError: if the module-level `SUCCESS` flag is falsy after all checks \
    have run (presumably cleared by `error` - confirm).
    """
    if not REPOS_PATH.exists():
        return  # pragma: no cover

    api = Concepticon(REPOS_PATH)

    # Every cite key that is used somewhere ends up in this set.
    cited = set()
    for meta in api.metadata.values():
        schema_cols = {col['name'] for col in meta.meta['tableSchema']['columns']}
        first_row = list(meta.values.values())[0]
        if not set(first_row) <= schema_cols:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        # Rows are reported by their line number in the file (header is line 1).
        for lineno, row in enumerate(meta.values.values(), start=2):
            if set(row.keys()) != schema_cols:  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(meta.id, lineno), 'name')
        cited.update(split(meta.meta.get('dc:references') or ''))

    # Concept lists may only reference records of the BibTeX file references.bib.
    for lineno, cl in enumerate(api.conceptlists.values(), start=2):
        for key in cl.refs:
            if key not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(key), 'conceptlists.tsv', lineno)
            cited.add(key)
        cited.update(re.findall(BIB_PATTERN, cl.note))

        # A source without an accompanying PDF is only worth a warning.
        for key in cl.pdf:
            if key not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(key), 'conceptlists.tsv')

    for key in api.bibliography:
        if key in cited:
            continue
        error('unused bibtex record: {0}'.format(key), 'references.bib')  # pragma: no cover

    # Valid values for the columns referencing concept sets.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in api.conceptsets.values()},
    }

    relation_columns = (
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    )
    for index, rel in enumerate(api.relations.raw):
        for column, kind in relation_columns:
            value = rel[column]
            if value not in ref_cols[kind]:  # pragma: no cover
                error('invalid {0}: {1}'.format(column, value), 'conceptrelations', index + 2)

    # Every TSV file in the conceptlists directory must be registered.
    for tsv in api.data_path('conceptlists').glob('*.tsv'):
        if tsv.stem in api.conceptlists:
            continue
        error(  # pragma: no cover
            'conceptlist missing in conceptlists.tsv: {0}'.format(tsv.name), '')

    for cl in api.conceptlists.values():
        for index, concept in enumerate(cl.concepts.values()):
            lineno = index + 2
            if index == 0:  # pragma: no cover
                # The presence of the language columns is only checked once per list.
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            for lg in cl.source_language:  # pragma: no cover
                lang = lg.lower()
                if concept.attributes.get(lang):
                    continue
                if getattr(concept, lang, None):
                    continue
                error('missing source language translation %s' % lg, cl.id, lineno)

            for field, allowed in ref_cols.items():
                val = getattr(concept, field)
                if not val:
                    continue
                if val not in allowed:  # pragma: no cover
                    error('invalid value for %s: %s' % (field, val), cl.id, lineno)

    if SUCCESS:
        return
    raise ValueError('integrity checks failed!')  # pragma: no cover