class Conceptlist(Bag):
    """
    A concept list: one record of list metadata plus the concepts read from its TSV file.

    ``_api`` is either a ``Path`` pointing directly at the TSV file, or an object whose
    ``data_path`` method is used to locate ``conceptlists/<id>.tsv``.
    """
    _api = attr.ib()
    id = attr.ib()
    author = attr.ib()
    year = attr.ib(convert=int)
    list_suffix = attr.ib()
    items = attr.ib(convert=int)
    tags = attr.ib(convert=split_ids, validator=valid_key)
    # The raw value is lowercased before being split.
    source_language = attr.ib(convert=lambda v: split(v.lower()))
    target_language = attr.ib()
    url = attr.ib()
    refs = attr.ib(convert=split_ids)
    pdf = attr.ib(convert=split_ids)
    note = attr.ib()
    pages = attr.ib()
    alias = attr.ib(convert=split)

    @property
    def path(self):
        """
        Location of the TSV file holding the concepts of this list.
        """
        return (
            self._api if isinstance(self._api, Path)
            else self._api.data_path('conceptlists', self.id + '.tsv'))

    @cached_property()
    def attributes(self):
        """
        Header names of the TSV file which are not public fields of `Concept`.

        :return: `list` of column names; empty if the file does not exist.
        """
        if not self.path.exists():
            return []
        with self.path.open(encoding='utf8') as fp:
            columns = fp.readline().strip().split('\t')
        extra = []
        for name in columns:
            if name.lower() not in Concept.public_fields():
                extra.append(name)
        return extra

    @cached_property()
    def concepts(self):
        """
        The concepts of this list, as returned by ``to_dict``.

        Each row of the TSV file becomes one `Concept`. Values of columns whose lowercased
        name is a public `Concept` field are passed as keyword arguments, all other values
        are collected in the ``attributes`` dict. Columns with an empty name are ignored.
        """
        if not self.path.exists():
            return to_dict([])

        collected = []
        for row in read_dicts(self.path):
            fields, extras = {}, {}
            for column, value in row.items():
                if not column:
                    continue
                key = column.lower()
                target = fields if key in Concept.public_fields() else extras
                setitem(target, key, value)
            collected.append(Concept(list=self, attributes=extras, **fields))
        return to_dict(collected)

    @classmethod
    def from_file(cls, path, **keywords):
        """
        Load a concept list from a file outside the Concepticon collection.

        :param path: Path of the TSV file; it must exist.
        :param keywords: Values for the public fields of the list. Missing fields default \
        to ``''``, ``items`` to the number of rows in the file and ``year`` to ``0``. The \
        ``id`` is always taken from the stem of the file name.
        :return: `Conceptlist` instance with the file path as ``_api``.
        """
        list_path = Path(path)
        assert list_path.exists()
        values = {}
        for field in Conceptlist.public_fields():
            values[field] = keywords.get(field, '')
        values['id'] = list_path.stem
        values['items'] = keywords.get('items', len(read_dicts(list_path)))
        values['year'] = keywords.get('year', 0)
        return cls(api=list_path, **values)
class Conceptlist(Bag):
    """
    A concept list: one record of list metadata, the table metadata loaded through
    `TableGroup`, and the concepts read from its TSV file.

    ``_api`` is either a ``Path`` pointing directly at the TSV file, or an object whose
    ``data_path`` method is used to locate ``conceptlists/<id>.tsv``.
    """
    _api = attr.ib()
    id = attr.ib(validator=valid_conceptlist_id)
    author = attr.ib()
    year = attr.ib(converter=partial(valid_int, 'YEAR'))
    list_suffix = attr.ib()
    items = attr.ib(converter=partial(valid_int, 'ITEMS'))
    tags = attr.ib(converter=split_ids, validator=valid_key)
    # The raw value is lowercased before being split.
    source_language = attr.ib(converter=lambda v: split(v.lower()))
    target_language = attr.ib()
    url = attr.ib()
    refs = attr.ib(converter=split_ids)
    pdf = attr.ib(converter=split_ids)
    note = attr.ib()
    pages = attr.ib()
    # A missing alias (None) is turned into an empty list.
    alias = attr.ib(converter=lambda s: [] if s is None else split(s))
    # Set to True by `from_file`.
    local = attr.ib(default=False)

    @lazyproperty
    def tg(self):
        """
        The `TableGroup` describing the TSV file of this list.

        The metadata file is looked up next to the TSV file (file name + ``MD_SUFFIX``). If
        that does not exist, a fallback is chosen:

        - if ``_api`` has a ``repos`` attribute: ``local`` + ``MD_SUFFIX`` in the
          ``conceptlists`` directory of ``concepticondata`` for local lists, and
          ``default`` + ``MD_SUFFIX`` whenever the candidate so far does not exist;
        - otherwise ``conceptlist-metadata.json`` next to this module.

        The URL of the first table is always set to ``<id>.tsv``.
        """
        md = self.path.parent.joinpath(self.path.name + MD_SUFFIX)
        if not md.exists():
            if not hasattr(self._api, 'repos'):
                md = Path(__file__).parent / 'conceptlist-metadata.json'
            else:
                data_dir = self._api.path('concepticondata')
                if self.local:
                    md = data_dir.joinpath('conceptlists', 'local' + MD_SUFFIX)
                if not md.exists():
                    md = data_dir.joinpath('conceptlists', 'default' + MD_SUFFIX)

        group = TableGroup.from_file(md)
        if isinstance(self._api, Path):
            # For a list given as plain file, point the group at a metadata file name
            # next to that file.
            group._fname = self._api.parent.joinpath(self._api.name + MD_SUFFIX)
        group.tables[0].url = Link('{0}.tsv'.format(self.id))
        return group

    @lazyproperty
    def metadata(self):
        """
        The first table of `tg`.
        """
        tables = self.tg.tables
        return tables[0]

    @property
    def path(self):
        """
        Location of the TSV file holding the concepts of this list.
        """
        return (
            self._api if isinstance(self._api, Path)
            else self._api.data_path('conceptlists', self.id + '.tsv'))

    @lazyproperty
    def cols_in_list(self):
        """
        Column names of the TSV file, taken from the keys of its first row.
        """
        first_row = next(reader(self.path, dicts=True, delimiter='\t'))
        return list(first_row.keys())

    @lazyproperty
    def attributes(self):
        """
        Names of the columns in the table schema which are not public fields of `Concept`.
        """
        names = []
        for column in self.metadata.tableSchema.columns:
            if column.name.lower() not in Concept.public_fields():
                names.append(column.name)
        return names

    @lazyproperty
    def concepts(self):
        """
        The concepts of this list, as returned by ``to_dict``.

        Each row yielded by `metadata` becomes one `Concept`. Values of columns whose
        lowercased name is a public `Concept` field are passed as keyword arguments, all
        other values are collected in the ``attributes`` dict. Columns with an empty name
        are ignored. Nothing is read if the TSV file does not exist.
        """
        if not self.path.exists():
            return to_dict([])

        collected = []
        for row in self.metadata:
            fields, extras = {}, {}
            for column, value in row.items():
                if not column:
                    continue
                key = column.lower()
                target = fields if key in Concept.public_fields() else extras
                setitem(target, key, value)
            collected.append(Concept(list=self, attributes=extras, **fields))
        return to_dict(collected)

    @classmethod
    def from_file(cls, path, **keywords):
        """
        Load a concept list from a file outside the Concepticon collection.

        TODO: add a uniqueness check here, see function read_dicts.

        :param path: Path of the TSV file; it must exist.
        :param keywords: Values for the public fields of the list. Missing fields default \
        to ``''``, ``items`` to the number of rows in the file and ``year`` to ``0``. The \
        ``id`` is always taken from the stem of the file name and ``local`` is always ``True``.
        :return: `Conceptlist` instance with the file path as ``_api``.
        """
        list_path = Path(path)
        assert list_path.exists()
        values = {}
        for field in Conceptlist.public_fields():
            values[field] = keywords.get(field, '')
        values['id'] = list_path.stem
        values['items'] = keywords.get('items', len(read_dicts(list_path)))
        values['year'] = keywords.get('year', 0)
        values['local'] = True
        return cls(api=list_path, **values)
def check(api=None):
    """
    Run the consistency checks on the Concepticon data.

    Problems are reported through the ``error`` and ``warning`` helpers defined elsewhere
    in this module.

    :param api: Object giving access to the data. If falsy, a `Concepticon` for \
    ``REPOS_PATH`` is created; if that path does not exist, nothing is checked and \
    ``None`` is returned.
    :return: The module-level ``SUCCESS`` value.
    """
    if not api:
        if not REPOS_PATH.exists():
            return  # pragma: no cover
        api = Concepticon(REPOS_PATH)

    # We collect all cite keys used to refer to references.
    cited = set()
    for meta in api.metadata.values():
        schema_names = {column['name'] for column in meta.meta['tableSchema']['columns']}
        first_row_names = set(list(meta.values.values())[0])
        if first_row_names - schema_names:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for line_no, row in enumerate(meta.values.values(), start=2):
            if set(row.keys()) != schema_names:  # pragma: no cover
                error(
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(meta.id, line_no),
                    'name')
        cited.update(split(meta.meta.get('dc:references') or ''))

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for line_no, cl in enumerate(api.conceptlists.values(), start=2):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(ref), 'conceptlists.tsv', line_no)
            cited.add(ref)
        # References cited in the free-text note count as used, too.
        cited.update(re.findall(BIB_PATTERN, cl.note))

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')
    # This key is always treated as cited.
    cited.add('List2016a')

    for ref in api.bibliography:
        if ref not in cited:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for the columns referencing concept sets.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in api.conceptsets.values()},
    }

    relation_columns = (
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    )
    for line_no, rel in enumerate(api.relations.raw, start=2):
        for column, kind in relation_columns:
            if rel[column] not in ref_cols[kind]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(column, rel[column]),
                    'conceptrelations',
                    line_no)

    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error('conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for index, concept in enumerate(cl.concepts.values()):
            line_no = index + 2
            if index == 0:  # pragma: no cover
                # The presence of the source language columns is checked once per list.
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            for lg in cl.source_language:  # pragma: no cover
                key = lg.lower()
                translated = (
                    concept.attributes.get(key) or
                    getattr(concept, key, None) or
                    (key == 'english' and not concept.gloss))
                if not translated:
                    error('missing source language translation %s' % lg, cl.id, line_no)

            for field, allowed in ref_cols.items():
                value = getattr(concept, field)
                if not value:
                    continue
                if value not in allowed:  # pragma: no cover
                    error('invalid value for %s: %s' % (field, value), cl.id, line_no)

    # Group concept sets which are linked by a "sameas" relation.
    sameas = {}
    seen_glosses = set()
    for cs in api.conceptsets.values():
        if cs.gloss in seen_glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        seen_glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel != 'sameas':
                continue
            for group in sameas.values():
                if target in group:  # pragma: no cover
                    group.add(cs.id)
                    break
            else:
                # No existing group contains the target: open a new one.
                sameas[cs.gloss] = {cs.id, target}

    # Within each group, all but the numerically smallest ID are considered deprecated.
    deprecated = {}
    for group in sameas.values():
        ordered = sorted(group, key=int)
        kept = ordered[0]
        for csid in ordered[1:]:
            assert csid not in deprecated
            deprecated[csid] = kept

    for cl in api.conceptlists.values():
        for concept in cl.concepts.values():
            if concept.concepticon_id in deprecated:  # pragma: no cover
                error(
                    'deprecated concept set {0} linked for {1}'.format(
                        concept.concepticon_id, concept.id),
                    cl.id)

    return SUCCESS
def check(self, *clids):
    """
    Check the integrity of the data and print all errors which were found.

    :param clids: IDs of the concept lists to check. If none are given, all concept lists \
    are checked and unused BibTeX records are reported as well.
    :return: ``True`` if no error was recorded, ``False`` otherwise.
    :raises AssertionError: If ``self.retirements`` is falsy, if a mapping file exists for \
    a language code that is not among the collected ISO codes, or if a concept set would be \
    deprecated twice.
    """
    errors = []
    assert self.retirements
    print('testing {0} concept lists'.format(
        len(clids) if clids else len(self.conceptlists)))

    def _msg(type_, msg, name, line):  # pragma: no cover
        # Format as "TYPE:name[:line]: message".
        if line:
            line = ':%s' % line
        return '%s:%s%s: %s' % (type_.upper(), name, line or '', msg)

    def error(msg, name, line=0):  # pragma: no cover
        # Errors are only collected here; they are printed by `_report`.
        errors.append((msg, name, line))

    def warning(msg, name, line=0):  # pragma: no cover
        warnings.warn(_msg('warning', msg, name, line), Warning)

    def _report():
        # Print the collected errors and compute the return value of the check.
        for msg, name, line in errors:
            print(_msg('error', msg, name, line))
        return not bool(errors)

    # Instantiating the lists runs the validators of the metadata fields.
    for i, d in enumerate(self.conceptlists_dicts, start=1):
        if (not clids) or d['ID'] in clids:
            try:
                Conceptlist(api=self, **lowercase(d))
            except ValueError as e:  # pragma: no cover
                error(str(e), 'conceptlists.tsv', i)

    if errors:  # pragma: no cover
        return _report()  # Exit early in case of structural errors.

    REF_WITHOUT_LABEL_PATTERN = re.compile(r'[^]]\(:(ref|bib):[A-Za-z0-9\-]+\)')
    REF_WITHOUT_LINK_PATTERN = re.compile('[^(]:(ref|bib):[A-Za-z0-9-]+')

    # Make sure all language-specific mappings are well specified
    iso_langs = [
        lang.iso2 for lang in self.vocabularies['COLUMN_TYPES'].values()
        if isinstance(lang, Languoid) and lang.iso2
    ]
    if len(iso_langs) != len(set(iso_langs)):
        error(
            'Duplicate ISO codes: {}'.format(collections.Counter(iso_langs).most_common(1)),
            'concepticon.json')

    # Every mapping file "map-<code>.tsv" must belong to one of the known ISO codes.
    assert set(p.stem.split('-')[1] for p in self.path('mappings').glob('map-*.tsv'))\
        .issubset(iso_langs)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    refs_in_bib = set(self.bibliography)
    for meta in self.metadata.values():
        cnames_schema = set(var['name'] for var in meta.meta['tableSchema']['columns'])
        cnames_tsv = set(list(meta.values.values())[0])
        if cnames_tsv - cnames_schema:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for i, (key, value) in enumerate(meta.values.items()):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                error(
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(meta.id, i + 2), 'name')
            if key not in self.conceptsets:  # pragma: no cover
                error(
                    'meta data {0} references invalid CONCEPTICON_ID {2} in line {1}'
                    .format(meta.id, i + 2, key), 'name')
        for ref in split(meta.meta.get('dc:references') or ''):
            if ref not in refs_in_bib:  # pragma: no cover
                error('cited bibtex record not in bib: {0}'.format(ref), 'name')
            all_refs.add(ref)

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for i, cl in enumerate(self.conceptlists.values()):
        if clids and cl.id not in clids:
            continue  # pragma: no cover
        fl = ('conceptlists.tsv', i + 2)
        for ref in re.findall(BIB_PATTERN, cl.note) + cl.refs:
            if ref not in refs_in_bib:
                error('cited bibtex record not in bib: {0}'.format(ref), *fl)
            else:
                all_refs.add(ref)

        for m in REF_WITHOUT_LABEL_PATTERN.finditer(cl.note):
            error('link without label: {0}'.format(m.group(0)), *fl)

        for m in REF_WITHOUT_LINK_PATTERN.finditer(cl.note):  # pragma: no cover
            error('reference not in link: {0}'.format(m.group(0)), *fl)

        for m in REF_PATTERN.finditer(cl.note):
            if m.group('id') not in self.conceptlists:  # pragma: no cover
                error('invalid conceptlist ref: {0}'.format(m.group('id')), *fl)

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in self.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')
    # This key is always treated as cited.
    all_refs.add('List2016a')

    if not clids:
        # Only report unused references if we check all concept lists!
        for ref in refs_in_bib - all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for the columns referencing concept sets.
    ref_cols = {
        'concepticon_id': set(self.conceptsets.keys()),
        'concepticon_gloss': set(cs.gloss for cs in self.conceptsets.values()),
    }

    for i, rel in enumerate(self.relations.raw):
        for column, type_ in [
            ('SOURCE', 'concepticon_id'),
            ('TARGET', 'concepticon_id'),
            ('SOURCE_GLOSS', 'concepticon_gloss'),
            ('TARGET_GLOSS', 'concepticon_gloss'),
        ]:
            if rel[column] not in ref_cols[type_]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(column, rel[column]), 'conceptrelations', i + 2)

    for fname in self.data_path('conceptlists').glob('*.tsv'):
        if clids and fname.stem not in clids:
            continue  # pragma: no cover
        if fname.stem not in self.conceptlists:  # pragma: no cover
            error('conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    # IDs of lists whose concepts could not be read; they are skipped further below.
    broken_cls = []

    for cl in self.conceptlists.values():
        if clids and cl.id not in clids:
            continue  # pragma: no cover
        #
        # Check consistency between the csvw metadata and the column names in the list.
        #
        missing_in_md, missing_in_list = [], []
        cols_in_md = []
        for col in cl.metadata.tableSchema.columns:
            cnames = []  # all names or aliases csvw will recognize for this column
            if col.name in cols_in_md:  # pragma: no cover
                error(
                    'Duplicate name or title in table schema: {0}'.format(col.name), cl.id)
            cnames.append(col.name)
            if col.titles:
                c = col.titles.getfirst()
                if c in cols_in_md:  # pragma: no cover
                    error('Duplicate name or title in table schema: {0}'.format(c), cl.id)
                cnames.append(c)
            cols_in_md.extend(cnames)
            if not any(name in cl.cols_in_list for name in cnames):
                # Neither name nor title of the column is in the actual list header.
                missing_in_list.append(col.name)

        for col in cl.cols_in_list:
            if col not in cols_in_md:
                missing_in_md.append(col)

        for col in missing_in_list:
            error('Column in metadata but missing in list: {0}'.format(col), cl.id)
        for col in missing_in_md:
            error('Column in list but missing in metadata: {0}'.format(col), cl.id)

        try:
            # Now check individual concepts:
            for i, concept in enumerate(cl.concepts.values()):
                if not concept.id.startswith(cl.id):  # pragma: no cover
                    error(
                        'concept ID does not match concept list ID pattern %s' % concept.id,
                        cl.id)

                if concept.concepticon_id:
                    cs = self.conceptsets.get(concept.concepticon_id)
                    if not cs:  # pragma: no cover
                        error('invalid conceptset ID %s' % concept.concepticon_id, cl.id)
                    elif cs.gloss != concept.concepticon_gloss:  # pragma: no cover
                        error(
                            'wrong conceptset GLOSS for ID {0}: {1} -> {2}'.format(
                                cs.id, concept.concepticon_gloss, cs.gloss),
                            cl.id)

                if i == 0:  # pragma: no cover
                    # The presence of the source language columns is checked once per list.
                    for lg in cl.source_language:
                        if lg.lower() not in concept.cols:
                            error('missing source language col %s' % lg.upper(), cl.id)

                for lg in cl.source_language:  # pragma: no cover
                    if not (concept.attributes.get(lg.lower()) or
                            getattr(concept, lg.lower(), None) or
                            (lg.lower() == 'english' and not concept.gloss)):
                        error('missing source language translation %s' % lg, cl.id, i + 2)

                for field, values in ref_cols.items():
                    val = getattr(concept, field)
                    if val:
                        # check that there are not leading and trailing spaces
                        # (while computationally expensive, this helps catch really
                        # hard to find typos)
                        if val != val.strip():  # pragma: no cover
                            error(
                                "leading or trailing spaces in value for %s: '%s'" %
                                (field, val), cl.id, i + 2)

                        if val not in values:  # pragma: no cover
                            error('invalid value for %s: %s' % (field, val), cl.id, i + 2)
        except TypeError as e:  # pragma: no cover
            broken_cls.append(cl.id)
            error(str(e), cl.id)

    # Group concept sets which are linked by a "sameas" relation.
    sameas = {}
    glosses = set()
    for cs in self.conceptsets.values():
        if cs.gloss in glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel == 'sameas':
                for group in sameas.values():
                    if target in group:  # pragma: no cover
                        group.add(cs.id)
                        break
                else:
                    # No existing group contains the target: open a new one.
                    sameas[cs.gloss] = {cs.id, target}

    # Within each group, all but the numerically smallest ID are considered deprecated.
    deprecated = {}
    for s in sameas.values():
        csids = sorted(s, key=int)
        for csid in csids[1:]:
            assert csid not in deprecated
            deprecated[csid] = csids[0]

    # NOTE(review): unlike the loops above, this one is not restricted to `clids`, so the
    # concepts of all (non-broken) lists are read here - confirm that this is intended.
    for cl in self.conceptlists.values():
        if cl.id in broken_cls:
            continue  # pragma: no cover
        for concept in cl.concepts.values():
            if concept.concepticon_id in deprecated:  # pragma: no cover
                error(
                    'deprecated concept set {0} linked for {1}'.format(
                        concept.concepticon_id, concept.id),
                    cl.id)

    return _report()
def test():
    """
    Run the consistency checks on the data in ``REPOS_PATH``.

    Problems are reported through the ``error`` and ``warning`` helpers defined elsewhere
    in this module. Nothing is checked if ``REPOS_PATH`` does not exist.

    :raises ValueError: If the module-level ``SUCCESS`` value is falsy after all checks.
    """
    if not REPOS_PATH.exists():
        return  # pragma: no cover

    api = Concepticon(REPOS_PATH)

    # We collect all cite keys used to refer to references.
    cited = set()
    for meta in api.metadata.values():
        schema_names = {column['name'] for column in meta.meta['tableSchema']['columns']}
        first_row_names = set(list(meta.values.values())[0])
        if first_row_names - schema_names:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(meta.id), 'name')
        for line_no, row in enumerate(meta.values.values(), start=2):
            if set(row.keys()) != schema_names:  # pragma: no cover
                error(
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(meta.id, line_no),
                    'name')
        cited.update(split(meta.meta.get('dc:references') or ''))

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for line_no, cl in enumerate(api.conceptlists.values(), start=2):
        for ref in cl.refs:
            if ref not in api.bibliography:  # pragma: no cover
                error('invalid bibtex record: {0}'.format(ref), 'conceptlists.tsv', line_no)
            cited.add(ref)
        # References cited in the free-text note count as used, too.
        cited.update(re.findall(BIB_PATTERN, cl.note))

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in api.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')

    for ref in api.bibliography:
        if ref not in cited:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for the columns referencing concept sets.
    ref_cols = {
        'concepticon_id': set(api.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in api.conceptsets.values()},
    }

    relation_columns = (
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    )
    for line_no, rel in enumerate(api.relations.raw, start=2):
        for column, kind in relation_columns:
            if rel[column] not in ref_cols[kind]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(column, rel[column]),
                    'conceptrelations',
                    line_no)

    for fname in api.data_path('conceptlists').glob('*.tsv'):
        if fname.stem not in api.conceptlists:  # pragma: no cover
            error('conceptlist missing in conceptlists.tsv: {0}'.format(fname.name), '')

    for cl in api.conceptlists.values():
        for index, concept in enumerate(cl.concepts.values()):
            line_no = index + 2
            if index == 0:  # pragma: no cover
                # The presence of the source language columns is checked once per list.
                for lg in cl.source_language:
                    if lg.lower() not in concept.cols:
                        error('missing source language col %s' % lg.upper(), cl.id)

            for lg in cl.source_language:  # pragma: no cover
                key = lg.lower()
                translated = concept.attributes.get(key) or getattr(concept, key, None)
                if not translated:
                    error('missing source language translation %s' % lg, cl.id, line_no)

            for field, allowed in ref_cols.items():
                value = getattr(concept, field)
                if not value:
                    continue
                if value not in allowed:  # pragma: no cover
                    error('invalid value for %s: %s' % (field, value), cl.id, line_no)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')