def conceptlists(self):
    """
    Instantiate a `Conceptlist` for every row in `self.conceptlists_dicts`.

    :returns: `dict` mapping ConceptList IDs to `Conceptlist` instances.

    .. note:: Individual concepts can be accessed via `Conceptlist.concepts`.
    """
    # Row keys are passed through `lowercase` before being used as keyword arguments.
    instances = (
        Conceptlist(api=self, **lowercase(row)) for row in self.conceptlists_dicts)
    return to_dict(instances)
def test_Conceptlist(fixturedir, api):
    """
    A concept list read from the fixture file has exactly one concept, and constructing a
    `Conceptlist` from the keyword arguments below is expected to raise `ValueError`.
    """
    fixture = fixturedir.joinpath('conceptlist.tsv')
    clist = Conceptlist.from_file(fixture, api=api)
    assert len(clist.concepts) == 1

    rejected_kwargs = dict(
        api=None,
        id='xy',
        author='x',
        year='1234',
        alias=None,
        items=4,
        list_suffix=None,
        note='',
        pages='',
        pdf='',
        refs='',
        source_language='y',
        tags='',
        target_language=None,
        url=None,
    )
    with pytest.raises(ValueError):
        Conceptlist(**rejected_kwargs)
def check(self, *clids):
    """
    Run consistency checks on the data and report every problem that was found.

    :param clids: IDs of the concept lists to check. If none are given, all concept lists
        are checked and unused BibTeX records are reported as well.
    :returns: `True` if no error was recorded, `False` otherwise. Errors are printed to
        stdout; missing PDFs are only signaled via `warnings.warn`.
    :raises AssertionError: if `self.retirements` is falsy, if a `map-*.tsv` mapping file
        is named after a code that is not among the known ISO codes, or if a concept set
        would be registered as deprecated twice while resolving `sameas` relations.
    """
    errors = []
    assert self.retirements
    print('testing {0} concept lists'.format(
        len(clids) if clids else len(self.conceptlists)))

    def _msg(type_, msg, name, line):  # pragma: no cover
        # Renders "TYPE:name[:line]: msg"; a falsy line number is left out.
        if line:
            line = ':%s' % line
        return '%s:%s%s: %s' % (type_.upper(), name, line or '', msg)

    def error(msg, name, line=0):  # pragma: no cover
        # Errors are only collected here; they are printed by `report`.
        errors.append((msg, name, line))

    def warning(msg, name, line=0):  # pragma: no cover
        warnings.warn(_msg('warning', msg, name, line), Warning)

    def report():
        # Print all collected errors and return the overall success status.
        for msg, name, line in errors:
            print(_msg('error', msg, name, line))
        return not bool(errors)

    # Line numbers start at 2, consistent with the `index + 2` convention used for
    # conceptlists.tsv further down (assumes a single header line precedes the data
    # rows -- TODO confirm against the reader behind `conceptlists_dicts`).
    for lineno, d in enumerate(self.conceptlists_dicts, start=2):
        if (not clids) or d['ID'] in clids:
            try:
                Conceptlist(api=self, **lowercase(d))
            except ValueError as e:  # pragma: no cover
                error(str(e), 'conceptlists.tsv', lineno)

    if errors:  # pragma: no cover
        return report()  # Exit early in case of structural errors.

    REF_WITHOUT_LABEL_PATTERN = re.compile(
        r'[^]]\(:(ref|bib):[A-Za-z0-9\-]+\)')
    REF_WITHOUT_LINK_PATTERN = re.compile('[^(]:(ref|bib):[A-Za-z0-9-]+')

    # Make sure all language-specific mappings are well specified
    iso_langs = [
        lang.iso2 for lang in self.vocabularies['COLUMN_TYPES'].values()
        if isinstance(lang, Languoid) and lang.iso2
    ]
    if len(iso_langs) != len(set(iso_langs)):
        error(
            'Duplicate ISO codes: {}'.format(
                collections.Counter(iso_langs).most_common(1)),
            'concepticon.json')
    # The second "-"-separated part of a mapping file's stem must be a known ISO code.
    mapped_langs = {
        p.stem.split('-')[1] for p in self.path('mappings').glob('map-*.tsv')}
    assert mapped_langs.issubset(iso_langs)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    refs_in_bib = set(self.bibliography)
    for meta in self.metadata.values():
        cnames_schema = {
            var['name'] for var in meta.meta['tableSchema']['columns']}
        # Column names of the TSV are taken from the first row of values.
        cnames_tsv = set(list(meta.values.values())[0])
        if cnames_tsv - cnames_schema:  # pragma: no cover
            error(
                'column names in {0} but not in json-specs'.format(meta.id),
                'name')
        for lineno, (key, value) in enumerate(meta.values.items(), start=2):
            if set(value.keys()) != cnames_schema:  # pragma: no cover
                error(
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(meta.id, lineno), 'name')
            if key not in self.conceptsets:  # pragma: no cover
                error(
                    'meta data {0} references invalid CONCEPTICON_ID {2} in line {1}'
                    .format(meta.id, lineno, key), 'name')
        for ref in split(meta.meta.get('dc:references') or ''):
            if ref not in refs_in_bib:  # pragma: no cover
                error('cited bibtex record not in bib: {0}'.format(ref), 'name')
            all_refs.add(ref)

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    for lineno, cl in enumerate(self.conceptlists.values(), start=2):
        if clids and cl.id not in clids:
            continue  # pragma: no cover
        fl = ('conceptlists.tsv', lineno)
        for ref in re.findall(BIB_PATTERN, cl.note) + cl.refs:
            if ref not in refs_in_bib:
                error('cited bibtex record not in bib: {0}'.format(ref), *fl)
            else:
                all_refs.add(ref)

        for m in REF_WITHOUT_LABEL_PATTERN.finditer(cl.note):
            error('link without label: {0}'.format(m.group(0)), *fl)

        for m in REF_WITHOUT_LINK_PATTERN.finditer(cl.note):  # pragma: no cover
            error('reference not in link: {0}'.format(m.group(0)), *fl)

        for m in REF_PATTERN.finditer(cl.note):
            if m.group('id') not in self.conceptlists:  # pragma: no cover
                error('invalid conceptlist ref: {0}'.format(m.group('id')), *fl)

        # make also sure that all sources are accompanied by a PDF, but only write a
        # warning if this is not the case
        for ref in cl.pdf:
            if ref not in self.sources:  # pragma: no cover
                warning('no PDF found for {0}'.format(ref), 'conceptlists.tsv')
    # This key is always counted as used, so it is never reported as unused below.
    all_refs.add('List2016a')

    if not clids:
        # Only report unused references if we check all concept lists!
        for ref in refs_in_bib - all_refs:  # pragma: no cover
            error('unused bibtex record: {0}'.format(ref), 'references.bib')

    # Valid values for columns which reference concept sets.
    ref_cols = {
        'concepticon_id': set(self.conceptsets.keys()),
        'concepticon_gloss': {cs.gloss for cs in self.conceptsets.values()},
    }

    relation_cols = [
        ('SOURCE', 'concepticon_id'),
        ('TARGET', 'concepticon_id'),
        ('SOURCE_GLOSS', 'concepticon_gloss'),
        ('TARGET_GLOSS', 'concepticon_gloss'),
    ]
    for lineno, rel in enumerate(self.relations.raw, start=2):
        for attr, type_ in relation_cols:
            if rel[attr] not in ref_cols[type_]:  # pragma: no cover
                error(
                    'invalid {0}: {1}'.format(attr, rel[attr]),
                    'conceptrelations', lineno)

    for fname in self.data_path('conceptlists').glob('*.tsv'):
        if clids and fname.stem not in clids:
            continue  # pragma: no cover
        if fname.stem not in self.conceptlists:  # pragma: no cover
            error(
                'conceptlist missing in conceptlists.tsv: {0}'.format(fname.name),
                '')

    # IDs of concept lists whose concepts raised a TypeError while being checked; these
    # are skipped in the final deprecation check.
    broken_cls = []

    for cl in self.conceptlists.values():
        if clids and cl.id not in clids:
            continue  # pragma: no cover
        #
        # Check consistency between the csvw metadata and the column names in the list.
        #
        missing_in_md, missing_in_list = [], []
        cols_in_md = []
        for col in cl.metadata.tableSchema.columns:
            cnames = []  # all names or aliases csvw will recognize for this column
            if col.name in cols_in_md:  # pragma: no cover
                error(
                    'Duplicate name or title in table schema: {0}'.format(col.name),
                    cl.id)
            cnames.append(col.name)
            if col.titles:
                c = col.titles.getfirst()
                if c in cols_in_md:  # pragma: no cover
                    error(
                        'Duplicate name or title in table schema: {0}'.format(c),
                        cl.id)
                cnames.append(c)
            cols_in_md.extend(cnames)
            if not any(name in cl.cols_in_list for name in cnames):
                # Neither name nor title of the column is in the actual list header.
                missing_in_list.append(col.name)
        for col in cl.cols_in_list:
            if col not in cols_in_md:
                missing_in_md.append(col)

        for col in missing_in_list:
            error('Column in metadata but missing in list: {0}'.format(col), cl.id)
        for col in missing_in_md:
            error('Column in list but missing in metadata: {0}'.format(col), cl.id)

        try:
            # Now check individual concepts:
            for i, concept in enumerate(cl.concepts.values()):
                if not concept.id.startswith(cl.id):  # pragma: no cover
                    error(
                        'concept ID does not match concept list ID pattern %s'
                        % concept.id, cl.id)

                if concept.concepticon_id:
                    cs = self.conceptsets.get(concept.concepticon_id)
                    if not cs:  # pragma: no cover
                        error(
                            'invalid conceptset ID %s' % concept.concepticon_id,
                            cl.id)
                    elif cs.gloss != concept.concepticon_gloss:  # pragma: no cover
                        error(
                            'wrong conceptset GLOSS for ID {0}: {1} -> {2}'
                            .format(cs.id, concept.concepticon_gloss, cs.gloss),
                            cl.id)

                # The presence of source language columns is only checked once per
                # list, on the first concept.
                if i == 0:  # pragma: no cover
                    for lg in cl.source_language:
                        if lg.lower() not in concept.cols:
                            error(
                                'missing source language col %s' % lg.upper(),
                                cl.id)

                for lg in cl.source_language:  # pragma: no cover
                    if not (concept.attributes.get(lg.lower())
                            or getattr(concept, lg.lower(), None)
                            or (lg.lower() == 'english' and not concept.gloss)):
                        error(
                            'missing source language translation %s' % lg,
                            cl.id, i + 2)
                for attr, values in ref_cols.items():
                    val = getattr(concept, attr)
                    if val:
                        # check that there are not leading and trailing spaces
                        # (while computationally expensive, this helps catch really
                        # hard to find typos)
                        if val != val.strip():  # pragma: no cover
                            error(
                                "leading or trailing spaces in value for %s: '%s'"
                                % (attr, val), cl.id, i + 2)

                        if val not in values:  # pragma: no cover
                            error(
                                'invalid value for %s: %s' % (attr, val),
                                cl.id, i + 2)
        except TypeError as e:  # pragma: no cover
            broken_cls.append(cl.id)
            error(str(e), cl.id)

    # Group concept sets linked via "sameas" relations; groups are keyed by the gloss of
    # the concept set which opened the group.
    sameas = {}
    glosses = set()
    for cs in self.conceptsets.values():
        if cs.gloss in glosses:  # pragma: no cover
            error('duplicate conceptset gloss: {0}'.format(cs.gloss), cs.id)
        glosses.add(cs.gloss)
        for target, rel in cs.relations.items():
            if rel == 'sameas':
                for group in sameas.values():
                    if target in group:  # pragma: no cover
                        group.add(cs.id)
                        break
                else:
                    # No existing group contains the target: open a new one.
                    sameas[cs.gloss] = {cs.id, target}

    # Within each group the numerically smallest ID is kept; all others map to it.
    deprecated = {}
    for s in sameas.values():
        csids = sorted(s, key=int)
        for csid in csids[1:]:
            assert csid not in deprecated
            deprecated[csid] = csids[0]

    for cl in self.conceptlists.values():
        if cl.id in broken_cls:
            continue  # pragma: no cover
        for concept in cl.concepts.values():
            if concept.concepticon_id in deprecated:  # pragma: no cover
                error(
                    'deprecated concept set {0} linked for {1}'.format(
                        concept.concepticon_id, concept.id),
                    cl.id)

    return report()