def test_normalize_name(): from clldutils.misc import normalize_name assert normalize_name('class') == 'class_' assert normalize_name('a-name') == 'a_name' assert normalize_name('a näme') == 'a_name' assert normalize_name('Name') == 'Name' assert normalize_name('') == '_' assert normalize_name('1') == '_1'
def test(): if not data_path().exists(): return # pragma: no cover # load bibtex bib = Database.from_file(data_path('references', 'references.bib')) assert bib cls = { cl.name: read_tsv(cl, unique=None) for cl in conceptlists() if not cl.stem.startswith('.')} read_tsv(data_path('concepticon.tsv')) concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS') for i, cs in concepticon: for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']: valid = getattr(data, attr) value = cs[attr] if value and value not in valid: # pragma: no cover error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i) # We collect all cite keys used to refer to references. all_refs = set() for source in concept_set_meta(): specs = load(source.parent.joinpath(source.stem + '.tsv-metadata.json')) tsv = read_tsv(source, unique='CONCEPTICON_ID') cnames = [var['name'] for var in specs['tableSchema']['columns']] if not [n for n in cnames if n in list(tsv[0][1])]: # pragma: no cover error('column names in {0} but not in json-specs'.format(source.stem), 'name') for i, line in tsv: if len(line) != len(cnames): # pragma: no cover error('meta data {0} contains irregular number of columns in line {1}' .format(source.stem, i), 'name') if 'dc:references' in specs: all_refs.add(specs['dc:references']) # Make sure only records in the BibTeX file references.bib are referenced by # concept lists. clmd = data_path('conceptlists.tsv') clids = {} visited1, visited2 = set(), set() tags = getattr(data, 'CL_TYPES') for i, cl in read_tsv(clmd): clids[cl['ID']] = cl for ref in split_ids(cl['REFS']): if ref not in bib.keymap and ref not in visited1: # pragma: no cover error('unknown bibtex record "%s" referenced' % ref, clmd, i) visited1.add(ref) else: # pragma: no cover # we fail when author/editor, or year, or title/booktitle are missing if 'Title' not in bib[ref] \ and 'Booktitle' not in bib[ref] \ and ref not in visited2: error('missing bibtex title in record "%s"' % ref, clmd, i) visited2.add(ref) if 'Author' not in bib[ref] and 'Editor' not in bib[ref]: error('missing bibtex author/editor in record "%s"' % ref, clmd, i) visited2.add(ref) if 'Year' not in bib[ref]: error('missing bibtex year in record "%s"' % ref, clmd, i) visited2.add(ref) all_refs.add(ref) for tag in split_ids(cl['TAGS']): if tag not in tags: # pragma: no cover error('invalid cl type: %s' % tag, clmd, i) for i, ref in enumerate(bib.keymap): if ref not in all_refs: # pragma: no cover error('bibtex record %s is in the references but not referenced in the data.' % ref, clmd, i) # # make also sure that all sources are accompanied as pdf, but only write a # warning if this is not the case # pdfs = [f.stem for f in data_path('sources').glob('*.pdf')] no_pdf_for_source = set() for i, cl in read_tsv(clmd): for ref in split_ids(cl['PDF']): if ref not in pdfs: # pragma: no cover no_pdf_for_source.add(ref) if no_pdf_for_source: # pragma: no cover warning( '\n'.join(no_pdf_for_source), 'no pdf found for {0} sources'.format(len(no_pdf_for_source))) ref_cols = { 'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon), 'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon), } for name, concepts in cls.items(): try: cl = clids[name.replace('.tsv', '')] except KeyError: # pragma: no cover error('unkown record {0} referenced'.format(name), '', '') cl = {} missing = [] for i, (line, concept) in enumerate(concepts): if i == 0: # pragma: no cover cols = list(concept.keys()) try: namedtuple('nt', [normalize_name(n) for n in cols]) except ValueError as e: error('%s' % e, name, line) for lg in split(cl.get('SOURCE_LANGUAGE', '')): if lg.upper() not in cols: error('missing source language col %s' % lg.upper(), name, '') for lg in split(cl.get('SOURCE_LANGUAGE', '')): if not concept.get(lg.upper()): # pragma: no cover error('missing source language translation %s' % lg.upper(), name, line) if not NUMBER_PATTERN.match(concept['NUMBER']): # pragma: no cover error('invalid concept NUMBER %(NUMBER)s' % concept, name, line) for col, values in ref_cols.items(): if col not in concept: if col not in missing: # pragma: no cover error('missing column %s' % col, name) missing.append(col) elif concept[col] and concept[col] not in values: # pragma: no cover error('invalid value for %s: %s' % (col, concept[col]), name, line) if not SUCCESS: # pragma: no cover raise ValueError('integrity checks failed!')
def test(): conceptlists = { cl.name: read_tsv(cl, unique=None) for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv') if not cl.stem.startswith('.') } read_tsv(data_path('concepticon.tsv')) concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS') for i, cs in concepticon: for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']: valid = getattr(data, attr) value = cs[attr] if value and value not in valid: error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i) for source in read_metadata(data_path('concept_set_meta')): specs = json.loads( open(data_path('concept_set_meta', source + '.tsv-metadata.json')).read()) tsv = read_tsv(data_path('concept_set_meta', source + '.tsv'), unique='CONCEPTICON_ID') cnames = [var['name'] for var in specs['tableSchema']['columns']] if not [n for n in cnames if n in list(tsv[0][1])]: error('column names in {0} do not json-specs'.format(source), 'name') for i, line in tsv: if len(line) != len(cnames): error( 'meta data {0} contains irregular number of columns in line {1}' .format(source, i), 'name') refs = set() with io.open(data_path('references', 'references.bib'), encoding='utf8') as fp: for line in fp: match = BIB_ID_PATTERN.match(line.strip()) if match: refs.add(match.group('id')) # # Make sure only records in the BibTeX file references.bib are referenced by # concept lists. clmd = data_path('conceptlists.tsv') clids = {} visited = [] tags = getattr(data, 'CL_TYPES') for i, cl in read_tsv(clmd): clids[cl['ID']] = cl for ref in split_ids(cl['REFS']): if ref not in refs and ref not in visited: error('unknown bibtex record "%s" referenced' % ref, clmd, i) visited += [ref] for tag in split_ids(cl['TAGS']): if tag not in tags: error('invalid cl type: %s' % tag, clmd, i) # # make also sure that all sources are accompanied as pdf, but only write a # warning if this is not the case # pdfs = read_sources(data_path('sources')) no_pdf_for_source = [] for i, cl in read_tsv(clmd): for ref in split_ids(cl['PDF']): if ref not in pdfs: no_pdf_for_source += [ref] if no_pdf_for_source: warning('\n'.join(no_pdf_for_source), 'no pdf found for {0} sources'.format(len(no_pdf_for_source))) ref_cols = { 'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon), 'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon), } for name, concepts in conceptlists.items(): try: cl = clids[name.replace('.tsv', '')] except KeyError: error('unkown record {0} referenced'.format(name), '', '') cl = {} missing = [] for i, (line, concept) in enumerate(concepts): if i == 0: cols = list(concept.keys()) try: namedtuple('nt', [normalize_name(n) for n in cols]) except ValueError as e: error('%s' % e, name, line) for lg in split(cl.get('SOURCE_LANGUAGE', [])): if lg.upper() not in cols: error('missing source language col %s' % lg.upper(), name, '') if not NUMBER_PATTERN.match(concept['NUMBER']): error('invalid concept NUMBER %(NUMBER)s' % concept, name, line) for col, values in ref_cols.items(): if col not in concept: if col not in missing: error('missing column %s' % col, name) missing.append(col) elif concept[col] and concept[col] not in values: error('invalid value for %s: %s' % (col, concept[col]), name, line) if not SUCCESS: raise ValueError('integrity checks failed!')
def item(self, row): d = UnicodeDictReader.item(self, row) for name in self.fieldnames: d.setdefault(name, None) return self.cls( **{normalize_name(k): v for k, v in d.items() if k in self.fieldnames})
def test(): conceptlists = { cl.name: read_tsv(cl, unique=None) for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv') if not cl.stem.startswith('.')} read_tsv(data_path('concepticon.tsv')) concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS') for i, cs in concepticon: for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']: valid = getattr(data, attr) value = cs[attr] if value and value not in valid: error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i) refs = set() with io.open(data_path('references', 'references.bib'), encoding='utf8') as fp: for line in fp: match = BIB_ID_PATTERN.match(line.strip()) if match: refs.add(match.group('id')) # # Make sure only records in the BibTeX file references.bib are referenced by # concept lists. clmd = data_path('conceptlists.tsv') clids = {} visited = [] tags = getattr(data, 'CL_TYPES') for i, cl in read_tsv(clmd): clids[cl['ID']] = cl for ref in split_ids(cl['REFS']): if ref not in refs and ref not in visited: error('unknown bibtex record "%s" referenced' % ref, clmd, i) visited += [ref] for tag in split_ids(cl['TAGS']): if tag not in tags: error('invalid cl type: %s' % tag, clmd, i) # # make also sure that all sources are accompanied as pdf, but only write a # warning if this is not the case # pdfs = read_sources(data_path('sources')) no_pdf_for_source = [] for i, cl in read_tsv(clmd): for ref in split_ids(cl['PDF']): if ref not in pdfs: no_pdf_for_source += [ref] if no_pdf_for_source: warning( '\n'.join(no_pdf_for_source), 'no pdf found for {0} sources'.format(len(no_pdf_for_source))) ref_cols = { 'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon), 'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon), } for name, concepts in conceptlists.items(): try: cl = clids[name.replace('.tsv', '')] except KeyError: error('unkown record {0} referenced'.format(name), '', '') cl = {} missing = [] for i, (line, concept) in enumerate(concepts): if i == 0: cols = list(concept.keys()) try: namedtuple('nt', [normalize_name(n) for n in cols]) except ValueError as e: error('%s' % e, name, line) for lg in split(cl.get('SOURCE_LANGUAGE', [])): if lg.upper() not in cols: error('missing source language col %s' % lg.upper(), name, '') if not NUMBER_PATTERN.match(concept['NUMBER']): error('invalid concept NUMBER %(NUMBER)s' % concept, name, line) for col, values in ref_cols.items(): if col not in concept: if col not in missing: error('missing column %s' % col, name) missing.append(col) elif concept[col] and concept[col] not in values: error('invalid value for %s: %s' % (col, concept[col]), name, line) if not SUCCESS: raise ValueError('integrity checks failed!')