def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for cl in PKG_PATH.joinpath('concept_set_meta').glob('*.tsv'):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        # Report coverage as a percentage of all concept sets.
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], len(data), 100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
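# Illustrative sketch of the coverage line built above, with made-up counts
# (the real numbers come from concepticon.tsv and the concept_set_meta/*.tsv files):
#
#     covered, total = 1000, 3000  # hypothetical
#     '* {0} covers {1} concept sets ({2:.2f} %)'.format('wikidata', covered, 100 * covered / total)
#     # -> '* wikidata covers 1000 concept sets (33.33 %)'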
def __init__(self, clid):
    self.clid = clid
    # Bidirectional lookup between concept set IDs and glosses.
    self.concepts = {
        'CONCEPTICON_ID': {},  # maps ID to GLOSS
        'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
    }
    for cs in reader(data_path('concepticon.tsv'), dicts=True, delimiter='\t'):
        self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
        self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']

    self._cid_index = None
    self._cgloss_index = None
    self._link_col = (None, None)
    self._number_index = None
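# A minimal sketch (not the actual Linker.__call__, which is defined elsewhere):
# it only illustrates how the two lookup dicts built above let a row that carries
# one of CONCEPTICON_ID / CONCEPTICON_GLOSS be completed with the other.
def _complete_row(row, concepts):  # hypothetical helper name
    if row.get('CONCEPTICON_ID') and not row.get('CONCEPTICON_GLOSS'):
        row['CONCEPTICON_GLOSS'] = concepts['CONCEPTICON_ID'].get(row['CONCEPTICON_ID'], '')
    elif row.get('CONCEPTICON_GLOSS') and not row.get('CONCEPTICON_ID'):
        row['CONCEPTICON_ID'] = concepts['CONCEPTICON_GLOSS'].get(row['CONCEPTICON_GLOSS'], '')
    return row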
def link():
    parser = argparse.ArgumentParser(description="""\
Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
CONCEPTICON_ID is given, the other is added.""")
    parser.add_argument('conceptlist', help='path to conceptlist to complete')
    args = parser.parse_args()
    if not os.path.exists(args.conceptlist):
        args.conceptlist = data_path('conceptlists', args.conceptlist)
        assert os.path.exists(args.conceptlist)
    rewrite(
        args.conceptlist,
        Linker(os.path.basename(args.conceptlist).replace('.tsv', '')),
        delimiter='\t')
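# Usage sketch (the actual entry point is not part of this section; script and
# list names below are placeholders): pass either a full path or a bare file
# name, which is then resolved against data_path('conceptlists'), e.g.
#
#     python link_script.py Swadesh-1955-100.tsv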
def test():
    conceptlists = {
        cl.name: read_tsv(cl, unique=None)
        for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv')
        if not cl.stem.startswith('.')}

    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:
                error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i)

    # Check the concept set metadata against the column specs in the
    # accompanying JSON metadata files.
    for source in read_metadata(data_path('concept_set_meta')):
        specs = json.loads(
            open(data_path('concept_set_meta', source + '.tsv-metadata.json')).read())
        tsv = read_tsv(
            data_path('concept_set_meta', source + '.tsv'), unique='CONCEPTICON_ID')
        cnames = [var['name'] for var in specs['tableSchema']['columns']]
        if not [n for n in cnames if n in list(tsv[0][1])]:
            error('column names in {0} do not match json-specs'.format(source), 'name')
        for i, line in tsv:
            if len(line) != len(cnames):
                error(
                    'metadata {0} contains irregular number of columns in line {1}'.format(
                        source, i),
                    'name')

    # Collect the IDs of all records in references.bib.
    refs = set()
    with io.open(data_path('references', 'references.bib'), encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                refs.add(match.group('id'))

    #
    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    #
    clmd = data_path('conceptlists.tsv')
    clids = {}
    visited = []
    tags = getattr(data, 'CL_TYPES')
    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in refs and ref not in visited:
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited += [ref]
        for tag in split_ids(cl['TAGS']):
            if tag not in tags:
                error('invalid cl type: %s' % tag, clmd, i)

    #
    # Also make sure that every source is accompanied by a PDF, but only emit a
    # warning if one is missing.
    #
    pdfs = read_sources(data_path('sources'))
    no_pdf_for_source = []
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:
                no_pdf_for_source += [ref]

    if no_pdf_for_source:
        warning(
            '\n'.join(no_pdf_for_source),
            'no pdf found for {0} sources'.format(len(no_pdf_for_source)))

    ref_cols = {
        'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon),
        'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon),
    }

    for name, concepts in conceptlists.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:
            error('unknown record {0} referenced'.format(name), '', '')
            cl = {}
        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', [])):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(), name, '')
            if not NUMBER_PATTERN.match(concept['NUMBER']):
                error('invalid concept NUMBER %(NUMBER)s' % concept, name, line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:
                    error('invalid value for %s: %s' % (col, concept[col]), name, line)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')
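# Hedged assumption (BIB_ID_PATTERN is defined outside this section): the checks
# in test() only require a regex that captures the record key of a BibTeX entry
# head into a named group 'id'. A sketch of such a pattern:
#
#     import re
#     BIB_ID_PATTERN_SKETCH = re.compile(r'@[A-Za-z]+\{(?P<id>[^,]+),')  # hypothetical
#     BIB_ID_PATTERN_SKETCH.match('@book{Swadesh1955,').group('id')  # -> 'Swadesh1955'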