Exemple #1
0
def metadata(write_stats=True):
    """Write coverage statistics for the concept set metadata to a README.

    For every ``*.tsv`` file found in the ``concept_set_meta`` package
    directory one bullet line is produced, giving the number of records read
    from that file and the share of the records in ``concepticon.tsv`` this
    corresponds to, expressed as a percentage.

    :param write_stats: if true, the report is written (UTF-8) to
        ``concept_set_meta/README.md``; otherwise nothing is written.
    """
    total = len(list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t")))
    lines = ['# Basic Statistics on Metadata\n\n']
    for cl in PKG_PATH.joinpath('concept_set_meta').glob('*.tsv'):
        covered = len(list(reader(cl, namedtuples=True, delimiter="\t")))
        # The label says "%", so scale the ratio by 100. The float literal
        # keeps the division exact on Python 2 as well, and an empty
        # concepticon table yields 0 % instead of a ZeroDivisionError.
        percent = 100.0 * covered / total if total else 0.0
        # cl.name[:-4] drops the '.tsv' extension matched by the glob.
        lines.append('* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], covered, percent))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(''.join(lines))
Exemple #2
0
    def __init__(self, clid):
        """Create a linker for the concept list identified by ``clid``.

        Reads ``concepticon.tsv`` once and fills ``self.concepts`` with two
        lookup tables: ID -> GLOSS under ``'CONCEPTICON_ID'`` and GLOSS -> ID
        under ``'CONCEPTICON_GLOSS'``. Later rows overwrite earlier ones that
        share a key.
        """
        self.clid = clid

        gloss_by_id = {}
        id_by_gloss = {}
        self.concepts = {
            'CONCEPTICON_ID': gloss_by_id,
            'CONCEPTICON_GLOSS': id_by_gloss,
        }
        rows = reader(data_path('concepticon.tsv'), dicts=True, delimiter='\t')
        for row in rows:
            gloss, cid = row['GLOSS'], row['ID']
            gloss_by_id[cid] = gloss
            id_by_gloss[gloss] = cid

        # Bookkeeping attributes start out unset.
        # NOTE(review): presumably filled in once a header row is seen - confirm.
        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Exemple #3
0
def link():
    """Command line entry point: complete the concept set links of a list.

    Parses the process arguments for one positional ``conceptlist`` argument.
    If that is not an existing path, it is looked up via
    ``data_path('conceptlists', <argument>)``. The resolved file is passed to
    ``rewrite`` together with a ``Linker`` created for the file's base name
    without its ``.tsv`` extension, using tab as delimiter.

    Exits through ``parser.error`` (usage message, exit status 2) when the
    concept list cannot be found in either location.
    """
    parser = argparse.ArgumentParser(description="""\
Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
CONCEPTICON_ID is given, the other is added.""")
    parser.add_argument('conceptlist', help='path to conceptlist to complete')
    args = parser.parse_args()

    path = args.conceptlist
    if not os.path.exists(path):
        path = data_path('conceptlists', args.conceptlist)
        # Explicit check instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let a bad path reach rewrite().
        if not os.path.exists(path):
            parser.error('conceptlist not found: {0}'.format(args.conceptlist))

    # Strip only a trailing '.tsv'; str.replace would also remove the
    # substring from the middle of a file name.
    clid = os.path.basename(path)
    if clid.endswith('.tsv'):
        clid = clid[:-len('.tsv')]

    rewrite(path, Linker(clid), delimiter='\t')
Exemple #4
0
def metadata(write_stats=True):
    """Write coverage statistics for the concept set metadata to a README.

    Each ``*.tsv`` file in the ``concept_set_meta`` package directory
    contributes one bullet line with the number of records read from it and
    that number as a percentage of the records in ``concepticon.tsv``.

    :param write_stats: if true, the report is written (UTF-8) to
        ``concept_set_meta/README.md``; otherwise nothing is written.
    """
    n_concept_sets = len(list(
        reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t")))
    report = ['# Basic Statistics on Metadata\n\n']
    for cl in PKG_PATH.joinpath('concept_set_meta').glob('*.tsv'):
        n_rows = len(list(reader(cl, namedtuples=True, delimiter="\t")))
        if n_concept_sets:
            # The output is labelled "%", so report an actual percentage;
            # 100.0 also prevents integer division on Python 2.
            coverage = 100.0 * n_rows / n_concept_sets
        else:
            # Empty concepticon table: avoid ZeroDivisionError.
            coverage = 0.0
        # cl.name[:-4] drops the '.tsv' extension matched by the glob.
        report.append('* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], n_rows, coverage))
    if write_stats:
        readme = PKG_PATH.joinpath('concept_set_meta', 'README.md')
        with readme.open('w', encoding='utf8') as fp:
            fp.write(''.join(report))
Exemple #5
0
def link():
    """Command line entry point: complete the concept set links of a list.

    Takes one positional command line argument, ``conceptlist``. When it does
    not name an existing path, ``data_path('conceptlists', <argument>)`` is
    tried instead. The resolved file is handed to ``rewrite`` with a
    ``Linker`` built from the file's base name minus its ``.tsv`` extension
    and a tab delimiter.

    Exits through ``parser.error`` (usage message, exit status 2) when the
    concept list is found in neither location.
    """
    parser = argparse.ArgumentParser(
        description="""\
Complete linking of concepts to concept sets. If either CONCEPTICON_GLOSS or
CONCEPTICON_ID is given, the other is added.""")
    parser.add_argument('conceptlist', help='path to conceptlist to complete')
    args = parser.parse_args()

    conceptlist = args.conceptlist
    if not os.path.exists(conceptlist):
        conceptlist = data_path('conceptlists', args.conceptlist)
        # ``assert`` disappears under ``python -O``; validate user input
        # with a real check and a readable message instead.
        if not os.path.exists(conceptlist):
            parser.error(
                'conceptlist not found: {0}'.format(args.conceptlist))

    # Only remove '.tsv' as a suffix, never from inside the file name.
    clid = os.path.basename(conceptlist)
    if clid.endswith('.tsv'):
        clid = clid[:-len('.tsv')]

    rewrite(
        conceptlist,
        Linker(clid),
        delimiter='\t')
Exemple #6
0
    def __init__(self, clid):
        """Initialise the linker for concept list ``clid``.

        Loads ``concepticon.tsv`` and stores two mappings in
        ``self.concepts``: ``'CONCEPTICON_ID'`` maps each ID to its GLOSS and
        ``'CONCEPTICON_GLOSS'`` maps each GLOSS to its ID.
        """
        self.clid = clid

        id_to_gloss, gloss_to_id = {}, {}
        self.concepts = {
            'CONCEPTICON_ID': id_to_gloss,  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': gloss_to_id,  # maps GLOSS to ID
        }
        concept_sets = reader(
            data_path('concepticon.tsv'), dicts=True, delimiter='\t')
        for concept_set in concept_sets:
            gloss = concept_set['GLOSS']
            cid = concept_set['ID']
            id_to_gloss[cid] = gloss
            gloss_to_id[gloss] = cid

        # Nothing is known about the column layout yet.
        self._cid_index = self._cgloss_index = self._number_index = None
        self._link_col = (None, None)
Exemple #7
0
def test():
    """Run the integrity checks over the data files.

    The checks run in this order, each problem being reported through
    ``error`` (or ``warning`` for missing PDFs):

    1. ``SEMANTICFIELD`` and ``ONTOLOGICAL_CATEGORY`` values in
       ``concepticon.tsv`` must be listed in the matching ``data`` attribute.
    2. Each concept set metadata table must share at least one column name
       with its ``.tsv-metadata.json`` spec, and every row must have as many
       columns as the spec declares.
    3. ``REFS`` in ``conceptlists.tsv`` must be ids found in
       ``references/references.bib``; ``TAGS`` must be in ``data.CL_TYPES``.
    4. ``PDF`` entries without a matching source only produce a warning.
    5. Every concept list file must have a record in ``conceptlists.tsv``,
       usable column names, its source language columns, valid ``NUMBER``
       values and valid ``CONCEPTICON_ID`` / ``CONCEPTICON_GLOSS`` values.

    :raises ValueError: if the module-level ``SUCCESS`` flag is false once
        all checks have run.
    """
    conceptlists = {
        cl.name: read_tsv(cl, unique=None)
        for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv')
        if not cl.stem.startswith('.')
    }

    # The result of the first read is discarded on purpose.
    # NOTE(review): presumably run for read_tsv's default uniqueness
    # check - confirm against read_tsv.
    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:
                error('invalid %s: %s' % (attr, value),
                      data_path('concepticon.tsv'), i)

    for source in read_metadata(data_path('concept_set_meta')):
        spec_path = data_path('concept_set_meta',
                              source + '.tsv-metadata.json')
        # Context manager so the file handle is closed deterministically;
        # JSON is read as UTF-8, independent of the platform default.
        with io.open(spec_path, encoding='utf8') as fp:
            specs = json.load(fp)
        tsv = read_tsv(data_path('concept_set_meta', source + '.tsv'),
                       unique='CONCEPTICON_ID')
        cnames = [var['name'] for var in specs['tableSchema']['columns']]
        # Column names of the first row, computed once. An empty table has
        # no columns, which is reported below instead of raising IndexError.
        header = set(tsv[0][1]) if tsv else set()
        if not any(n in header for n in cnames):
            error('column names in {0} do not json-specs'.format(source),
                  'name')
        for i, line in tsv:
            if len(line) != len(cnames):
                error(
                    'meta data {0} contains irregular number of columns in line {1}'
                    .format(source, i), 'name')

    refs = set()
    with io.open(data_path('references', 'references.bib'),
                 encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                refs.add(match.group('id'))

    #
    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    # Unknown refs already reported, so each one is reported only once.
    visited = set()
    tags = data.CL_TYPES
    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in refs and ref not in visited:
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited.add(ref)
        for tag in split_ids(cl['TAGS']):
            if tag not in tags:
                error('invalid cl type: %s' % tag, clmd, i)

    #
    # make also sure that all sources are accompanied as pdf, but only write a
    # warning if this is not the case
    #
    pdfs = read_sources(data_path('sources'))
    no_pdf_for_source = []
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:
                no_pdf_for_source.append(ref)

    if no_pdf_for_source:
        warning('\n'.join(no_pdf_for_source),
                'no pdf found for {0} sources'.format(len(no_pdf_for_source)))

    # Valid values for the two columns that link a concept to a concept set.
    ref_cols = {
        'CONCEPTICON_ID': {cs[1]['ID'] for cs in concepticon},
        'CONCEPTICON_GLOSS': {cs[1]['GLOSS'] for cs in concepticon},
    }

    for name, concepts in conceptlists.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:
            error('unkown record {0} referenced'.format(name), '', '')
            # Continue with an empty record so the remaining checks still run.
            cl = {}

        # Link columns already reported as missing for this concept list.
        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:
                # Header-level checks, done once on the first row.
                cols = list(concept.keys())
                try:
                    # namedtuple raises ValueError for unusable field names.
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', [])):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(),
                              name, '')
            if not NUMBER_PATTERN.match(concept['NUMBER']):
                error('invalid concept NUMBER %(NUMBER)s' % concept, name,
                      line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:
                    error('invalid value for %s: %s' % (col, concept[col]),
                          name, line)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')
def test():
    """Run the integrity checks over the data files.

    Problems are reported through ``error`` (missing PDFs only through
    ``warning``). The checks cover the controlled vocabularies used in
    ``concepticon.tsv``, the references and tags in ``conceptlists.tsv``,
    and the columns and values of every concept list file.

    :raises ValueError: if the module-level ``SUCCESS`` flag is false once
        all checks have run.
    """
    conceptlists = {
        path.name: read_tsv(path, unique=None)
        for path in PKG_PATH.joinpath('conceptlists').glob('*.tsv')
        if not path.stem.startswith('.')}

    # First read: result unused. Second read: keyed on GLOSS.
    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

    # Controlled vocabularies of the concept sets.
    for lineno, concept_set in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            allowed = getattr(data, attr)
            found = concept_set[attr]
            if found and found not in allowed:
                error('invalid %s: %s' % (attr, found), data_path('concepticon.tsv'), lineno)

    # Collect the ids of all records in the BibTeX file.
    with io.open(data_path('references', 'references.bib'), encoding='utf8') as fp:
        matches = (BIB_ID_PATTERN.match(raw.strip()) for raw in fp)
        refs = {m.group('id') for m in matches if m}

    #
    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    reported = set()  # unknown refs are reported once only
    tags = data.CL_TYPES
    for lineno, record in read_tsv(clmd):
        clids[record['ID']] = record
        for ref in split_ids(record['REFS']):
            if not (ref in refs or ref in reported):
                error('unknown bibtex record "%s" referenced' % ref, clmd, lineno)
                reported.add(ref)
        for tag in split_ids(record['TAGS']):
            if tag not in tags:
                error('invalid cl type: %s' % tag, clmd, lineno)

    #
    # make also sure that all sources are accompanied as pdf, but only write a
    # warning if this is not the case
    #
    pdfs = read_sources(data_path('sources'))
    no_pdf_for_source = [
        ref
        for _, record in read_tsv(clmd)
        for ref in split_ids(record['PDF'])
        if ref not in pdfs]

    if no_pdf_for_source:
        warning(
            '\n'.join(no_pdf_for_source),
            'no pdf found for {0} sources'.format(len(no_pdf_for_source)))

    # Valid values for the two columns linking concepts to concept sets.
    ref_cols = {
        'CONCEPTICON_ID': {row['ID'] for _, row in concepticon},
        'CONCEPTICON_GLOSS': {row['GLOSS'] for _, row in concepticon},
    }

    for name, concepts in conceptlists.items():
        clid = name.replace('.tsv', '')
        if clid in clids:
            cl = clids[clid]
        else:
            error('unkown record {0} referenced'.format(name), '', '')
            cl = {}

        missing = []
        for pos, (line, concept) in enumerate(concepts):
            if pos == 0:
                # Header-level checks, done once on the first row.
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(col) for col in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', [])):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(), name, '')

            if not NUMBER_PATTERN.match(concept['NUMBER']):
                error('invalid concept NUMBER %(NUMBER)s' % concept, name, line)

            for col, values in ref_cols.items():
                if col in concept:
                    if concept[col] and concept[col] not in values:
                        error('invalid value for %s: %s' % (col, concept[col]), name, line)
                elif col not in missing:
                    # Report a missing link column once per concept list.
                    error('missing column %s' % col, name)
                    missing.append(col)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')