Example #1
0
def stats():
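    """Write per-list mapping statistics to conceptlists/README.md."""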
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]
    
    for cl in sorted(
            PKG_PATH.joinpath('conceptlists').glob('*.tsv'), key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio, Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers, Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists', 'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example #2
0
def stats():
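    """Write per-list mapping statistics to conceptlists/README.md."""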
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]

    for cl in sorted(PKG_PATH.joinpath('conceptlists').glob('*.tsv'),
                     key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio,
                  Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers,
                  Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists',
                           'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example #3
0
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i,cl in enumerate(PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(cl.name[:-4], len(data), 100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example #4
0
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(
        reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i, cl in enumerate(
            PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], len(data),
            100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta',
                               'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example #5
0
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i,cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [h for h in header if h not in ['ID', 'CONCEPTICON_ID', 
            'CONCEPTICON_GLOSS', 'ENGLISH', 'GLOSS', 'NUMBER']]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k,v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example #6
0
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i, cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [
            h for h in header if h not in [
                'ID', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'ENGLISH',
                'GLOSS', 'NUMBER'
            ]
        ]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k, v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example #7
0
def test():
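    """Run integrity checks on the data; raise ValueError on failure."""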
    conceptlists = {
        cl.name: read_tsv(cl, unique=None)
        for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv')
        if not cl.stem.startswith('.')
    }

    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

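    # Check controlled-vocabulary columns of concepticon.tsv for valid values.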
    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:
                error('invalid %s: %s' % (attr, value),
                      data_path('concepticon.tsv'), i)

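    # Validate each concept_set_meta table against its JSON metadata specs.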
    for source in read_metadata(data_path('concept_set_meta')):
        specs = json.loads(
            open(data_path('concept_set_meta',
                           source + '.tsv-metadata.json')).read())
        tsv = read_tsv(data_path('concept_set_meta', source + '.tsv'),
                       unique='CONCEPTICON_ID')
        cnames = [var['name'] for var in specs['tableSchema']['columns']]
        if not [n for n in cnames if n in list(tsv[0][1])]:
            error('column names in {0} do not match json-specs'.format(source),
                  'name')
        for i, line in tsv:
            if len(line) != len(cnames):
                error(
                    'metadata {0} contains an irregular number of columns in line {1}'
                    .format(source, i), 'name')

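    # Collect all known BibTeX record IDs from references.bib.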
    refs = set()
    with io.open(data_path('references', 'references.bib'),
                 encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                refs.add(match.group('id'))

    #
    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    visited = []
    tags = getattr(data, 'CL_TYPES')
    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in refs and ref not in visited:
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited += [ref]
        for tag in split_ids(cl['TAGS']):
            if tag not in tags:
                error('invalid cl type: %s' % tag, clmd, i)

    #
    # Also make sure that all sources are accompanied by a PDF, but only
    # write a warning if this is not the case.
    #
    pdfs = read_sources(data_path('sources'))
    no_pdf_for_source = []
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:
                no_pdf_for_source += [ref]

    if no_pdf_for_source:
        warning('\n'.join(no_pdf_for_source),
                'no pdf found for {0} sources'.format(len(no_pdf_for_source)))

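    # Valid Concepticon IDs and glosses that concept lists may reference.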
    ref_cols = {
        'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon),
        'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon),
    }

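    # Check each concept list for valid columns, numbers and Concepticon refs.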
    for name, concepts in conceptlists.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:
            error('unknown record {0} referenced'.format(name), '', '')
            cl = {}

        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', [])):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(),
                              name, '')
            if not NUMBER_PATTERN.match(concept['NUMBER']):
                error('invalid concept NUMBER %(NUMBER)s' % concept, name,
                      line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:
                    error('invalid value for %s: %s' % (col, concept[col]),
                          name, line)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')
Example #8
0
def test():
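    """Run integrity checks on the data; raise ValueError on failure."""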
    conceptlists = {
        cl.name: read_tsv(cl, unique=None)
        for cl in PKG_PATH.joinpath('conceptlists').glob('*.tsv')
        if not cl.stem.startswith('.')}

    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

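    # Check controlled-vocabulary columns of concepticon.tsv for valid values.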
    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:
                error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i)

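    # Collect all known BibTeX record IDs from references.bib.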
    refs = set()
    with io.open(data_path('references', 'references.bib'), encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                refs.add(match.group('id'))

    #
    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    visited = []
    tags = getattr(data, 'CL_TYPES')
    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in refs and ref not in visited:
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited += [ref]
        for tag in split_ids(cl['TAGS']):
            if tag not in tags:
                error('invalid cl type: %s' % tag, clmd, i)

    #
    # Also make sure that all sources are accompanied by a PDF, but only
    # write a warning if this is not the case.
    #
    pdfs = read_sources(data_path('sources'))
    no_pdf_for_source = []
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:
                no_pdf_for_source += [ref]
    
    if no_pdf_for_source:
        warning(
            '\n'.join(no_pdf_for_source),
            'no pdf found for {0} sources'.format(len(no_pdf_for_source)))
    
    ref_cols = {
        'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon),
        'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon),
    }

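    # Check each concept list for valid columns, numbers and Concepticon refs.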
    for name, concepts in conceptlists.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:
            error('unknown record {0} referenced'.format(name), '', '')
            cl = {}

        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', [])):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(), name, '')
            if not NUMBER_PATTERN.match(concept['NUMBER']):
                error('invalid concept NUMBER %(NUMBER)s' % concept, name, line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:
                    error('invalid value for %s: %s' % (col, concept[col]), name, line)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')