Esempio n. 1
0
    def test_Database(self):
        from clld.lib.bibtex import Record, Database

        db = Database([])
        self.assertEqual(len(db), 0)
        db = Database([Record('book', 'id')])
        self.assertEqual(db[0], db['id'])
        assert unicode(db)
        db = Database.from_file('notexisting.bib')
        self.assertEqual(len(db), 0)
        db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
        self.assertEqual(len(db), 1)
Esempio n. 2
0
    def test_Database(self):
        from clld.lib.bibtex import Record, Database

        db = Database([])
        self.assertEqual(len(db), 0)
        db = Database([Record('book', 'id')])
        self.assertEqual(db[0], db['id'])
        assert unicode(db)
        db = Database.from_file('notexisting.bib')
        self.assertEqual(len(db), 0)
        db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
        self.assertEqual(len(db), 1)
Esempio n. 3
0
    def test_Database(self):
        from clld.lib.bibtex import Record, Database

        db = Database([])
        self.assertEqual(len(db), 0)
        db = Database([Record('book', 'id')])
        self.assertEqual(db[0], db['id'])
        assert text_type(db)
        db = Database.from_file('notexisting.bib')
        self.assertEqual(len(db), 0)
        db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
        self.assertEqual(len(db), 1)
        assert '@' in db[0]['title']
        assert [r for r in db]
        self.assertRaises(NotImplementedError, db.format, 'txt')
Esempio n. 4
0
    def test_Database(self):
        from clld.lib.bibtex import Record, Database

        db = Database([])
        self.assertEqual(len(db), 0)
        db = Database([Record('book', 'id')])
        self.assertEqual(db[0], db['id'])
        assert text_type(db)
        db = Database.from_file('notexisting.bib')
        self.assertEqual(len(db), 0)
        db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
        self.assertEqual(len(db), 1)
        assert '@' in db[0]['title']
        assert [r for r in db]
        self.assertRaises(NotImplementedError, db.format, 'txt')
Esempio n. 5
0
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'),
                             lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
Esempio n. 6
0
def get_bib(args):
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    fname_to_cdstar = {}
    for type_ in ['texts', 'docs', 'data']:
        for hash_, paths in load(args.data_file('repos', type_ + '.json')).items():
            if hash_ in uploaded:
                for path in paths:
                    fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    for hash_, paths in load(args.data_file('repos', 'edmond.json')).items():
        if hash_ in uploaded:
            for path in paths:
                fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    db = Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True)
    for rec in db:
        doc = Document(rec)
        newurls = []
        for url in rec.get('url', '').split(';'):
            if not url.strip():
                continue
            if url.endswith('sequence=1'):
                newurls.append(url)
                continue
            url = URL(url.strip())
            if url.host() in ['dogonlanguages.org', 'github.com', '']:
                fname = url.path().split('/')[-1]
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                newurls.append(url.as_string())
        doc.rec['url'] = '; '.join(newurls)
        yield doc
Esempio n. 7
0
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet,
            vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)
        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
Esempio n. 8
0
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate(
        [
            ("bickel", "Balthasar Bickel", "University of Zurich"),
            ("nichols", "Johanna Nichols", "University of California, Berkeley"),
        ]
    ):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
        args.data_file("backbone_09Jan2014_directexport.tab"), newline="\r", encoding="macroman", namedtuples=True
    ):
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
Esempio n. 9
0
def test_languoid_map_and_table(selenium):
    map_ = selenium.get_map('/resource/languoid/id/ghad1239')
    map_.test_show_marker()
    dt = selenium.get_datatable('/resource/languoid/id/berb1260')
    dt.filter('doctype', 'grammar')
    dt.sort('Year')
    dt.sort('Title')
    recs = dt.get_info().filtered
    assert not os.listdir(str(selenium.downloads))
    dt.download('bib')
    time.sleep(1.5)
    bib = Database.from_file(os.path.join(str(selenium.downloads), 'glottolog-refs.bib'))
    assert recs == len(bib)
Esempio n. 10
0
 def test_languoid_map_and_table(self):
     map_ = self.get_map('/resource/languoid/id/berb1260')
     map_.test_show_marker()
     dt = self.get_datatable('/resource/languoid/id/berb1260')
     dt.filter('doctype', 'grammar')
     dt.sort('Year')
     dt.sort('Title')
     recs = dt.get_info().filtered
     assert not self.downloads.listdir()
     dt.download('bib')
     time.sleep(1.5)
     bib = Database.from_file(self.downloads.joinpath('glottolog-refs.bib'))
     self.assertEqual(recs, len(bib))
Esempio n. 11
0
 def test_languoid_map_and_table(self):
     map_ = self.get_map('/resource/languoid/id/ghad1239')
     map_.test_show_marker()
     dt = self.get_datatable('/resource/languoid/id/berb1260')
     dt.filter('doctype', 'grammar')
     dt.sort('Year')
     dt.sort('Title')
     recs = dt.get_info().filtered
     assert not self.downloads.listdir()
     dt.download('bib')
     time.sleep(1.5)
     bib = Database.from_file(self.downloads.joinpath('glottolog-refs.bib'))
     self.assertEqual(recs, len(bib))
Esempio n. 12
0
def get_obsolete_refs(args):
    """compute all refs that no longer have an equivalent in the bib file.
    """
    refs = []
    known_ids = {}
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    for rec in bib:
        known_ids[rec['glottolog_ref_id']] = 1

    for row in DBSession.query(Ref.id):
        if row[0] not in known_ids:
            refs.append(row[0])

    with open(args.data_file(args.version, 'obsolete_refs.json'), 'w') as fp:
        json.dump(refs, fp)

    return bib
Esempio n. 13
0
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'), lowercase=True)
    ext = [Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
Esempio n. 14
0
def test():
    if not data_path().exists():
        return  # pragma: no cover

    # load bibtex
    bib = Database.from_file(data_path('references', 'references.bib'))
    assert bib

    cls = {
        cl.name: read_tsv(cl, unique=None)
        for cl in conceptlists() if not cl.stem.startswith('.')}

    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:  # pragma: no cover
                error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    for source in concept_set_meta():
        specs = load(source.parent.joinpath(source.stem + '.tsv-metadata.json'))
        tsv = read_tsv(source, unique='CONCEPTICON_ID')
        cnames = [var['name'] for var in specs['tableSchema']['columns']]
        if not [n for n in cnames if n in list(tsv[0][1])]:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(source.stem), 'name')
        for i, line in tsv:
            if len(line) != len(cnames):  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(source.stem, i), 'name')
        if 'dc:references' in specs:
            all_refs.add(specs['dc:references'])

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    visited1, visited2 = set(), set()
    tags = getattr(data, 'CL_TYPES')

    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in bib.keymap and ref not in visited1:  # pragma: no cover
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited1.add(ref)
            else:  # pragma: no cover
                # we fail when author/editor, or year, or title/booktitle are missing
                if 'Title' not in bib[ref] \
                        and 'Booktitle' not in bib[ref] \
                        and ref not in visited2:
                    error('missing bibtex title in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
                if 'Author' not in bib[ref] and 'Editor' not in bib[ref]:
                    error('missing bibtex author/editor in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
                if 'Year' not in bib[ref]:
                    error('missing bibtex year in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
            all_refs.add(ref)

        for tag in split_ids(cl['TAGS']):
            if tag not in tags:  # pragma: no cover
                error('invalid cl type: %s' % tag, clmd, i)

    for i, ref in enumerate(bib.keymap):
        if ref not in all_refs:  # pragma: no cover
            error('bibtex record %s is in the references but not referenced in the data.'
                  % ref, clmd, i)

    #
    # make also sure that all sources are accompanied as pdf, but only write a
    # warning if this is not the case
    #
    pdfs = [f.stem for f in data_path('sources').glob('*.pdf')]
    no_pdf_for_source = set()
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:  # pragma: no cover
                no_pdf_for_source.add(ref)
    
    if no_pdf_for_source:  # pragma: no cover
        warning(
            '\n'.join(no_pdf_for_source),
            'no pdf found for {0} sources'.format(len(no_pdf_for_source)))
    
    ref_cols = {
        'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon),
        'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon),
    }

    for name, concepts in cls.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:  # pragma: no cover
            error('unkown record {0} referenced'.format(name), '', '')
            cl = {}

        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:  # pragma: no cover
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', '')):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(), name, '')

            for lg in split(cl.get('SOURCE_LANGUAGE', '')):
                if not concept.get(lg.upper()):  # pragma: no cover
                    error('missing source language translation %s' % lg.upper(), name, line)
            if not NUMBER_PATTERN.match(concept['NUMBER']):  # pragma: no cover
                error('invalid concept NUMBER %(NUMBER)s' % concept, name, line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:  # pragma: no cover
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (col, concept[col]), name, line)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
Esempio n. 15
0
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue
            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')] for m in
                 DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s",
                (_end, pk))

    return changes
Esempio n. 16
0
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data',
                                           'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)
    for i, name in enumerate(
        ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(data_path('references', 'references.bib'),
                                  lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(
                common.Source_files(mime_type='application/pdf',
                                    object_pk=source.pk,
                                    jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(models.ConceptSet,
                 concept.ID,
                 id=concept.ID,
                 name=concept.GLOSS,
                 description=concept.DEFINITION,
                 semanticfield=concept.SEMANTICFIELD,
                 ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(
            models.Relation(source=data['ConceptSet'][rel.SOURCE],
                            target=data['ConceptSet'][rel.TARGET],
                            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(source=data['Source'][id_],
                                         contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(common.Contributor,
                                   name,
                                   id=slug(name),
                                   name=name)
            DBSession.add(
                common.ContributionContributor(ord=i,
                                               contribution=conceptlist,
                                               contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split(
        ):
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(
                common.Contribution_data(object_pk=conceptlist.pk,
                                         key=k,
                                         value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS',
                                    getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS',
                                    None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l)
                               for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key,
                                      value=value,
                                      object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(id=fname.stem,
                                           name=md['dc:title'],
                                           description=md['dc:description'],
                                           url=md['dc:source'],
                                           jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(metaprovider=provider,
                                                  conceptset=data['ConceptSet']
                                                  [meta['CONCEPTICON_ID']],
                                                  key=k,
                                                  value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Esempio n. 17
0
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Esempio n. 18
0
def import_dataset(path,
                   data,
                   languoids,
                   invalid_features,
                   add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values

    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)

    contrib = GrambankContribution(id=basename,
                                   name=basename,
                                   desc=languoids[basename].name)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(Contributor,
                               contributor_id,
                               id=contributor_id,
                               name='%s' % contributor_name)
    DBSession.add(
        ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {
        f['properties']['glottocode']: f
        for f in md.get('features', [])
    }

    for i, row in enumerate(
            reader(path,
                   dicts=True,
                   quoting=csv.QUOTE_NONE,
                   delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature,
                                     row['Feature_ID'],
                                     id=row['Feature_ID'],
                                     name=row.get('Feature',
                                                  row['Feature_ID']))
            else:
                invalid_features.update([row['Feature_ID']])
                continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            languoid = languoids.get(row['Language_ID'])
            if languoid is None:
                print('Skipping, no Glottocode found for %s' %
                      row['Language_ID'])
                continue

            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude
            }
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry'][
                        'coordinates']

            language = data.add(GrambankLanguage,
                                row['Language_ID'],
                                id=row['Language_ID'],
                                name=gl_md['name'],
                                latitude=gl_md.get('latitude'),
                                longitude=gl_md.get('longitude'))

        domain = {de.abbr: de for de in parameter.domain}
        if not domain.get(row['Value']):
            #print "skipped", row, "not in", domain
            continue

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(ValueSet,
                          vsid,
                          id=vsid,
                          parameter=parameter,
                          language=language,
                          contribution=contrib,
                          source=row['Source'])

        name = row['Value']
        if name in domain:
            name = domain[name].name

        data.add(Value,
                 vid,
                 id=vid,
                 valueset=vs,
                 name=name,
                 description=row['Comment'],
                 domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Esempio n. 19
0
def main(args):
    data = Data()

    for rec in Database.from_file(
            data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 
        'Thiago Chacon', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t',
            namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
                models.SoundSegment,
                key,
                id=key,
                name = line.NAME,
                grapheme=line.GRAPHEME,
                aliases=line.ALIASES,
                representation=len(line.REFLEXES.split(',')),
                reflexes = line.REFLEXES,
                generated = True if line.GENERATED else False,
                unicode = line.UNICODE,
                )
    print('')
    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')



    contributions = {}
    for line in reader(data_path('datasets.tsv'),
            delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
                models.CLTSDataSet,
                line.NAME,
                id=line.NAME,
                name=line.NAME,
                description=line.DESCRIPTION,
                datatype=line.TYPE
                )
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                    source=data['Source'][id_],
                    contribution=contributions[line.NAME])
    
        
    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t",
            namedtuples=True)):
        if not i % 100: print('-', end='')
        key = line.DATASET + ':' + line.NAME+':'+line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                    id=key,
                    description=line.NAME,
                    language=english,
                    contribution=contributions[line.DATASET],
                    parameter=data['SoundSegment'][sound_id]
                    )
            data.add(
                    models.Grapheme,
                    key,
                    id=key,
                    grapheme=line.GRAPHEME,
                    bipa_grapheme=line.BIPA,
                    name=line.NAME,
                    dataset=line.DATASET,
                    datatype=line.DATATYPE,
                    frequency=line.FREQUENCY or 0,
                    image=line.IMAGE,
                    url=line.URL,
                    valueset=vs
                    )
            visited.add(key)
    print('-')
Esempio n. 20
0
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(
            data_path('references', 'references.bib'), lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(common.Source_files(
                mime_type='application/pdf',
                object_pk=source.pk,
                jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(
            models.ConceptSet,
            concept.ID,
            id=concept.ID,
            name=concept.GLOSS,
            description=concept.DEFINITION,
            semanticfield=concept.SEMANTICFIELD,
            ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(models.Relation(
            source=data['ConceptSet'][rel.SOURCE],
            target=data['ConceptSet'][rel.TARGET],
            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(
                source=data['Source'][id_], contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(
                    common.Contributor, name, id=slug(name), name=name)
            DBSession.add(common.ContributionContributor(
                ord=i, contribution=conceptlist, contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(common.Contribution_data(
                object_pk=conceptlist.pk, key=k, value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS', None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key, value=value, object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(
                id=fname.stem,
                name=md['dc:title'],
                description=md['dc:description'],
                url=md['dc:source'],
                jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(
                                metaprovider=provider,
                                conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']],
                                key=k,
                                value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Esempio n. 21
0
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(path, dicts=True, quoting=csv.QUOTE_NONE, delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name

        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
Esempio n. 22
0
def import_dataset(path, data, icons, add_missing_features = False):
    # look for metadata
    # look for sources
    # then loop over values
    
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor,
            contributor_id,
            id=contributor_id,
            name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path,
            sep=',' if 'c' in ext else '\t',
            encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(Feature, row['Feature_ID'], id=row['Feature_ID'], name=row.get('Feature', row['Feature_ID']))
            else: 
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        
        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}    
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError("For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                    row['Language_ID'],
                    row['Feature_ID'],
                    name,
                    {d: de for d, de in domain.items()}))

        data.add(Value,
            vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        print(".", end="")
        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
Esempio n. 23
0
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriotions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
Esempio n. 24
0
def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}
    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    #for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    #print 'relation added according to brugmann data'
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print 'brugmann relation for non-existing languoid'

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print ref.name, ref.id, '--', l.name, l.id
                print 'relation removed according to brugmann data'
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print ignored, 'ignored'
    print obsolete, 'obsolete'
    print changed, 'changed'
    print 'unknown codes', unknown.keys()
Esempio n. 25
0
def get_bib(args):
    return Database.from_file(args.data_dir.joinpath('scripts', 'monster-utf8.bib'))
Esempio n. 26
0
                            print '--> unknown code:', code.encode('utf8')
                    else:
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'


if __name__ == '__main__':
    args = parsed_args((('--mode',), dict(default='insert')))
    main(Database.from_file(args.data_file('refs.bib'), encoding='utf8'), args.mode)
Esempio n. 27
0
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Esempio n. 28
0
def main(args):  # pragma: no cover
    data = Data()
    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0' # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature,
            line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment,
            line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription,
            line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE'])
        )
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet,
                        (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id]
                    )
                except:
                    print(line)
                    raise
            data.add(
                models.Grapheme,
                key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs
            )
Esempio n. 29
0
def get_bib(args):
    return Database.from_file(
        args.data_dir.joinpath('scripts', 'monster-utf8.bib'))
Esempio n. 30
0
                        append(ref.languages, languoid_map[code])

            for glottocode in filter(
                    None, kw['jsondata'].get('alnumcodes', '').split(';')):
                if glottocode not in languoid_map:
                    print '--> unknown glottocode:', glottocode.encode('utf8')
                else:
                    append(ref.languages, languoid_map[glottocode])

            if not update:
                #pass
                #
                # TODO!
                #
                DBSession.add(ref)

            if i % 100 == 0:
                print i, 'records done'

            if changed:
                count += 1

        print count, 'records updated or imported'
        print skipped, 'records skipped because of lack of information'


if __name__ == '__main__':
    args = parsed_args((('--mode', ), dict(default='insert')))
    main(Database.from_file(args.data_file('refs.bib'), encoding='utf8'),
         args.mode)
Esempio n. 31
0
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(
            contrib.id,
            id=contrib.id,
            name=contrib.name,
        )
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk
    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)),
                    desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby(
        sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'],
    )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes,
                      contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data,
                  gblangs,
                  glottolog_repos=REPOS['glottolog'],
                  isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return