def test_Database(self):
    from clld.lib.bibtex import Record, Database

    db = Database([])
    self.assertEqual(len(db), 0)
    db = Database([Record('book', 'id')])
    self.assertEqual(db[0], db['id'])
    assert unicode(db)
    db = Database.from_file('notexisting.bib')
    self.assertEqual(len(db), 0)
    db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
    self.assertEqual(len(db), 1)
def test_Database(self):
    from clld.lib.bibtex import Record, Database

    db = Database([])
    self.assertEqual(len(db), 0)
    db = Database([Record('book', 'id')])
    self.assertEqual(db[0], db['id'])
    assert text_type(db)
    db = Database.from_file('notexisting.bib')
    self.assertEqual(len(db), 0)
    db = Database.from_file(TESTS_DIR.joinpath('test.bib'))
    self.assertEqual(len(db), 1)
    assert '@' in db[0]['title']
    assert [r for r in db]
    self.assertRaises(NotImplementedError, db.format, 'txt')
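# Note: a minimal, self-contained sketch of the clld.lib.bibtex API exercised by the
# tests above and the loader scripts below. The BibTeX string and the 'refs.bib' path
# are illustrative only and not taken from any of the projects quoted here.
from clld.lib.bibtex import Database, Record

# Parse a single record from a BibTeX string (Record.from_string is used the same way
# in add_sources below).
rec = Record.from_string(
    '@book{smith2001,\n  title = {A grammar},\n  author = {Smith, J.}\n}',
    lowercase=True)

# A Database can be built from Record objects in memory ...
db = Database([rec])
assert len(db) == 1 and db[0] == db['smith2001']

# ... or parsed from a file; a missing file yields an empty Database (see the test above).
db = Database.from_file('refs.bib', lowercase=True, encoding='utf8')
for r in db:
    print(r.id, r.genre, r.get('title'))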
def add_sources(args, data):
    bib = Database.from_file(args.data_file('phoible-references.bib'), lowercase=True)
    ext = [
        Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))
    ]

    for rec in chain(ext, bib):
        if rec.id not in data['Source']:
            data.add(Source, rec.id, _obj=bibtex2source(rec))

    #
    # add aliases to lookup records with bibtex keys with numeric prefixes without
    # specifying the prefix
    #
    for key in list(data['Source'].keys()):
        if '_' in key:
            no, rem = key.split('_', 1)
            try:
                int(no)
                if rem not in data['Source']:
                    data['Source'][rem] = data['Source'][key]
            except (ValueError, TypeError):
                pass
def get_bib(args):
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    fname_to_cdstar = {}
    for type_ in ['texts', 'docs', 'data']:
        for hash_, paths in load(args.data_file('repos', type_ + '.json')).items():
            if hash_ in uploaded:
                for path in paths:
                    fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]
    for hash_, paths in load(args.data_file('repos', 'edmond.json')).items():
        if hash_ in uploaded:
            for path in paths:
                fname_to_cdstar[path.split('/')[-1]] = uploaded[hash_]

    db = Database.from_file(args.data_file('repos', 'Dogon.bib'), lowercase=True)
    for rec in db:
        doc = Document(rec)
        newurls = []
        for url in rec.get('url', '').split(';'):
            if not url.strip():
                continue
            if url.endswith('sequence=1'):
                newurls.append(url)
                continue
            url = URL(url.strip())
            if url.host() in ['dogonlanguages.org', 'github.com', '']:
                fname = url.path().split('/')[-1]
                doc.files.append((fname, fname_to_cdstar[fname]))
            else:
                newurls.append(url.as_string())
        doc.rec['url'] = '; '.join(newurls)
        yield doc
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet, vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)

        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP",
        domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID language ISO639.3.2013 stock continent area latitude longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid, l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area)
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def test_languoid_map_and_table(selenium):
    map_ = selenium.get_map('/resource/languoid/id/ghad1239')
    map_.test_show_marker()

    dt = selenium.get_datatable('/resource/languoid/id/berb1260')
    dt.filter('doctype', 'grammar')
    dt.sort('Year')
    dt.sort('Title')
    recs = dt.get_info().filtered
    assert not os.listdir(str(selenium.downloads))
    dt.download('bib')
    time.sleep(1.5)
    bib = Database.from_file(os.path.join(str(selenium.downloads), 'glottolog-refs.bib'))
    assert recs == len(bib)
def test_languoid_map_and_table(self):
    map_ = self.get_map('/resource/languoid/id/berb1260')
    map_.test_show_marker()

    dt = self.get_datatable('/resource/languoid/id/berb1260')
    dt.filter('doctype', 'grammar')
    dt.sort('Year')
    dt.sort('Title')
    recs = dt.get_info().filtered
    assert not self.downloads.listdir()
    dt.download('bib')
    time.sleep(1.5)
    bib = Database.from_file(self.downloads.joinpath('glottolog-refs.bib'))
    self.assertEqual(recs, len(bib))
def test_languoid_map_and_table(self):
    map_ = self.get_map('/resource/languoid/id/ghad1239')
    map_.test_show_marker()

    dt = self.get_datatable('/resource/languoid/id/berb1260')
    dt.filter('doctype', 'grammar')
    dt.sort('Year')
    dt.sort('Title')
    recs = dt.get_info().filtered
    assert not self.downloads.listdir()
    dt.download('bib')
    time.sleep(1.5)
    bib = Database.from_file(self.downloads.joinpath('glottolog-refs.bib'))
    self.assertEqual(recs, len(bib))
def get_obsolete_refs(args):
    """compute all refs that no longer have an equivalent in the bib file."""
    refs = []
    known_ids = {}
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    for rec in bib:
        known_ids[rec['glottolog_ref_id']] = 1
    for row in DBSession.query(Ref.id):
        if row[0] not in known_ids:
            refs.append(row[0])
    with open(args.data_file(args.version, 'obsolete_refs.json'), 'w') as fp:
        json.dump(refs, fp)
    return bib
def test():
    if not data_path().exists():
        return  # pragma: no cover

    # load bibtex
    bib = Database.from_file(data_path('references', 'references.bib'))
    assert bib

    cls = {
        cl.name: read_tsv(cl, unique=None)
        for cl in conceptlists() if not cl.stem.startswith('.')}

    read_tsv(data_path('concepticon.tsv'))
    concepticon = read_tsv(data_path('concepticon.tsv'), unique='GLOSS')

    for i, cs in concepticon:
        for attr in ['SEMANTICFIELD', 'ONTOLOGICAL_CATEGORY']:
            valid = getattr(data, attr)
            value = cs[attr]
            if value and value not in valid:  # pragma: no cover
                error('invalid %s: %s' % (attr, value), data_path('concepticon.tsv'), i)

    # We collect all cite keys used to refer to references.
    all_refs = set()
    for source in concept_set_meta():
        specs = load(source.parent.joinpath(source.stem + '.tsv-metadata.json'))
        tsv = read_tsv(source, unique='CONCEPTICON_ID')
        cnames = [var['name'] for var in specs['tableSchema']['columns']]
        if not [n for n in cnames if n in list(tsv[0][1])]:  # pragma: no cover
            error('column names in {0} but not in json-specs'.format(source.stem), 'name')
        for i, line in tsv:
            if len(line) != len(cnames):  # pragma: no cover
                error('meta data {0} contains irregular number of columns in line {1}'
                      .format(source.stem, i), 'name')
        if 'dc:references' in specs:
            all_refs.add(specs['dc:references'])

    # Make sure only records in the BibTeX file references.bib are referenced by
    # concept lists.
    clmd = data_path('conceptlists.tsv')
    clids = {}
    visited1, visited2 = set(), set()
    tags = getattr(data, 'CL_TYPES')
    for i, cl in read_tsv(clmd):
        clids[cl['ID']] = cl
        for ref in split_ids(cl['REFS']):
            if ref not in bib.keymap and ref not in visited1:  # pragma: no cover
                error('unknown bibtex record "%s" referenced' % ref, clmd, i)
                visited1.add(ref)
            else:  # pragma: no cover
                # we fail when author/editor, or year, or title/booktitle are missing
                if 'Title' not in bib[ref] \
                        and 'Booktitle' not in bib[ref] \
                        and ref not in visited2:
                    error('missing bibtex title in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
                if 'Author' not in bib[ref] and 'Editor' not in bib[ref]:
                    error('missing bibtex author/editor in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
                if 'Year' not in bib[ref]:
                    error('missing bibtex year in record "%s"' % ref, clmd, i)
                    visited2.add(ref)
                all_refs.add(ref)
        for tag in split_ids(cl['TAGS']):
            if tag not in tags:  # pragma: no cover
                error('invalid cl type: %s' % tag, clmd, i)

    for i, ref in enumerate(bib.keymap):
        if ref not in all_refs:  # pragma: no cover
            error('bibtex record %s is in the references but not referenced in the data.'
                  % ref, clmd, i)

    #
    # make also sure that all sources are accompanied as pdf, but only write a
    # warning if this is not the case
    #
    pdfs = [f.stem for f in data_path('sources').glob('*.pdf')]
    no_pdf_for_source = set()
    for i, cl in read_tsv(clmd):
        for ref in split_ids(cl['PDF']):
            if ref not in pdfs:  # pragma: no cover
                no_pdf_for_source.add(ref)

    if no_pdf_for_source:  # pragma: no cover
        warning(
            '\n'.join(no_pdf_for_source),
            'no pdf found for {0} sources'.format(len(no_pdf_for_source)))

    ref_cols = {
        'CONCEPTICON_ID': set(cs[1]['ID'] for cs in concepticon),
        'CONCEPTICON_GLOSS': set(cs[1]['GLOSS'] for cs in concepticon),
    }

    for name, concepts in cls.items():
        try:
            cl = clids[name.replace('.tsv', '')]
        except KeyError:  # pragma: no cover
            error('unkown record {0} referenced'.format(name), '', '')
            cl = {}

        missing = []
        for i, (line, concept) in enumerate(concepts):
            if i == 0:  # pragma: no cover
                cols = list(concept.keys())
                try:
                    namedtuple('nt', [normalize_name(n) for n in cols])
                except ValueError as e:
                    error('%s' % e, name, line)
                for lg in split(cl.get('SOURCE_LANGUAGE', '')):
                    if lg.upper() not in cols:
                        error('missing source language col %s' % lg.upper(), name, '')

            for lg in split(cl.get('SOURCE_LANGUAGE', '')):
                if not concept.get(lg.upper()):  # pragma: no cover
                    error('missing source language translation %s' % lg.upper(), name, line)
            if not NUMBER_PATTERN.match(concept['NUMBER']):  # pragma: no cover
                error('invalid concept NUMBER %(NUMBER)s' % concept, name, line)
            for col, values in ref_cols.items():
                if col not in concept:
                    if col not in missing:  # pragma: no cover
                        error('missing column %s' % col, name)
                        missing.append(col)
                elif concept[col] and concept[col] not in values:  # pragma: no cover
                    error('invalid value for %s: %s' % (col, concept[col]), name, line)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
def main(args):  # pragma: no cover
    bib = Database.from_file(args.data_file(args.version, 'refs.bib'), encoding='utf8')
    count = 0
    skipped = 0
    changes = {}

    with transaction.manager:
        update_providers(args)
        DBSession.flush()
        provider_map = get_map(Provider)
        macroarea_map = get_map(Macroarea)
        doctype_map = get_map(Doctype)

        known_ids = set(r[0] for r in DBSession.query(Ref.pk))

        languoid_map = {}
        for l in DBSession.query(Languoid):
            if l.hid:
                languoid_map[l.hid] = l
            languoid_map[l.id] = l

        for i, rec in enumerate(bib):
            if i and i % 1000 == 0:
                print i, 'records done', count, 'changed'

            if len(rec.keys()) < 6:
                # not enough information!
                skipped += 1
                continue

            changed = False
            assert rec.get('glottolog_ref_id')
            id_ = int(rec.get('glottolog_ref_id'))
            if args.mode != 'update' and id_ in known_ids:
                continue

            ref = DBSession.query(Source).get(id_)
            update = True if ref else False

            kw = {
                'pk': id_,
                'bibtex_type': rec.genre,
                'id': str(id_),
                'jsondata': {'bibtexkey': rec.id},
            }

            for source, target in FIELD_MAP.items():
                if target is None:
                    continue
                value = rec.get(source)
                if value:
                    value = unescape(value)
                    if target:
                        kw[target] = CONVERTER.get(source, lambda x: x)(value)
                    else:
                        kw['jsondata'][source] = value

            if kw['jsondata'].get('hhtype'):
                trigger = ca_trigger(kw['jsondata']['hhtype'])
                if trigger:
                    kw['ca_doctype_trigger'], kw['jsondata']['hhtype'] = trigger

            # try to extract numeric year, startpage, endpage, numberofpages, ...
            if kw.get('year'):
                # prefer years in brackets over the first 4-digit number.
                match = PREF_YEAR_PATTERN.search(kw.get('year'))
                if match:
                    kw['year_int'] = int(match.group('year'))
                else:
                    match = YEAR_PATTERN.search(kw.get('year'))
                    if match:
                        kw['year_int'] = int(match.group('year'))

            if kw.get('publisher'):
                p = kw.get('publisher')
                if ':' in p:
                    address, publisher = [s.strip() for s in kw['publisher'].split(':', 1)]
                    if not 'address' in kw or kw['address'] == address:
                        kw['address'], kw['publisher'] = address, publisher

            if rec.get('numberofpages'):
                try:
                    kw['pages_int'] = int(rec.get('numberofpages').strip())
                except ValueError:
                    pass

            if kw.get('pages'):
                start, end, number = compute_pages(kw['pages'])
                if start is not None:
                    kw['startpage_int'] = start
                if end is not None:
                    kw['endpage_int'] = end
                if number is not None and 'pages_int' not in kw:
                    kw['pages_int'] = number

            if update:
                for k in kw.keys():
                    if k == 'pk':
                        continue
                    v = getattr(ref, k)
                    if kw[k] != v:
                        if k == 'jsondata':
                            d = ref.jsondata or {}
                            d.update(**kw[k])
                            for s, t in FIELD_MAP.items():
                                if t is None and s in d:
                                    del d[s]
                            ref.jsondata = d
                        else:
                            print k, '--', v
                            print k, '++', kw[k]
                            setattr(ref, k, kw[k])
                            changed = True
                            if ref.id in changes:
                                changes[ref.id][k] = ('%s' % v, '%s' % kw[k])
                            else:
                                changes[ref.id] = {k: ('%s' % v, '%s' % kw[k])}
            else:
                changed = True
                ref = Ref(name='%s %s' % (kw.get('author', 'na'), kw.get('year', 'nd')), **kw)

            ref.description = ref.title or ref.booktitle
            ref.name = '%s %s' % (ref.author or 'n.a.', ref.year or 'n.d.')

            def append(attr, obj):
                if obj and obj not in attr:
                    attr.append(obj)
                    return True

            a, r = update_relationship(
                ref.macroareas,
                [macroarea_map[name] for name in
                 set(filter(None, [s.strip() for s in kw['jsondata'].get('macro_area', '').split(',')]))])
            changed = changed or a or r

            for name in set(filter(None, [s.strip() for s in kw['jsondata'].get('src', '').split(',')])):
                result = append(ref.providers, provider_map[slug(name)])
                changed = changed or result

            a, r = update_relationship(
                ref.doctypes,
                [doctype_map[m.group('name')]
                 for m in DOCTYPE_PATTERN.finditer(kw['jsondata'].get('hhtype', ''))])
            changed = changed or a or r

            if not update:
                DBSession.add(ref)

            if changed:
                count += 1
                ref.doctypes_str = ', '.join(o.id for o in ref.doctypes)
                ref.providers_str = ', '.join(o.id for o in ref.providers)

    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'

    DBSession.execute("update source set description = title where description is null and title is not null;")
    DBSession.execute("update source set description = booktitle where description is null and booktitle is not null;")

    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        pk, pages, number, start = row
        _start, _end, _number = compute_pages(pages)
        if _number > 0 and _number != number:
            DBSession.execute(
                "update source set pages_int = %s, startpage_int = %s where pk = %s",
                (_number, _start, pk))
            DBSession.execute(
                "update ref set endpage_int = %s where pk = %s",
                (_end, pk))

    return changes
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(data_path('references', 'references.bib'), lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(common.Source_files(
                mime_type='application/pdf',
                object_pk=source.pk,
                jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(
            models.ConceptSet, concept.ID,
            id=concept.ID,
            name=concept.GLOSS,
            description=concept.DEFINITION,
            semanticfield=concept.SEMANTICFIELD,
            ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(models.Relation(
            source=data['ConceptSet'][rel.SOURCE],
            target=data['ConceptSet'][rel.TARGET],
            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist, cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(source=data['Source'][id_], contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(common.Contributor, name, id=slug(name), name=name)
            DBSession.add(common.ContributionContributor(
                ord=i, contribution=conceptlist, contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(common.Contribution_data(object_pk=conceptlist.pk, key=k, value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError

            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])

            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS', None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(common.Value_data(key='lang_' + key, value=value, object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(
                id=fname.stem,
                name=md['dc:title'],
                description=md['dc:description'],
                url=md['dc:source'],
                jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(
                                metaprovider=provider,
                                conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']],
                                key=k,
                                value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
def import_dataset(path, data, languoids, invalid_features, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    contrib = GrambankContribution(
        id=basename, name=basename, desc=languoids[basename].name)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(
            path, dicts=True, quoting=csv.QUOTE_NONE,
            delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature, row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                invalid_features.update([row['Feature_ID']])
                continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            languoid = languoids.get(row['Language_ID'])
            if languoid is None:
                print('Skipping, no Glottocode found for %s' % row['Language_ID'])
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        domain = {de.abbr: de for de in parameter.domain}
        if not domain.get(row['Value']):
            #print "skipped", row, "not in", domain
            continue

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        name = row['Value']
        if name in domain:
            name = domain[name].name

        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
def main(args):
    data = Data()

    for rec in Database.from_file(data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi',
                              'Thiago Chacon', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t', namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
            models.SoundSegment, key,
            id=key,
            name=line.NAME,
            grapheme=line.GRAPHEME,
            aliases=line.ALIASES,
            representation=len(line.REFLEXES.split(',')),
            reflexes=line.REFLEXES,
            generated=True if line.GENERATED else False,
            unicode=line.UNICODE,
        )
    print('')

    english = data.add(common.Language, 'eng', id='eng', name='English')

    contributions = {}
    for line in reader(data_path('datasets.tsv'), delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
            models.CLTSDataSet, line.NAME,
            id=line.NAME,
            name=line.NAME,
            description=line.DESCRIPTION,
            datatype=line.TYPE)
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                source=data['Source'][id_], contribution=contributions[line.NAME])

    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t", namedtuples=True)):
        if not i % 100:
            print('-', end='')
        key = line.DATASET + ':' + line.NAME + ':' + line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                id=key,
                description=line.NAME,
                language=english,
                contribution=contributions[line.DATASET],
                parameter=data['SoundSegment'][sound_id])
            data.add(
                models.Grapheme, key,
                id=key,
                grapheme=line.GRAPHEME,
                bipa_grapheme=line.BIPA,
                name=line.NAME,
                dataset=line.DATASET,
                datatype=line.DATATYPE,
                frequency=line.FREQUENCY or 0,
                image=line.IMAGE,
                url=line.URL,
                valueset=vs)
            visited.add(key)
    print('-')
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(
            path, dicts=True, quoting=csv.QUOTE_NONE,
            delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'],
            #    name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name
        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
def import_dataset(path, data, icons, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in pandas.io.parsers.read_csv(
            path, sep=',' if 'c' in ext else '\t', encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature, row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                print(('skip value for invalid feature %s' % row['Feature_ID']))
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print(('Skipping, no Glottocode found for %s' % row['Language_ID']))
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError(
                    "For feature {:s} in language {:s}: Name {:s} not found among domain values {:}".format(
                        row['Language_ID'], row['Feature_ID'], name,
                        {d: de for d, de in domain.items()}))

        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))
        print(".", end="")

        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')
    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(
            models.IdsLanguage, l.lg_id,
            id=l.lg_id,
            name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id,
            name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name lg_id what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i,
                type='name',
                name=l.name,
                description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in data[entity].keys()[:]:
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2'))
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriotions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
def update_reflang(args):
    with open(args.data_file('brugmann_noderefs.json')) as fp:
        brugmann_noderefs = json.load(fp)

    ignored, obsolete, changed, unknown = 0, 0, 0, {}
    languoid_map = {}
    for l in DBSession.query(Languoid).filter(Languoid.hid != None):
        languoid_map[l.hid] = l.pk

    lgcodes = {}
    for rec in Database.from_file(
            args.data_file(args.version, 'refs.bib'), encoding='utf8'):
        lgcode = ''
        for f in 'lgcode lcode lgcde lgcoe lgcosw'.split():
            if rec.get(f):
                lgcode = rec[f]
                break
        if len(lgcode) == 3 or lgcode.startswith('NOCODE_'):
            lgcode = '[' + lgcode + ']'
        lgcodes[rec.get('glottolog_ref_id', None)] = lgcode

    #for ref in DBSession.query(Ref).order_by(desc(Source.pk)).limit(10000):
    for ref in page_query(
            DBSession.query(Ref).order_by(desc(Source.pk)),
            n=10000,
            commit=True,
            verbose=True):
        # disregard iso change requests:
        if ref.description and ref.description.startswith('Change Request Number '):
            ignored += 1
            continue

        if ref.id not in lgcodes:
            # remove all language relations for refs no longer in bib!
            update_relationship(ref.languages, [])
            obsolete += 1
            continue

        language_note = lgcodes[ref.id]
        trigger = ca_trigger(language_note)
        if trigger:
            ref.ca_language_trigger, ref.language_note = trigger
        else:
            ref.language_note = language_note

        remove = brugmann_noderefs['delete'].get(str(ref.pk), [])

        # keep relations to non-language languoids:
        langs = [
            l for l in ref.languages if
            (l.level != LanguoidLevel.language or not l.active) and l.pk not in remove]
        langs_pk = [l.pk for l in langs]

        # add relations from filemaker data:
        for lpk in brugmann_noderefs['create'].get(str(ref.pk), []):
            if lpk not in langs_pk:
                l = Languoid.get(lpk, default=None)
                if l:
                    #print 'relation added according to brugmann data'
                    langs.append(l)
                    langs_pk.append(l.pk)
                else:
                    print 'brugmann relation for non-existing languoid'

        for code in set(get_codes(ref)):
            if code not in languoid_map:
                unknown[code] = 1
                continue
            lpk = languoid_map[code]
            if lpk in remove:
                print ref.name, ref.id, '--', l.name, l.id
                print 'relation removed according to brugmann data'
            else:
                if lpk not in langs_pk:
                    langs.append(DBSession.query(Languoid).get(lpk))
                    langs_pk.append(lpk)

        a, r = update_relationship(ref.languages, langs)
        if a or r:
            changed += 1

    print ignored, 'ignored'
    print obsolete, 'obsolete'
    print changed, 'changed'
    print 'unknown codes', unknown.keys()
def get_bib(args):
    return Database.from_file(args.data_dir.joinpath('scripts', 'monster-utf8.bib'))
                print '--> unknown code:', code.encode('utf8')
            else:
                append(ref.languages, languoid_map[code])

        for glottocode in filter(
                None, kw['jsondata'].get('alnumcodes', '').split(';')):
            if glottocode not in languoid_map:
                print '--> unknown glottocode:', glottocode.encode('utf8')
            else:
                append(ref.languages, languoid_map[glottocode])

        if not update:
            #pass
            #
            # TODO!
            #
            DBSession.add(ref)

        if i % 100 == 0:
            print i, 'records done'

        if changed:
            count += 1

    print count, 'records updated or imported'
    print skipped, 'records skipped because of lack of information'


if __name__ == '__main__':
    args = parsed_args((('--mode',), dict(default='insert')))
    main(Database.from_file(args.data_file('refs.bib'), encoding='utf8'), args.mode)
def main(args):  # pragma: no cover
    data = Data()

    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0'
    # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature, line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment, line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription, line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE']))
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet, (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id])
                except:
                    print(line)
                    raise
            data.add(
                models.Grapheme, key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs)
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(
            contrib.id,
            id=contrib.id,
            name=contrib.name,
        )
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk

    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)), desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby(
        sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'],
    )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes, contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}

    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data, gblangs, glottolog_repos=REPOS['glottolog'], isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family, gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return