def upgrade():
    conn = Connection(op.get_bind())

    example_map = {}
    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for comment in reader(
            data_file('lingala_valueset_comments.tab'), delimiter='\t', dicts=True):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        # Normalize vertical tabs (in-field line separators in the export)
        # to regular newlines.
        comment['Comments_on_value_assignment'] = \
            comment['Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
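# upgrade() above is a standard Alembic migration step; in a clld app it is
# typically applied with the alembic CLI, e.g.:
#
#   alembic upgrade head
#
# `op` comes from alembic, and Connection is assumed to be clld's migration
# helper providing the pk()/insert()/update() convenience wrappers used here.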
def load_ecoregions(data_file, data):
    ecoregions = jsonload(data_file('ecoregions.json'))['features']
    biome_map = {
        1: ('Tropical & Subtropical Moist Broadleaf Forests', '008001'),
        2: ('Tropical & Subtropical Dry Broadleaf Forests', '557715'),
        3: ('Tropical & Subtropical Coniferous Forests', ''),
        4: ('Temperate Broadleaf & Mixed Forests', ''),
        5: ('Temperate Conifer Forests', ''),
        6: ('Boreal Forests/Taiga', ''),
        7: ('Tropical & Subtropical Grasslands, Savannas & Shrublands', '98ff66'),
        8: ('Temperate Grasslands, Savannas & Shrublands', ''),
        9: ('Flooded Grasslands & Savannas', '0265fe'),
        10: ('Montane Grasslands & Shrublands', 'cdffcc'),
        11: ('Tundra', ''),
        12: ('Mediterranean Forests, Woodlands & Scrub', 'cc9900'),
        13: ('Deserts & Xeric Shrublands', 'feff99'),
        14: ('Mangroves', '870083'),
    }

    for eco_code, features in groupby(
            sorted(ecoregions, key=lambda e: e['properties']['eco_code']),
            key=lambda e: e['properties']['eco_code']):
        features = list(features)
        props = features[0]['properties']
        if int(props['BIOME']) not in biome_map:
            continue
        biome = data['Biome'].get(props['BIOME'])
        if not biome:
            name, color = biome_map[int(props['BIOME'])]
            biome = data.add(
                Biome, props['BIOME'],
                id=str(int(props['BIOME'])),
                name=name,
                description=color or 'ffffff')
        # Use the centroid of the largest polygon as the ecoregion's
        # representative point on the map.
        centroid = (None, None)
        f = sorted(features, key=lambda _f: _f['properties']['AREA'])[-1]
        if f['geometry']:
            coords = f['geometry']['coordinates'][0]
            if f['geometry']['type'] == 'MultiPolygon':
                coords = coords[0]
            centroid = get_center(coords)
        polygons = nfilter([_f['geometry'] for _f in features])
        data.add(
            Ecoregion, eco_code,
            id=eco_code,
            name=props['ECO_NAME'],
            description=props['G200_REGIO'],
            latitude=centroid[1],
            longitude=centroid[0],
            biome=biome,
            area=props['area_km2'],
            gbl_stat=Ecoregion.gbl_stat_map[int(props['GBL_STAT'])],
            realm=Ecoregion.realm_map[props['REALM']],
            jsondata=dict(polygons=polygons))
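# get_center() above is imported from the project's utilities. As a rough
# illustration of what it needs to do, a hypothetical stand-in could simply
# average the vertices of the outer ring — fine for placing a map marker,
# though not a true area-weighted polygon centroid:
def approximate_center(coords):
    lons = [point[0] for point in coords]
    lats = [point[1] for point in coords]
    return sum(lons) / len(lons), sum(lats) / len(lats)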
def stability(req):
    fs = jsonload(abspath_from_asset_spec('grambank:static/stability.json'))
    return {'data': fs}
def stability(req):
    fs = jsonload(abspath_from_asset_spec('culturebank:static/stability.json'))
    return {'data': fs}
def dependency(req):
    deps = jsonload(abspath_from_asset_spec('grambank:static/dependencies.json'))
    return {'data': deps}
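# stability() and dependency() above (and coverage() below) are Pyramid view
# callables: each takes the request and returns a template context dict. A
# hypothetical registration — route names, URL patterns and template paths
# are all assumptions, not taken from the project config — might look like:
def includeme(config):
    config.add_route('stability', '/stability')
    config.add_view(stability, route_name='stability',
                    renderer='grambank:templates/stability.mako')
    config.add_route('dependency', '/dependencies')
    config.add_view(dependency, route_name='dependency',
                    renderer='grambank:templates/dependency.mako')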
def coverage(req):
    gl = jsonload(
        abspath_from_asset_spec('grambank:static/stats_by_macroarea.json'))
    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for ma in gl:
        for dt in gl[ma]:
            ids = gl[ma][dt]
            isolates = select(
                [Language.__table__.c.id]).where(Language.__table__.c.id.in_(ids))
            families = select(
                [Family.__table__.c.id]).where(Family.__table__.c.id.in_(ids))
            stats[ma][dt] = dict(
                glottolog=len(ids),
                grambank=DBSession.query(
                    isolates.union(families).alias('u')).count())
        stats[ma]['total'] = {}
        for src in ['glottolog', 'grambank']:
            stats[ma]['total'][src] = \
                stats[ma]['grammar'][src] + stats[ma]['grammarsketch'][src]

    gl = jsonload(
        abspath_from_asset_spec('grambank:static/stats_by_classification.json'))
    gb_langs = set([r[0] for r in DBSession.query(Language.id)])
    cstats = OrderedDict()
    for fid, spec in sorted(gl.items(), key=lambda k: k[1]['name']):
        d = dict(
            macroareas=spec['macroareas'],
            grammar=Counter(),
            grammarsketch=Counter(),
            total=Counter(),
            covered=gb_langs.intersection(set(spec['extension'])),
            isolate=not bool(spec.get('subgroups')),
            subgroups={})
        if not spec.get('subgroups'):
            # an isolate!
            d[spec['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(spec['extension'])):
                d[spec['doctype']].update(['grambank'])
                d['total'].update(['grambank'])
        for sfid, sub in spec.get('subgroups', {}).items():
            if not sub.get('subgroups'):
                sub['name'] = '%s*' % sub['name']
            d[sub['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(sub['extension'])):
                d[sub['doctype']].update(['grambank'])
                d['total'].update(['grambank'])
            d['subgroups'][(sfid, sub['name'])] = dict(
                macroareas=spec['macroareas'],
                covered=gb_langs.intersection(set(sub['extension'])),
                grammar=Counter(),
                grammarsketch=Counter(),
                total=Counter())
            if not sub.get('subgroups'):
                # a language attached directly to the top-level family
                d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(
                    ['glottolog'])
                d['subgroups'][(sfid, sub['name'])]['total'].update(['glottolog'])
                if gb_langs.intersection(set(sub['extension'])):
                    d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(
                        ['grambank'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(['grambank'])
            for ssfid, ssub in sub.get('subgroups', {}).items():
                if ssub['doctype']:
                    d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(
                        ['glottolog'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(
                        ['glottolog'])
                    if gb_langs.intersection(set(ssub['extension'])):
                        d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(
                            ['grambank'])
                        d['subgroups'][(sfid, sub['name'])]['total'].update(
                            ['grambank'])
        cstats[(fid, spec['name'])] = d
    return dict(
        stats=stats,
        cstats=cstats,
        macroareas=jsonload(
            abspath_from_asset_spec('grambank:static/stats_macroareas.json')))
def import_dataset(path, data, icons, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    try:
        contrib = CulturebankContribution(
            id=basename, name=basename, desc=glottolog.languoid(basename).name)
    except Exception:
        print("Basename {:s} did not match a glottolog languoid, skipped.".format(basename))
        return

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}
    for i, row in pandas.io.parsers.read_csv(
            path, sep=',' if 'c' in ext else '\t', encoding='utf-16').iterrows():
        if pandas.isnull(row['Value']) or pandas.isnull(row['Feature_ID']):
            print("Expected columns not found: ", row)
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature, row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                print('skip value for invalid feature %s' % row['Feature_ID'])
                continue

        language = data['CulturebankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            try:
                languoid = glottolog.languoid(row['Language_ID'])
            except AttributeError:
                print('Skipping, no Glottocode found for %s' % row['Language_ID'])
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']
            language = data.add(
                CulturebankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name
        else:
            # Values may be read as non-strings (e.g. numbers); retry the
            # domain lookup on the string representation.
            name = str(name)
            if name in domain:
                name = domain[name].name
            else:
                raise ValueError(
                    "For feature {:s} in language {:s}: value {:s} not found among "
                    "domain values {:}".format(
                        row['Feature_ID'], row['Language_ID'], name,
                        {d: de for d, de in domain.items()}))

        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))
        print(".", end="")

        if vs.source is not None:
            for key, src in list(data['Source'].items()):
                if key in vs.source:
                    ValueSetReference(valueset=vs, source=src, key=key)
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)

    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
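# In clld projects, a main() like the one above is conventionally wired up
# through the initializedb entry point; assuming the usual scaffold, the
# module would end with something like:
#
#   if __name__ == '__main__':
#       initializedb(create=main, prime_cache=prime_cache)
#
# where initializedb comes from clld.scripts.util and prime_cache is the
# project's (possibly empty) cache-priming hook — both assumptions here.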
def import_dataset(path, data, languoids, invalid_features, add_missing_features=False):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    contrib = GrambankContribution(
        id=basename, name=basename, desc=languoids[basename].name)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}
    for i, row in enumerate(reader(
            path, dicts=True, quoting=csv.QUOTE_NONE,
            delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            if add_missing_features:
                parameter = data.add(
                    Feature, row['Feature_ID'],
                    id=row['Feature_ID'],
                    name=row.get('Feature', row['Feature_ID']))
            else:
                invalid_features.update([row['Feature_ID']])
                continue

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            languoid = languoids.get(row['Language_ID'])
            if languoid is None:
                print('Skipping, no Glottocode found for %s' % row['Language_ID'])
                continue
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']
            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        domain = {de.abbr: de for de in parameter.domain}
        if not domain.get(row['Value']):
            # skip values which are not in the feature's domain
            continue

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        name = row['Value']
        if name in domain:
            name = domain[name].name
        data.add(
            Value, vid,
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
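# A hypothetical driver for import_dataset() above: iterate over a directory
# of per-contribution value files, collecting unknown feature IDs in a
# Counter (from collections, as used elsewhere in this module). The directory
# layout and file extensions are assumptions, not part of the project:
def import_all(datadir, data, languoids):
    invalid_features = Counter()
    for fname in sorted(os.listdir(datadir)):
        if fname.endswith(('.tsv', '.csv')):
            import_dataset(
                os.path.join(datadir, fname), data, languoids, invalid_features)
    if invalid_features:
        print('unknown feature IDs:', invalid_features.most_common())
    return invalid_features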
def coverage(req):
    gl = jsonload(
        abspath_from_asset_spec('culturebank:static/stats_by_macroarea.json'))
    stats = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for ma in gl:
        for dt in gl[ma]:
            ids = gl[ma][dt]
            isolates = select(
                [Language.__table__.c.id]).where(Language.__table__.c.id.in_(ids))
            families = select(
                [Family.__table__.c.id]).where(Family.__table__.c.id.in_(ids))
            stats[ma][dt] = dict(
                glottolog=len(ids),
                culturebank=DBSession.query(
                    isolates.union(families).alias('u')).count())
        stats[ma]['total'] = {}
        for src in ['glottolog', 'culturebank']:
            stats[ma]['total'][src] = \
                stats[ma]['grammar'][src] + stats[ma]['grammarsketch'][src]

    gl = jsonload(
        abspath_from_asset_spec('culturebank:static/stats_by_classification.json'))
    gb_langs = set([r[0] for r in DBSession.query(Language.id)])
    cstats = OrderedDict()
    for fid, spec in sorted(gl.items(), key=lambda k: k[1]['name']):
        d = dict(
            macroareas=spec['macroareas'],
            grammar=Counter(),
            grammarsketch=Counter(),
            total=Counter(),
            covered=gb_langs.intersection(set(spec['extension'])),
            isolate=not bool(spec.get('subgroups')),
            subgroups={})
        if not spec.get('subgroups'):
            # an isolate!
            d[spec['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(spec['extension'])):
                d[spec['doctype']].update(['culturebank'])
                d['total'].update(['culturebank'])
        for sfid, sub in spec.get('subgroups', {}).items():
            if not sub.get('subgroups'):
                sub['name'] = '%s*' % sub['name']
            d[sub['doctype']].update(['glottolog'])
            d['total'].update(['glottolog'])
            if gb_langs.intersection(set(sub['extension'])):
                d[sub['doctype']].update(['culturebank'])
                d['total'].update(['culturebank'])
            d['subgroups'][(sfid, sub['name'])] = dict(
                macroareas=spec['macroareas'],
                covered=gb_langs.intersection(set(sub['extension'])),
                grammar=Counter(),
                grammarsketch=Counter(),
                total=Counter())
            if not sub.get('subgroups'):
                # a language attached directly to the top-level family
                d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(
                    ['glottolog'])
                d['subgroups'][(sfid, sub['name'])]['total'].update(['glottolog'])
                if gb_langs.intersection(set(sub['extension'])):
                    d['subgroups'][(sfid, sub['name'])][sub['doctype']].update(
                        ['culturebank'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(
                        ['culturebank'])
            for ssfid, ssub in sub.get('subgroups', {}).items():
                if ssub['doctype']:
                    d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(
                        ['glottolog'])
                    d['subgroups'][(sfid, sub['name'])]['total'].update(
                        ['glottolog'])
                    if gb_langs.intersection(set(ssub['extension'])):
                        d['subgroups'][(sfid, sub['name'])][ssub['doctype']].update(
                            ['culturebank'])
                        d['subgroups'][(sfid, sub['name'])]['total'].update(
                            ['culturebank'])
        cstats[(fid, spec['name'])] = d
    return dict(
        stats=stats,
        cstats=cstats,
        macroareas=jsonload(
            abspath_from_asset_spec('culturebank:static/stats_macroareas.json')))
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)

    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)