def main(args):
    data = Data()
    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet, vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)

        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
def main(args):
    features = reader(args.data_file('grambank_features.csv'), dicts=True)
    features = [GBFeature(f) for f in features]
    features = {'%s' % int(f.id[2:]): f for f in features}

    errors = []
    db = create_engine('postgresql://robert@/glottolog3')
    for l in DBSession.query(Language):
        if l.id == 'qgr':
            continue
        gc = l.glottocode
        ma = db.execute("""
select m.id from macroarea as m, languoidmacroarea as lm, language as l
where m.pk = lm.macroarea_pk and lm.languoid_pk = l.pk and l.id = '%s';""" % gc).fetchone()[0]
        if ma == 'pacific':
            ma = 'papunesia'
        errors.extend(export(args, l, features, gc, ma))

    with UnicodeWriter(args.data_file('na_errors.tsv'), delimiter=b'\t') as writer:
        writer.writerow(['Language', 'Feature', 'Value', 'Source', 'Comment'])
        writer.writerows(errors)
def main(args, reload=False):
    species = {}
    db = args.data_file('theplantlist', 'db.json')

    if reload:
        for a in bs(get('/1.1/browse/-/')).find('ul', id='nametree').find_all('a'):
            with iopen(
                    args.data_file('theplantlist', a.text + '.csv'),
                    'w', encoding='utf8') as fp:
                fp.write(get(a['href'] + a.text + '.csv'))

    if db.exists():
        with open(db) as fp:
            species = json.load(fp)
    else:
        for p in args.data_file('theplantlist').files('*.csv'):
            for row in reader(p, namedtuples=True, delimiter=','):
                if row.Taxonomic_status_in_TPL == 'Accepted':
                    id_ = slug(row.Genus + row.Species)
                    species[id_] = row.ID
        with open(db, 'w') as fp:
            json.dump(species, fp)

    with transaction.manager:
        found = 0
        for p in DBSession.query(Parameter):
            id_ = slug(p.name)
            if id_ in species:
                found += 1
                p.tpl_id = species[id_]
        print(found)
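# The `get` helper used by main() above is defined elsewhere. A minimal sketch of what it
# might look like, assuming it fetches relative paths from The Plant List website with
# `requests` (both the base URL and the library choice are assumptions, not the original code):
import requests

TPL_BASE_URL = 'http://www.theplantlist.org'

def get(path):
    # return the response body for a path like '/1.1/browse/-/' or a per-genus CSV download
    return requests.get(TPL_BASE_URL + path).text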
def upgrade():
    conn = Connection(op.get_bind())

    example_map = {}
    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(
            reader(data_file('lingala_valueset_comments.tab'), dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment[
            'Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
def download(args):
    data = dict(wikipedia={}, multitree=defaultdict(list))
    for item in reader(args.data_file(DATA_FILE), namedtuples=True):
        if item.Glottolog and GC_PATTERN.match(item.Glottolog.strip()):
            data['wikipedia'][item.Glottolog.strip()] = item.Wiki.strip()
            for code in ll_codes(item):
                data['multitree'][item.Glottolog.strip()].append(code)
    return data
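# GC_PATTERN and ll_codes are defined elsewhere in the module. A sketch of compatible
# definitions, assuming Glottolog's four-characters-plus-four-digits code format and a
# semicolon-separated Multitree column (the column name `Multitree` is an assumption):
import re

GC_PATTERN = re.compile(r'[a-z0-9]{4}[0-9]{4}$')

def ll_codes(item):
    # hypothetical helper: split a Multitree code column into individual codes
    return [c.strip() for c in (item.Multitree or '').split(';') if c.strip()]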
def upgrade():
    csv = path(phoible.__file__).dirname().joinpath(
        "..", "data", "InventoryID-InternetArchive.csv")
    ia_urls = {row[0]: row[1] for row in reader(csv) if row[1] != "NA"}

    conn = Connection(op.get_bind())
    for id_, url in ia_urls.items():
        pk = conn.pk(Contribution, id_)
        conn.update(Inventory, dict(internetarchive_url=url), pk=pk)
def get_lginfo(args, filter=None):
    return [
        (r.id, r) for r in dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_lginfo.tab'),
            fieldnames=['id', 'longitude', 'latitude', 'macro_area', 'year'],
            namedtuples=True)
        if filter is None or filter(r)]
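# Minimal usage sketch (the filter below is illustrative, not from the original code):
# keep only rows that actually come with coordinates.
lginfo = get_lginfo(args, filter=lambda r: r.longitude and r.latitude)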
def test_reader(self):
    from clld.lib.dsv import reader

    lines = ['first\tline', 'sücond\tläneß']
    encoded_lines = [l.encode('utf8') for l in lines]
    csv_lines = [l.replace('\t', ',') for l in lines]

    def check(r):
        res = list(r)
        assert len(res) == 2
        assert res[1][1] == 'läneß'

    check(reader(lines))
    for lt in ['\n', '\r\n', '\r']:
        check(reader(StringIO(str(lt).join(encoded_lines))))
    check(reader(TESTS_DIR.joinpath('csv.txt'), delimiter=','))

    res = list(reader(TESTS_DIR.joinpath('test.tab'), namedtuples=True))
    assert res[0].a_name == 'b'

    # Missing column values should be set to None:
    assert res[2].a_name is None

    r = list(reader(lines, dicts=True))
    assert len(r) == 1 and r[0]['first'] == 'sücond'
    r = list(reader(lines, namedtuples=True))
    assert len(r) == 1 and r[0].first == 'sücond'
    r = list(reader(csv_lines, namedtuples=True, delimiter=','))
    assert len(r) == 1 and r[0].first == 'sücond'
def main(args):
    mapping = {}
    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        if row.GlyphID not in mapping:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            mapping[int(row.GlyphID)] = b16encode(md5(description).digest())

    with open(args.data_file('segment_id_mapping.txt'), 'w') as fp:
        for gid in sorted(mapping.keys()):
            fp.write(' ("%s", "%s"),\n' % (gid, mapping[gid]))
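# Worked example of the id derivation used above, as one might write it under Python 3,
# where the description must be encoded to bytes before hashing (the original script
# predates this; the phoneme below is made up):
from base64 import b16encode
from hashlib import md5
import unicodedata

phoneme = 'ã'
description = ' - '.join(unicodedata.name(c) for c in phoneme)
segment_id = b16encode(md5(description.encode('utf-8')).digest())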
def update(args):
    pid, cid = 'vitality', 'unesco'
    count = 0
    notfound = {}

    contrib = common.Contribution.get(cid, default=None)
    if not contrib:
        contrib = common.Contribution(
            id=cid,
            name='Atlas of the World’s Languages in Danger',
            description='Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas')
    param = common.Parameter.get(pid, default=None)
    if param is None:
        param = common.Parameter(id=pid, name='Degree of endangerment')

    domain = {de.name: de for de in param.domain}
    for i, spec in enumerate(VITALITY_VALUES):
        name, desc = spec
        if name not in domain:
            number = i + 1
            domain[name] = common.DomainElement(
                id='%s-%s' % (pid, number),
                name=name,
                description=desc,
                number=number,
                parameter=param)

    valuesets = {vs.id: vs for vs in param.valuesets}

    for item in reader(args.data_file(DATA_FILE), dicts=True):
        if item['ISO639-3 codes']:
            for code in item['ISO639-3 codes'].split(','):
                code = code.strip()
                lang = Languoid.get(code, key='hid', default=None)
                if lang:
                    count += 1
                    item['url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code
                    lang.update_jsondata(unesco=item)
                    de = domain[item['Degree of endangerment']]
                    vsid = '%s-%s' % (pid, lang.id)
                    vs = valuesets.get(vsid)
                    if not vs:
                        vs = common.ValueSet(
                            id='vitality-%s' % lang.id,
                            parameter=param,
                            contribution=contrib,
                            language=lang)
                        DBSession.add(common.Value(valueset=vs, name=de.name, domainelement=de))
                        valuesets[vsid] = vs
                    else:
                        vs.values[0].domainelement = de
                else:
                    notfound[code] = 1

    print('assigned', count, 'unesco urls')
    print('missing iso codes:', notfound)
def import_features_collaborative_sheet(datadir, data):
    for feature in reader(
            os.path.join(datadir, 'features_collaborative_sheet.tsv'), dicts=True):
        feature = FeatureSpec(feature)
        f = data.add(
            Feature, feature.id,
            id=feature.id,
            name=feature.name,
            doc=feature.doc,
            patron=feature.patron,
            std_comments=feature.std_comments,
            name_french=feature.name_french,
            jl_relevant_unit=feature.jl_relevant_unit,
            jl_function=feature.jl_function,
            jl_formal_means=feature.jl_formal_means,
            hard_to_deny=feature.hard_to_deny,
            prone_misunderstanding=feature.prone_misunderstanding,
            requires_extensive_data=feature.requires_extensive_data,
            last_edited=feature.last_edited,
            other_survey=feature.other_survey)
        for i, (deid, desc) in enumerate(feature.domain.items()):
            DomainElement(
                id='%s-%s' % (f.id, deid),
                parameter=f,
                abbr=deid,
                name='%s - %s' % (deid, desc),
                number=int(deid) if deid != '?' else 999,
                description=desc,
                jsondata=dict(icon=ORDERED_ICONS[i].name))
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(
            open(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)),
                parameter=parameter,
                name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print('--->', row.Language)
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  # '%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
def update(args):
    codes = {}
    for lang in reader(args.data_file(DATA_FILE), namedtuples=True):
        codes[lang.LangID] = 1

    count = 0
    for lang in DBSession.query(Languoid)\
            .filter(Languoid.hid != None)\
            .filter(not_(icontains(Languoid.hid, 'nocode'))):
        if lang.hid in codes:
            lang.update_jsondata(ethnologue=LANGUAGE_URL + lang.hid)
        else:
            lang.update_jsondata(ethnologue=None)
            count += 1
    print(count, 'iso codes have no ethnologue code')

    ethnologue = args.json

    leafsets = defaultdict(list)
    for id_, doc in ethnologue['docs'].items():
        for sid, spec in get_classification(id_, doc).items():
            leafs = sorted(set([p[0] for p in spec[2]]))
            if leafs:
                leafsets[tuple(leafs)].append(sid)

    all = 0
    matched = 0
    for family in DBSession.query(Languoid)\
            .filter(Languoid.level == LanguoidLevel.family)\
            .filter(Language.active == True):
        leafs = []
        all += 1
        for row in DBSession.query(TreeClosureTable.child_pk, Languoid.hid)\
                .filter(TreeClosureTable.parent_pk == family.pk)\
                .filter(TreeClosureTable.child_pk == Languoid.pk)\
                .filter(Languoid.hid != None):
            if len(row[1]) == 3:
                leafs.append(row[1])
        leafs = tuple(sorted(set(leafs)))
        for i, subgroup in enumerate(leafsets.get(leafs, [])):
            if i == 0:
                matched += 1
                family.update_jsondata(ethnologue=SUBGROUP_URL + subgroup)
                break
    print(matched, 'of', all, 'families have an exact counterpart in ethnologue!')
def coordinates(args, languages):
    diff = lambda x, y: abs(x - y) > 0.001

    for hid, lon, lat in dsv.reader(args.data_file("coordinates.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        language = languages[hid]
        lat, lon = map(float, [lat, lon])

        if not language.latitude or not language.longitude:
            language.longitude, language.latitude = lon, lat
            args.log.info("++ %s" % language.id)
        elif diff(language.longitude, lon) or diff(language.latitude, lat):
            language.longitude, language.latitude = lon, lat
            args.log.info("~~ %s" % language.id)
def from_csv(data_file, model, data, name=None, visitor=None, filter_=None):
    if filter_ is None:
        filter_ = lambda r: True
    kw = {'delimiter': ',', 'lineterminator': str('\r\n'), 'quotechar': '"'}
    for fname in data_files(data_file, (name or model.__csv_name__) + '.csv'):
        for row in list(reader(fname, **kw))[1:]:
            if row and filter_(row):
                try:
                    obj = model.from_csv(row, data)
                except (KeyError, IndexError):
                    obj = None
                    print(fname)
                    print(row)
                    raise
                if obj:
                    obj = data.add(model, row[0], _obj=obj)
                    if visitor:
                        visitor(obj, row, data)
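# Minimal usage sketch (the csv name and the filter are illustrative assumptions): load
# all rows of `language.csv` into `common.Language` objects, skipping rows whose first
# column is empty.
from_csv(data_file, common.Language, data, name='language', filter_=lambda row: bool(row[0]))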
def main(args):
    sources = jsonload(args.data_file('sources.json'))
    fields = ['href', 'name', 'author', 'iso', 'source', 'notes', 'wordlist']

    with UnicodeWriter(args.data_file('..', 'sources.csv')) as fp:
        fp.writerow(fields)
        for source in sorted(sources, key=lambda i: i['name']):
            fp.writerow([source.get(f, '') for f in fields])

    return

    ethnologue_names = {
        r.ISO_639: r.Language_Name for r in reader(
            args.data_file('..', '..', 'ethnologue-17-data', 'Table_of_Languages.tab'),
            namedtuples=True)}

    # ASJP name for language, Ethnologue's name, ISO code
    rows = [['ASJP Name', 'Ethnologue name', 'ISO code']]
    subquery = DBSession.query(LanguageSource.language_pk).distinct().subquery()
    for i, l in enumerate(
            DBSession.query(Doculect).order_by(Doculect.pk)
            .filter(not_(Doculect.pk.in_(subquery)))):
        rows.append([l.id, ethnologue_names.get(l.code_iso, ''), l.code_iso or ''])
        #print i

    with UnicodeWriter(args.data_file('..', 'doculects_without_source.csv')) as fp:
        fp.writerows(rows)
def countries(args, languages, stats):
    """update relations between languages and countries they are spoken in."""
    cname_map = {
        'Tanzania': 'Tanzania, United Republic of',
        'Russia': 'Russian Federation',
        'South Korea': 'Korea, Republic of',
        'Iran': 'Iran, Islamic Republic of',
        'Syria': 'Syrian Arab Republic',
        'Laos': "Lao People's Democratic Republic",
        r"C\^ote d'Ivoire": "Côte d'Ivoire",
        'British Virgin Islands': 'Virgin Islands, British',
        'Bolivia': 'Bolivia, Plurinational State of',
        'Venezuela': 'Venezuela, Bolivarian Republic of',
        'Democratic Republic of the Congo': 'Congo, The Democratic Republic of the',
        'Micronesia': 'Micronesia, Federated States of',
    }

    countries = {}
    for row in dsv.reader(
            args.data_dir.joinpath('languoids', 'forkel_countries.tab'), encoding='latin1'):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key='hid', default=None)
        if not languages[hid]:
            args.log.warn('unknown hid in countries.tab: %s' % hid)
            continue
        l = languages[hid]
        if l.countries:
            # we only add country relations to new languages or languages which have none.
            continue
        for cname in set(cnames):
            if cname not in countries:
                q = cname if '(' not in cname else cname.split('(')[0].strip()
                countries[cname] = Country.get(cname_map.get(q, q), key='name', default=None)
            if not countries[cname]:
                args.log.warn('unknown country name in countries.tab: %s' % cname)
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                stats.update(['countries'])
def main(args):  # pragma: no cover
    # we merge information about extinct languages from unesco and Harald.
    extinct = dict(list(dsv.reader(args.data_file('extinct.tab'))))

    with transaction.manager:
        query = language_query().options(
            joinedload_all(Language.valuesets, ValueSet.values))
        # loop over active, established languages with geo-coords
        for l in page_query(query, n=100, verbose=True):
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            sources = DBSession.query(Ref).join(LanguageSource)\
                .filter(LanguageSource.language_pk == l.pk)\
                .filter(Ref.ca_doctype_trigger == None)\
                .filter(Ref.ca_language_trigger == None)\
                .options(joinedload(Ref.doctypes))
            sources = sorted(map(Source, sources))

            # keep the overall med
            # note: this source may not be included in the potential meds computed below,
            # e.g. because it may not have a year.
            med = sources[0].__json__() if sources else None

            # now we have to compute meds respecting a cut-off year.
            # to do so, we collect eligible sources per year and then
            # take the med of this collection.
            potential_meds = []

            # we only have to loop over publication years within all sources, because
            # only in these years something better might have come along.
            for year in set(s.year for s in sources if s.year):
                # let's see if something better was published!
                eligible = [s for s in sources if s.year and s.year <= year]
                if eligible:
                    potential_meds.append(sorted(eligible)[0])

            # we store the precomputed sources information as jsondata:
            l.update_jsondata(
                endangerment='Extinct' if l.hid in extinct else l.endangerment,
                med=med,
                sources=[s.__json__() for s in
                         sorted(set(potential_meds), key=lambda s: -s.year)])
def countries(args, languages):
    count = 0
    countries = {}
    for row in dsv.reader(args.data_file("countries.tab"), encoding="latin1"):
        hid, cnames = row[0], row[1:]
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        l = languages[hid]
        if l.countries:
            continue
        for cname in set(cnames):
            if cname not in countries:
                countries[cname] = Country.get(cname, key="name", default=None)
            if not countries[cname]:
                continue
            c = countries[cname]
            if c.id not in [_c.id for _c in l.countries]:
                l.countries.append(c)
                count += 1
    print("countries:", count, "relations added")
def macroareas(args, languages):
    ma_map = get_map(Macroarea)

    # we store references to languages to make computation of cumulated macroareas for
    # families easier
    lang_map = {}
    for hid, macroarea in dsv.reader(args.data_file("macroareas.tab")):
        if hid not in languages:
            languages[hid] = Languoid.get(hid, key="hid", default=None)
        if not languages[hid]:
            continue
        lang_map[languages[hid].pk] = languages[hid]
        update_relationship(languages[hid].macroareas, [ma_map[macroarea]], log=args.log)

    for family in (
            DBSession.query(Languoid)
            .filter(Languoid.level == LanguoidLevel.family)
            .filter(Language.active == True)):
        mas = []
        for lang in DBSession.query(TreeClosureTable.child_pk)\
                .filter(TreeClosureTable.parent_pk == family.pk):
            if lang[0] in lang_map:
                mas.extend(lang_map[lang[0]].macroareas)
        update_relationship(family.macroareas, mas, log=args.log)
    print("macroareas done")
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}

    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }

    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))

            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
def import_dataset(path, data, icons):
    # look for metadata
    # look for sources
    # then loop over values
    dirpath, fname = os.path.split(path)
    basename, ext = os.path.splitext(fname)
    glottolog = Glottolog()

    contrib = Contribution(id=basename, name=basename)

    md = {}
    mdpath = path + '-metadata.json'
    if os.path.exists(mdpath):
        md = jsonload(mdpath)
    contributor_name = HumanName(md.get('contributed_datapoint', 'Team NTS'))
    contributor_id = slug(contributor_name.last + contributor_name.first)
    contributor = data['Contributor'].get(contributor_id)
    if not contributor:
        contributor = data.add(
            Contributor, contributor_id, id=contributor_id, name='%s' % contributor_name)
    DBSession.add(ContributionContributor(contribution=contrib, contributor=contributor))

    bibpath = os.path.join(dirpath, basename + '.bib')
    if os.path.exists(bibpath):
        for rec in Database.from_file(bibpath):
            if rec['key'] not in data['Source']:
                data.add(Source, rec['key'], _obj=bibtex2source(rec))

    languages = {f['properties']['glottocode']: f for f in md.get('features', [])}

    for i, row in enumerate(reader(
            path, dicts=True, quoting=csv.QUOTE_NONE,
            delimiter=',' if 'c' in ext else '\t')):
        if not row['Value'] or not row['Feature_ID']:
            continue
        vsid = '%s-%s-%s' % (basename, row['Language_ID'], row['Feature_ID'])
        vid = row.get('ID', '%s-%s' % (basename, i + 1))

        parameter = data['Feature'].get(row['Feature_ID'])
        if parameter is None:
            print('skip value for invalid feature %s' % row['Feature_ID'])
            continue
            #parameter = data.add(
            #    Feature, row['Feature_ID'], id=row['Feature_ID'],
            #    name=row.get('Feature', row['Feature_ID']))

        language = data['GrambankLanguage'].get(row['Language_ID'])
        if language is None:
            # query glottolog!
            languoid = glottolog.languoid(row['Language_ID'])
            gl_md = {
                'name': languoid.name,
                'longitude': languoid.longitude,
                'latitude': languoid.latitude}
            lmd = languages.get(row['Language_ID'])
            if lmd:
                if lmd.get('properties', {}).get('name'):
                    gl_md['name'] = lmd['properties']['name']
                if lmd.get('geometry', {}).get('coordinates'):
                    gl_md['longitude'], gl_md['latitude'] = lmd['geometry']['coordinates']

            language = data.add(
                GrambankLanguage, row['Language_ID'],
                id=row['Language_ID'],
                name=gl_md['name'],
                latitude=gl_md.get('latitude'),
                longitude=gl_md.get('longitude'))

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=parameter,
                language=language,
                contribution=contrib,
                source=row['Source'])

        domain = {de.abbr: de for de in parameter.domain}
        name = row['Value']
        if name in domain:
            name = domain[name].name
        Value(
            id=vid,
            valueset=vs,
            name=name,
            description=row['Comment'],
            domainelement=domain.get(row['Value']))

        for key, src in data['Source'].items():
            if key in vs.source:
                ValueSetReference(valueset=vs, source=src, key=key)
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']

        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0] if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib
    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict((row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
        'C**T': 'clitic',
        'IMPF': 'imperfect',
        'INTERM': 'intermediate',
        'NCOMPL': 'noncompletive',
        'NONFUT': 'nonfuture',
        'NPROX': 'nonproximal',
        'NSG': 'nonsingular',
        'PP': 'past participle',
        'PROP': 'proprietive',
        'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(
            args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']

        attrs = {}
        jsondata = {}
        for attr, field in {
            'Additional_information': 'note',
            'Article_title': 'title',
            'Book_title': 'booktitle',
            'City': 'address',
            'Editors': 'editor',
            'Full_reference': None,
            'Issue': None,
            'Journal': 'journal',
            'Language_codes': None,
            'LaTeX_cite_key': None,
            'Pages': 'pages',
            'Publisher': 'publisher',
            'Reference_type': 'type',
            'School': 'school',
            'Series_title': 'series',
            'URL': 'url',
            'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value

        p = data.add(
            common.Source, row['Reference_ID'],
            id=str(row['Reference_ID']),
            name=row['Reference_name'],
            description=title,
            author=row['Authors'],
            year=year,
            year_int=year_int,
            bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'),
            jsondata=jsondata,
            **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()
    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))

    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(common.Language_data(
                object_pk=lect.pk, ord=i, key=item[0], value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print('unchecked! ---', row['Language_name'])

        desc = row.get('Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(
            row['Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'])

        c = data.add(
            models.ApicsContribution, row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype)
            else:
                print(label, 'missing for:', row['Language_name'])

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(
                    common.Identifier, 'iso:%s' % row['ISO_code'],
                    id=row['ISO_code'].lower(),
                    name=row['ISO_code'].lower(),
                    type=common.IdentifierType.iso.value)
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(
                common.Identifier, 'gc:%s' % glottocodes[lect.id],
                id=glottocodes[lect.id],
                name=glottocodes[lect.id],
                type=common.IdentifierType.glottolog.value)
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(
                    common.Identifier, row['Language_name_ethnologue'],
                    id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'],
                    name=row['Language_name_ethnologue'],
                    type='ethnologue')
            DBSession.add(common.LanguageIdentifier(
                language=data['Lect'][row['Language_ID']],
                identifier=data['Identifier'][row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args, data, id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip() or None},
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(common.SentenceReference(
                    sentence=p,
                    source=source,
                    key=source.id,
                    description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(common.ContributionReference(
            contribution=data['ApicsContribution'][row['Language_ID']],
            source=source,
            description=row['Pages'],
            key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(
            models.Feature, row['Feature_code'],
            name=row['Feature_name'],
            id=id_,
            description=desc,
            markup_description=normalize_markup(
                row['z_calc_Feature_annotation_publication_CSS']),
            feature_type='primary',
            multivalued=row['Value_relation_type'] != 'Single',
            area=row['Feature_area'],
            wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement, '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(models.FeatureAuthor(
                    ord=i + 1, contributor=editors[name.strip()]))

    DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]] \
                = str(feature_count)
        p = data.add(
            models.Feature, row['Segment_feature_number'],
            name=name,
            id=str(feature_count),
            feature_type='segment',
            area='Vowels' if truth(row['Vowel']) else (
                'Obstruent consonants' if truth(row['Obstruent'])
                else 'Sonorant consonants'),
            jsondata=dict(
                number=int(row['Segment_feature_number']),
                vowel=truth(row['Vowel']),
                consonant=truth(row['Consonant']),
                obstruent=truth(row['Obstruent']),
                core_list=truth(row['Core_list_segment']),
                symbol=symbol,
            ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(
                common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]),
                id='%s-%s' % (p.id, i),
                name=spec[0],
                parameter=p,
                jsondata={'color': spec[1]},
                number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(
            models.Feature, row['Sociolinguistic_feature_code'],
            name=row['Sociolinguistic_feature_name'],
            id='%s' % feature_count,
            description=row['Sociolinguistic_feature_annotation'],
            area='Sociolinguistic',
            feature_type='sociolinguistic')

        names = {}
        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(
                common.DomainElement, id_,
                id='%s-%s' % (p.id, i),
                name=name,
                parameter=p,
                number=i,
                jsondata={'color': colors.get(
                    row['Value%s_colour_ID' % i], colors.values()[i])})

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet, id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value, id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )

        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(
                args, data, '%s-p%s' % (lang.id, data['Feature'][number].id),
                id='%s-%s' % (lang.id, example_count[row['Language_ID']]),
                name=row['Example_word'],
                description=row['Example_word_gloss'],
                language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(common.ValueSetReference(
                valueset=valueset, source=source, key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' % i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id', _prefix)],
                          '--> no domainelement!')
                    continue

                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement'][
                        '%s-%s' % (row[prefix('feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' % i])
                    if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number': wals_value_number.pop(
                            row[prefix('data_record_id', _prefix)])}
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value, sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(common.ValueSetReference(
                            valueset=valueset, source=r.source, key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(common.ValueSetReference(
                        valueset=vs,
                        source=source,
                        key=source.id,
                        description=row['Pages'],
                    ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(common.ValueSentence(
                value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row],
                sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row],
                description=row['Notes'],
            ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)
    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))
    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(
            contribution=data['ApicsContribution'][row['Language ID']],
            contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
def get_rows(args, name):
    for i, row in enumerate(reader(args.data_file('InventoryID-%s.csv' % name))):
        if i and row[1] != 'NA':
            yield row
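# Usage as in the PHOIBLE import above: get_rows streams (InventoryID, value) pairs from
# one of the InventoryID-*.csv files, skipping the header row and 'NA' cells, so a lookup
# table can be built directly:
source_urls = dict(get_rows(args, 'URL'))
ia_urls = dict(get_rows(args, 'InternetArchive'))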
def justifications(args, languages):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    def normalized_pages(s):
        if PAGES_PATTERN.match(s or ""):
            return s or ""

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    for l in DBSession.query(Languoid).filter(Languoid.active == False):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for l in DBSession.query(Languoid).filter(Languoid.active == True):
        langs_by_hname[l.jsondatadict.get("hname")] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    for id_, type_ in [("fc", "family"), ("sc", "subclassification")]:
        for i, row in enumerate(dsv.reader(args.data_file("%s_justifications.tab" % type_))):
            name = row[0]
            name = name.replace("_", " ") if not name.startswith("NOCODE") else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn("ignoring %s" % name)
                continue

            _r = 3 if type_ == "family" else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #

            refs = [(int(m.group("id")), normalized_pages(m.group("comment")))
                    for m in REF_PATTERN.finditer(row[2])]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info("%s %s ++" % (l.id, type_))
                vs = ValueSet(
                    id="%s%s" % (type_, l.id),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first(),
                )
                DBSession.add(Value(
                    id="%s%s" % (type_, l.id),
                    name="%s - %s" % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info("%s %s ~~ description" % (l.id, type_))
                    vs.description = comment

                for r in vs.references:
                    DBSession.delete(r)

            for r, pages in refs:
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)), description=pages))

        args.log.info("%s %s" % (i, type_))
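# REF_PATTERN, WORD_PATTERN and PAGES_PATTERN are module-level constants not shown here.
# A sketch of patterns compatible with the groups used above (the exact reference syntax,
# e.g. '**123**:10-15', is an assumption about the justification files, not the original code):
import re

WORD_PATTERN = re.compile(r'[a-zA-Z]{2,}')
REF_PATTERN = re.compile(r'\*\*(?P<id>\d+)\*\*(?::(?P<comment>[^,;]+))?')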
def get_vs2008(args):  # pragma: no cover
    vs2008 = {}
    for row in reader(args.data_file('datapoints_2008.csv'), delimiter=','):
        vs2008[(row[0], '%sA' % row[1])] = int(row[2])
    return vs2008
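# Illustration of the lookup built above (the codes and numbers are made up for the
# example): keys pair a WALS language code with a feature id, values are the 2008
# domain-element numbers, which prime_cache() below diffs against the current database.
vs2008 = {('eng', '81A'): 2, ('deu', '81A'): 1}
assert vs2008.get(('eng', '81A')) == 2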
def prime_cache(args):
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20',
                  '24', '26', '27', '28']:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()

    transaction.commit()
    transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(joinedload_all(
            common.Language.languageidentifier, common.LanguageIdentifier.identifier)):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}

    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(
        args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))

            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue

        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
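# Illustrative sketch (not part of the original script) of how the Segment
# equivalence_class above is derived: characters whose Unicode name starts with
# COMBINING or MODIFIER are stripped from the phoneme string.
import unicodedata

def _equivalence_class(phoneme):
    return ''.join(
        c for c in phoneme
        if unicodedata.name(c).split()[0] not in ('COMBINING', 'MODIFIER'))

assert _equivalence_class('t\u02b0') == 't'   # aspirated t -> plain t
assert _equivalence_class('e\u0303') == 'e'   # e + combining tilde -> e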
def prime_cache(args):  # pragma: no cover
    """
    We use a versioned session to insert the changes in value assignment.
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19',
                  '20', '24', '26', '27', '28']:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()

    transaction.commit()
    transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
            joinedload_all(common.Language.languageidentifier,
                           common.LanguageIdentifier.identifier)):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
def justifications(args, languages, stats):
    """
    - text goes into ValueSet.description
    - refs go into ValueSetReference objects
    """
    hh_bibkey_to_glottolog_id = {}
    for rec in get_bib(args):
        for provider, bibkeys in get_bibkeys(rec).items():
            if provider == 'hh':
                for bibkey in bibkeys:
                    hh_bibkey_to_glottolog_id[bibkey] = rec['glottolog_ref_id']
                break

    def substitute_hh_bibkeys(m):
        return '**%s**' % hh_bibkey_to_glottolog_id[m.group('bibkey')]

    #
    # create mappings to look up glottolog languoids matching names in justification files
    #
    langs_by_hid = languages
    langs_by_hname = {}
    langs_by_name = {}

    # order by active, so that active languoids overwrite the data of obsolete ones.
    for l in DBSession.query(Languoid).order_by(Languoid.active):
        langs_by_hname[l.jsondata.get('hname')] = l
        langs_by_hid[l.hid] = l
        langs_by_name[l.name] = l

    def normalize_pages(s):
        return (s or '').strip().rstrip(',') or None

    for id_, type_ in [('fc', 'family'), ('sc', 'subclassification')]:
        for i, row in enumerate(dsv.reader(
                args.data_dir.joinpath(
                    'languoids', 'forkel_%s_justifications-utf8.tab' % type_))):
            name = row[0]
            name = name.replace('_', ' ') if not name.startswith('NOCODE') else name
            l = langs_by_hname.get(name, langs_by_hid.get(name, langs_by_name.get(name)))
            if not l:
                args.log.warn('ignoring %s' % name)
                continue

            _r = 3 if type_ == 'family' else 2
            comment = (row[_r].strip() or None) if len(row) > _r else None
            if comment and not WORD_PATTERN.search(comment):
                comment = None
            if comment:
                comment = re.sub(
                    r'\*\*(?P<bibkey>[^\*]+)\*\*', substitute_hh_bibkeys, comment)

            #
            # TODO: look for [NOCODE_ppp] patterns as well!?
            #
            refs = [
                (int(m.group('id')), normalize_pages(m.group('pages')))
                for m in REF_PATTERN.finditer(
                    re.sub(r'\*\*(?P<bibkey>[^\*]+)\*\*', substitute_hh_bibkeys, row[2]))]

            vs = None
            for _vs in l.valuesets:
                if _vs.parameter.id == id_:
                    vs = _vs
                    break

            if not vs:
                args.log.info('%s %s ++' % (l.id, type_))
                vs = ValueSet(
                    id='%s%s' % (id_, l.pk),
                    description=comment,
                    language=l,
                    parameter=Parameter.get(id_),
                    contribution=Contribution.first())
                DBSession.add(Value(
                    id='%s%s' % (id_, l.pk),
                    name='%s - %s' % (l.level, l.status),
                    valueset=vs))
                DBSession.flush()
            else:
                if vs.description != comment:
                    args.log.info('%s %s ~~ description: %s ---> %s' % (
                        l.id, type_, vs.description, comment))
                    vs.description = comment
                    stats.update(['justifications-%s' % type_])

            for r in vs.references:
                DBSession.delete(r)

            for r, pages in refs:
                # FIXME: we must make sure not to link sources which will subsequently
                # be replaced!
                vs.references.append(ValueSetReference(
                    source=Source.get(str(r)),
                    description=pages))

        args.log.info('%s %s' % (i, type_))
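# Illustrative sketch (not part of the original script) of the **bibkey** ->
# **glottolog_ref_id** rewriting done by substitute_hh_bibkeys above; the
# mapping entry and comment text are made-up examples.
import re

_hh_bibkey_to_glottolog_id = {'hh:s:Key:Example': '12345'}

def _substitute(m):
    return '**%s**' % _hh_bibkey_to_glottolog_id[m.group('bibkey')]

_comment = 'See **hh:s:Key:Example** for the classification argument.'
assert re.sub(r'\*\*(?P<bibkey>[^\*]+)\*\*', _substitute, _comment) == \
    'See **12345** for the classification argument.'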
def read(table):
    # Nested helper: relies on an `args` object from the enclosing scope
    # rather than taking it as a parameter.
    return list(dsv.reader(
        args.data_file(table + '.csv'), delimiter=',', namedtuples=True))
def get_tab(name):
    """Generator for entries in the tab file specified by name."""
    return dsv.reader(get(get_taburls()[name]).split('\n'), namedtuples=True)
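# Illustrative sketch (not part of the original script): get_tab above streams a
# remote tab file through dsv.reader; the same idea using only the standard
# library looks roughly like this (the sample payload and columns are made up).
import csv
import io

def _iter_tab(text):
    """Yield rows of a tab-separated string as dicts keyed by the header row."""
    return csv.DictReader(io.StringIO(text), delimiter='\t')

_sample = 'id\tname\nabcd1234\tSome languoid\n'
assert [row['name'] for row in _iter_tab(_sample)] == ['Some languoid']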