def test_glottocodes_by_isocode(mocker, env):
    from clld.scripts.util import glottocodes_by_isocode

    ce = mocker.Mock(return_value=mocker.Mock(
        execute=lambda *args: [('iso', 'abcd1234')]))
    mocker.patch('clld.scripts.util.create_engine', ce)
    assert glottocodes_by_isocode('dburi')['iso'] == 'abcd1234'

    json = """{
        "properties": {
            "dataset": "glottolog",
            "uri_template": "http://glottolog.org/resource/languoid/id/{id}"
        },
        "resources": [
            {
                "id": "aant1238",
                "identifiers": [
                    {"identifier": "tbg-aan", "type": "multitree"}
                ],
                "latitude": null,
                "longitude": null,
                "name": "Aantantara"
            },
            {
                "id": "aari1239",
                "identifiers": [
                    {"identifier": "aiw", "type": "iso639-3"},
                    {"identifier": "aiw", "type": "multitree"}
                ],
                "latitude": 5.95034,
                "longitude": 36.5721,
                "name": "Aari"
            }
        ]
    }"""

    class Req(mocker.Mock):
        def get(self, *args):
            return mocker.Mock(json=mocker.Mock(return_value=loads(json)))

    mocker.patch('clld.scripts.util.requests', Req())
    assert glottocodes_by_isocode(None, cols=['id', 'latitude'])['aiw'][0] == 'aari1239'
def test_glottocodes_by_isocode(self):
    from clld.scripts.util import glottocodes_by_isocode

    ce = Mock(return_value=Mock(execute=lambda *args: [('iso', 'abcd1234')]))
    with patch('clld.scripts.util.create_engine', ce):
        assert glottocodes_by_isocode('dburi')['iso'] == 'abcd1234'

    json = """{
        "properties": {
            "dataset": "glottolog",
            "uri_template": "http://glottolog.org/resource/languoid/id/{id}"
        },
        "resources": [
            {
                "id": "aant1238",
                "identifiers": [
                    {"identifier": "tbg-aan", "type": "multitree"}
                ],
                "latitude": null,
                "longitude": null,
                "name": "Aantantara"
            },
            {
                "id": "aari1239",
                "identifiers": [
                    {"identifier": "aiw", "type": "iso639-3"},
                    {"identifier": "aiw", "type": "multitree"}
                ],
                "latitude": 5.95034,
                "longitude": 36.5721,
                "name": "Aari"
            }
        ]
    }"""

    class Req(Mock):
        def get(self, *args):
            return Mock(json=Mock(return_value=loads(json)))

    with patch('clld.scripts.util.requests', Req()):
        assert glottocodes_by_isocode(
            None, cols=['id', 'latitude'])['aiw'][0] == 'aari1239'
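# A minimal usage sketch of glottocodes_by_isocode, based only on the two tests
# above; the database URI is the placeholder used elsewhere in these scripts and
# the concrete result values are illustrative, not guaranteed:
def _example_glottocode_lookup():
    from clld.scripts.util import glottocodes_by_isocode

    # 1. Called with a database URI, the result maps ISO 639-3 codes to glottocodes.
    codes = glottocodes_by_isocode('postgresql://robert@/glottolog3')
    glottocode = codes.get('aiw')  # e.g. 'aari1239'

    # 2. Called with dburi=None and explicit cols, the languoid data is fetched from
    #    the Glottolog web API instead, and each value is a tuple ordered like cols.
    rows = glottocodes_by_isocode(None, cols=['id', 'latitude'])
    glottocode, latitude = rows['aiw']  # e.g. ('aari1239', 5.95034)
    return codes, rows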
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__,
        name="AUTOTYP",
        description="AUTOTYP",
        domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID  language  ISO639.3.2013  stock  continent  area  latitude  longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(
                models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(
                models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid, l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area)
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(
            open(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)),
                parameter=parameter,
                name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print('--->', row.Language)
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  # '%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
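# The jcld loader above (like the other loaders in this file) follows a common
# get-or-create idiom against the clld Data cache: look the key up under the class
# name and only call data.add() when it is missing. A hedged sketch of that idiom
# as a stand-alone helper (the helper name is hypothetical):
def _get_or_add(data, cls, key, **kw):
    # data['<ClassName>'] is the per-class cache that data.add() fills in these scripts
    if key in data[cls.__name__]:
        return data[cls.__name__][key]
    return data.add(cls, key, **kw)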
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname
        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))

            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
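# The PHOIBLE loader above derives a stable segment id from the Unicode character
# names of the phoneme string: the names are joined with ' - ', hashed with md5 and
# base16-encoded, while COMBINING/MODIFIER characters are dropped from the
# equivalence class. A sketch of that derivation in isolation (hypothetical helper
# name; the encode()/decode() calls are an assumption for Python 3, where
# hashlib.md5 requires bytes):
def _segment_id_and_equivalence_class(phoneme):
    import unicodedata
    from base64 import b16encode
    from hashlib import md5

    unicode_desc = [(c, unicodedata.name(c)) for c in phoneme]
    description = ' - '.join(name for _, name in unicode_desc)
    equivalence_class = ''.join(
        c for c, name in unicode_desc
        if name.split()[0] not in ('COMBINING', 'MODIFIER'))
    segment_id = b16encode(md5(description.encode('utf8')).digest()).decode('ascii')
    return segment_id, equivalence_class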
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    glottolog = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        try:
            add_language_codes(
                data, lang, lang.id.split('-')[0], glottolog, glottocode=row[2] or None)
        except:
            print(row)
            raise
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
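# image_url() above derives thumbnail/web variants of an 'original' media URL by
# swapping the file extension to .jpg and the '/original/' path segment. A tiny
# self-contained check of that behaviour; the URL is a made-up example following
# the path convention the loader assumes:
def _check_image_url():
    import re

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    url = 'https://edmond.mpdl.mpg.de/imeji/original/pic.tiff'
    assert image_url(url, 'thumbnail') == 'https://edmond.mpdl.mpg.de/imeji/thumbnail/pic.jpg'
    assert image_url(url, 'web') == 'https://edmond.mpdl.mpg.de/imeji/web/pic.jpg'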
def main(args):
    meta = parse_meta(args)
    print(len(meta))
    print(sum(len(m.sources) for m in meta.values()))
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    print(len(sources), 'distinct')
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()
    wals = create_engine('postgresql://robert@/wals3')
    wals_families = {}
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]

    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]

    wals_genera = {row[0]: row[0] for row in wals.execute('select id from genus')}

    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]

    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in ['SW', 'CB', 'EH']:
            DBSession.add(
                common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)

    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning, id_,
            id=str(id_), name=models.MEANINGS_ALL[id_], core=id_ in models.MEANINGS)

    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        DBSession.flush()

        md = meta.pop(lang.id, None)
        assert md

        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(language_pk=lang.pk, source_pk=sources[source].pk))

    assert not list(meta.keys())
def main(args):
    meta = parse_meta(args)
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()
    wals = create_engine('postgresql://robert@/wals3')
    wals_families = {}
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]

    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]

    wals_genera = {row[0]: row[0] for row in wals.execute('select id from genus')}

    with args.data_file('listss18.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]

    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in ['SW', 'EH', 'CB']:
            DBSession.add(common.Editor(
                dataset=dataset, ord=i + 1, contributor=contributor))

    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)

    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning, id_,
            id=str(id_), name=models.MEANINGS_ALL[id_], core=id_ in models.MEANINGS)

    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        DBSession.flush()

        md = meta.pop(lang.id, None)
        assert md

        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(language_pk=lang.pk, source_pk=sources[source].pk))

    print(list(meta.keys()))
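# Both ASJP loaders above map a Doculect's WALS classification string of the form
# "<family>.<genus>" onto ids from the wals3 database. A hedged sketch of that
# lookup in isolation (hypothetical helper name; the import path for slug is an
# assumption, and the two dicts stand in for the ones built from the family and
# genus tables above):
def _wals_classification(classification_wals, wals_families, wals_genera):
    from clldutils.misc import slug  # assumed import; the loaders use a module-level slug

    if not classification_wals:
        return None, None
    family, genus = classification_wals.split('.')
    # family names and family ids are both registered as keys when wals_families is
    # built above, so either spelling resolves; genera are keyed by slugified id
    return wals_families.get(family), wals_genera.get(slug(genus))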
def main(args):
    sources = get_sources(args)
    Index('ducet1', collkey(common.Value.name)).create(DBSession.bind)
    Index('ducet2', collkey(models.Counterpart.phonetic)).create(DBSession.bind)
    data = Data()

    glottocodes, geocoords = {}, defaultdict(lambda: (None, None))
    for k, v in glottocodes_by_isocode(
            'postgresql://robert@/glottolog3',
            cols=['id', 'latitude', 'longitude']).items():
        glottocodes[k] = v[0]
        geocoords[k] = (v[1], v[2])
    geocoords['win'] = (43.50, -88.50)

    dataset = common.Dataset(
        id=csd.__name__,
        name="Comparative Siouan Dictionary",
        description="Comparative Siouan Dictionary",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='csd.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    contrib = common.Contribution(id='csd', name=dataset.name)

    for i, spec in enumerate([
        ('Robert L. Rankin', True),
        ('Richard T. Carter', True),
        ('A. Wesley Jones', True),
        ('John E. Koontz', True),
        ('David S. Rood', True),
        ('Iren Hartmann', True),
    ]):
        name, primary = spec
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i, primary=primary))

    d = Dictionary(
        args.data_file(TXT),
        entry_impl=CsdEntry,
        entry_sep='\\lx ')
    d.entries = list(filter(lambda r: r.get('lx'), d.entries))[1:]
    print(len(d.entries))

    for i, v in enumerate(_LANGUAGES):
        l = data.add(
            models.Languoid, v[0],
            id=v[0],
            name=v[1],
            ord=i,
            color=v[4].lower(),
            proto=v[0].startswith('p') and len(v[0]) == 3,
            latitude=geocoords[v[2]][0],
            longitude=geocoords[v[2]][1],
            parent=data['Languoid'].get(v[5]))
        if v[2]:
            add_language_codes(data, l, v[2], glottocodes=glottocodes)
        if l.id == 'pn':
            l.latitude, l.longitude = (42.75, -98.03)
        if l.id == 'op':
            l.latitude, l.longitude = (43.5, -96.6)
        if l.id == 'mo':
            l.latitude, l.longitude = (40.05, -95.52)

    pnames = set()

    def _get(d, marker):
        _l = set(nfilter(d.get(marker, [])))
        if _l:
            _l = list(_l)
            if marker not in ['oo', 'or']:
                assert len(_l) == 1
                _l = _l[0]
        return _l

    def add_counterpart(d, vs, id, phonetic,  # forms
                        cognate,  # oo
                        me, cm, so, org):
        assert phonetic or cognate
        if not cognate:
            if vs.language.proto:
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or [])
                if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
                match = SID_PATTERN.match(sid)
                if not match:
                    continue
                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))

    for i, entry in enumerate(sorted(d.entries, key=lambda d: d.get('lx'), reverse=True)):
        lemma = entry.get('lx')
        if not lemma or not lemma.strip():
            continue
        pname = lemma
        j = 1
        while pname in pnames:
            pname = '%s (%s)' % (lemma, j)
            j += 1
        pnames.add(pname)

        contrib = data.add(
            common.Contribution, pname, id=str(i + 1), name='Entry "%s"' % pname)
        meaning = data.add(
            models.Entry, pname,
            id=str(i + 1),
            name=pname,
            contribution=contrib,
            description=entry.get('com'),
            psi_reconstruction_with_root_extension_code=entry.get('lxcm'),
            sd=normalize_comma_separated(entry.get('sd'), SD, lower=True),
            ps=normalize_comma_separated(entry.get('ps'), PS),
            othlgs='\n---\n'.join(entry.getall('othlgs')))
        if meaning.description:
            meaning.description = meaning.description.replace('.\n', '.\n\n')

        for lid, words in entry.get_words().items():
            vsid = '%s-%s' % (lid, meaning.id)
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                parameter=meaning,
                contribution=contrib,
                language=data['Languoid'][lid])
            for j, d in enumerate(words):
                looped = False
                for k, (oo, me, so, cm, org) in enumerate(izip_longest(
                        *[d.get(_m, []) for _m in 'oo me so cm _org'.split()])):
                    if not oo:
                        continue
                    looped = True
                    add_counterpart(
                        d, vs, '%s-%s-%s' % (vsid, j + 1, k + 1),
                        d['forms'], oo, me, cm, so, org)
                if not looped:  # not oo
                    if not d['forms']:
                        print('--->', d)
                        continue
                    add_counterpart(
                        d, vs, '%s-%s-%s' % (vsid, j + 1, 1),
                        d['forms'],
                        '; '.join(_get(d, 'oo') or []),
                        _get(d, 'me'),
                        _get(d, 'cm'),
                        _get(d, 'so'),
                        _get(d, '_org'))
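# The entry loop above disambiguates duplicate lemmas by appending a running
# counter, yielding "lemma", "lemma (1)", "lemma (2)", ... A tiny self-contained
# sketch of that naming scheme (hypothetical helper name, same logic as the
# while-loop above):
def _unique_pname(lemma, pnames):
    pname, j = lemma, 1
    while pname in pnames:
        pname = '%s (%s)' % (lemma, j)
        j += 1
    pnames.add(pname)
    return pname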