def main(args):
    data = Data()
    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description="Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )
    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(
        common.Contributor, "Kevin Scannell",
        id="Kevin Scannell", name="Kevin Scannell", email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
def main(args):
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    for i, row in enumerate(reader(
            open(args.data_file('fb_jcld.tab')), namedtuples=True, encoding='latin1')):
        if row.Feature not in data['Parameter']:
            parameter = data.add(common.Parameter, row.Feature, id='1', name=row.Feature)
        else:
            parameter = data['Parameter'][row.Feature]

        if row.Value not in data['DomainElement']:
            de = data.add(
                common.DomainElement, row.Value,
                id='%s-%s' % (parameter.id, slug(row.Value)),
                parameter=parameter,
                name=row.Value)
        else:
            de = data['DomainElement'][row.Value]

        if row.Language not in data['Language']:
            if row.Language not in glottolog:
                print('--->', row.Language)
                continue
            glottocode, name, lat, lon = glottolog[row.Language]
            language = data.add(
                common.Language, row.Language,
                id=slug(row.Language), name=name, latitude=lat, longitude=lon)
        else:
            language = data['Language'][row.Language]

        id_ = str(i + 1)  # '%s-%s' % (parameter.id, language.id)
        vs = common.ValueSet(
            id=id_,
            parameter=parameter,
            language=language,
            contribution=contribution,
            description=row.Comment,
            source=row.Source)
        common.Value(valueset=vs, name=row.Value, domainelement=de)
def test_add_language_codes(env):
    from clld.db.models.common import Language
    from clld.scripts.util import Data, add_language_codes

    add_language_codes(Data(), Language(), 'iso', glottocodes=dict(iso='glot1234'))
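# Note: the helper exercised above is used throughout the loader scripts in this
# collection to attach ISO 639-3 and Glottolog identifiers to a language object.
# The following is a minimal, hypothetical sketch of the two call patterns that
# actually occur below (each taken from one of the loaders, which may target
# different clld versions); `_sketch_codes`, `data` and `lang` are placeholders,
# and the 'ket'/'kett1243' codes are copied from the CDK loader:
def _sketch_codes(data, lang):
    from clld.scripts.util import add_language_codes

    # variant 1: resolve the glottocode from an {iso: glottocode} mapping
    add_language_codes(data, lang, 'ket', glottocodes={'ket': 'kett1243'})
    # variant 2: pass a single glottocode directly
    add_language_codes(data, lang, 'ket', glottocode='kett1243')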
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cognition.__name__,
        name="COSTATOL",
        description="Cognitive Structures across the Tree of Life",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cognition.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    for rec in Database.from_file(args.data_file('sources.bib')):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    contrib = common.Contribution(id='costatol', name='COSTATOL')
    for datapoint in reader(args.data_file('values.csv'), delimiter=',', dicts=True):
        param = data['Parameter'].get(datapoint['cognitive capacity'])
        if not param:
            name = datapoint['cognitive capacity']
            param = data.add(common.Parameter, name, id=slug(name), name=name)

        species = data['Language'].get(datapoint['species'])
        if not species:
            name = datapoint['species']
            species = data.add(common.Language, name, id=slug(name), name=name)

        vid = '%s-%s' % (species.id, param.id)
        vs = data.add(
            common.ValueSet, vid,
            id=vid,
            language=species,
            parameter=param,
            contribution=contrib)
        data.add(common.Value, vid, id=vid, name=datapoint['value'], valueset=vs)

        match = source_pattern.match(datapoint['source'])
        if match:
            DBSession.add(common.ValueSetReference(
                valueset=vs,
                source=data['Source'][match.group('key')],
                description=match.group('pages')))

    for species in reader(args.data_file('species.csv'), delimiter=',', namedtuples=True):
        data['Language'][species.name].longitude = species.longitude
        data['Language'][species.name].latitude = species.latitude
def main(args):
    data = Data()
    data.add(
        common.Dataset, "starling",
        id="starling",
        name="Starling",
        domain="starling.rinet.ru",
        description="The Global Lexicostatistical Database",
        publisher_name="Russian State University for the Humanities, Moscow")
    data.add(common.Contribution, "starling", name="Starling", id="starling")

    def row_to_dict(row_entry):
        swadesh_id_idx, swadesh_word_idx, form_idx, cognation_idx, notes_idx = range(0, 5)
        return {
            "swadesh_id": row_entry[swadesh_id_idx].value,
            "swadesh_word": row_entry[swadesh_word_idx].value,
            "form": row_entry[form_idx].value,
            "cognation_index": row_entry[cognation_idx].value,
            "notes": row_entry[notes_idx].value,
        }

    data_dir = "./gld/scripts/data/"
    for path in os.listdir(data_dir):
        data_file_path = os.path.join(data_dir, path)
        book = load_workbook(data_file_path)
        sheet = book.active
        lang_name = sheet["C1"].value
        data.add(
            common.Language, lang_name,
            id=lang_name,
            name=lang_name,
            latitude=52.0,
            longitude=0.0)

        fields = ["swadesh_id", "swadesh_word", "form", "cognation_index", "notes"]
        for row in sheet.iter_rows(min_row=2, min_col=1):
            row_data = row_to_dict(row)
            w = data.add(
                Word,
                "%s_%s" % (row_data["swadesh_id"], row_data["form"]),
                name=row_data["form"],
                description="Description",
                jsondata={k: row_data[k] for k in fields})
            w.language = data["Language"][lang_name]

    DBSession.flush()
def main(args):
    #TODO explain etc diachronic_strength
    #sigtests of dependencies
    #isogloss-maps
    data = Data()

    dataset = common.Dataset(
        id=grambank.__name__,
        name="Grambank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    glottolog = Glottolog(GLOTTOLOG_REPOS)
    languoids = {l.id: l for l in glottolog.languoids()}

    import_gb20_features(GRAMBANK_REPOS, data)
    import_cldf(os.path.join(GRAMBANK_REPOS, 'datasets'), data, languoids)
    load_families(
        data,
        data['GrambankLanguage'].values(),
        glottolog=languoids,
        isolates_icon='tcccccc')

    # Add isolates
    for lg in data['GrambankLanguage'].values():
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family, gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    return
def test_Data(mocker):
    from clld.db.models.common import Language
    from clld.scripts.util import Data

    session = set()
    mocker.patch('clld.scripts.util.DBSession', session)
    d = Data(jsondata={})
    d.add(Language, 'l', id='l', name='l')
    assert session
    d.add(Language, 'l2', _obj=5)
    with pytest.raises(ValueError):
        d.add(Language, 'l3', id='l.3')
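# The behaviour exercised by the test above is the core pattern of every loader in this
# collection: `Data` is an in-memory index of the ORM objects created so far, keyed by
# model name and a caller-chosen key, and `Data.add` also registers each new object with
# `DBSession`. A minimal sketch (the function name and ids are illustrative only):
def _sketch_data_usage():
    from clld.db.models.common import Language
    from clld.scripts.util import Data

    data = Data()
    # create the object, register it with DBSession, and cache it under
    # data['Language']['deu'] for later lookups in the same import run
    lang = data.add(Language, 'deu', id='deu', name='German')
    assert data['Language']['deu'] is lang
    # pre-built objects can be registered via `_obj=`, and ids containing '.' are
    # rejected with a ValueError -- the two remaining assertions in the test above.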
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)

    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'),
        ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket',
        name='Ket',
        latitude=63.76,
        longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except:
                print(chunk)
                raise
            data.add(
                common.Source, id_,
                id=str(i + 1),
                name=id_,
                author=author,
                year=year,
                description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing examples problematic in %s cases' % len(PROBLEMS))
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog_repos = Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
        'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=moslex.__name__,
        domain='moslex.clld.org',
        name='MosLex',
        license='https://creativecommons.org/licenses/by-nc/4.0/',
        jsondata={
            'license_icon': 'cc-by-nc.png',
            'license_name': 'Creative Commons Attribution-NC 4.0 International License'})

    editor = data.add(common.Contributor, 'editor', id='editor', name='Alexei Kassian')
    common.Editor(dataset=dataset, contributor=editor)

    with open('languoids.json') as file:
        languoids = json.load(file)
    with open('concepts.json') as file:
        concepts = json.load(file)
    with open('forms.json') as file:
        forms = json.load(file)

    add_languoids(data, languoids)
    add_concepts(data, concepts)
    add_forms(data, forms)

    DBSession.add(dataset)
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=moslex.__name__,
        domain='moslex.clld.org',
        name='MosLex',
        license='https://creativecommons.org/licenses/by-nc/4.0/',
        jsondata={
            'license_icon': 'cc-by-nc.png',
            'license_name': 'Creative Commons Attribution-NC 4.0 International License'})

    editor = data.add(
        common.Contributor, 'editor',
        id='editor', name='Alexei Kassian', email='*****@*****.**')
    common.Editor(dataset=dataset, contributor=editor)

    with open(os.environ['MOSLEX_CONCEPTS']) as file:
        concepts = json.load(file)
    add_concepts(data, concepts)

    data_folders = [
        path for path in glob.glob(os.path.join(os.environ['MOSLEX_DATA'], '*'))
        if os.path.isdir(path)
    ]
    for folder in data_folders:
        add_data_folder(folder, data)

    DBSession.add(dataset)
def test_Data(self):
    from clld.db.models.common import Language
    from clld.scripts.util import Data

    session = set()
    with patch('clld.scripts.util.DBSession', session):
        d = Data(jsondata={})
        d.add(Language, 'l', id='l', name='l')
        assert session
        d.add(Language, 'l2', _obj=5)
        self.assertRaises(ValueError, d.add, Language, 'l3', id='l.3')
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID  language  ISO639.3.2013  stock  continent  area  latitude  longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid, l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate['Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))
def test_load_families(self):
    from clld_glottologfamily_plugin.util import load_families

    class Languoid(object):
        id = 'abcd1234'
        iso_code = 'abc'
        name = 'language'
        latitude = 1.0
        longitude = 1.0
        macroareas = ['Area']

        @property
        def family(self):
            return self

    class Glottolog(object):
        def languoid(self, code):
            return Languoid()

    load_families(Data(), DBSession.query(LanguageWithFamily), glottolog=Glottolog())
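# The test above stubs out Glottolog; in the loader scripts of this collection the same
# helper is fed real languoid data in one of two ways: the grambank loader passes the
# language objects together with a {glottocode: languoid} mapping (`glottolog=languoids`),
# while the parabank loader passes (glottocode, language) pairs plus a path to a Glottolog
# checkout (`glottolog_repos=...`). A hedged sketch of the first variant, mirroring the
# grambank call (`_sketch_load_families` and its arguments are placeholders):
def _sketch_load_families(data, languoids):
    from clld_glottologfamily_plugin.util import load_families

    load_families(
        data,
        data['GrambankLanguage'].values(),
        glottolog=languoids,
        isolates_icon='tcccccc')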
def main(args): meta = parse_meta(args) sources = {} for m in meta.values(): for s in m.sources: sources[s] = None for i, s in enumerate(sources): sources[s] = get_source(s, i + 1) glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3') data = Data() wals = create_engine('postgresql://robert@/wals3') wals_families = {} for row in wals.execute('select name, id from family'): wals_families[row[0]] = row[1] wals_families[row[1]] = row[1] #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'): # name = item.FAMILY # if name not in wals_families: # name = slug(name) # if name not in wals_families: # print('missing wals family:', item.FAMILY) # name = None # if name: # wals_families[item.ABBREVIATION] = wals_families[name] wals_genera = {row[0]: row[0] for row in wals.execute('select id from genus')} with args.data_file('listss18.txt').open(encoding='latin1') as fp: wordlists = ['\n'.join(lines) for lines in parse(fp)] dataset = common.Dataset( id=asjp.__name__, name="The ASJP Database", contact="*****@*****.**", description="The Automated Similarity Judgment Program", domain='asjp.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.shh.mpg.de", license='http://creativecommons.org/licenses/by/4.0/', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) transcribers = get_transcriber_map(args) for i, spec in enumerate([ ('SW', "Søren Wichmann"), ('AM', "André Müller"), ('AKW', "Annkathrin Wett"), ('VV', "Viveka Velupillai"), ('JB', "Julia Bischoffberger"), ('CB', "Cecil H. Brown"), ('EH', "Eric W. Holman"), ('SS', "Sebastian Sauppe"), ('ZM', "Zarina Molochieva"), ('PB', "Pamela Brown"), ('HH', "Harald Hammarström"), ('OB', "Oleg Belyaev"), ('JML', "Johann-Mattis List"), ('DBA', "Dik Bakker"), ('DE', "Dmitry Egorov"), ('MU', "Matthias Urban"), ('RM', "Robert Mailhammer"), ('AC', "Agustina Carrizo"), ('MSD', "Matthew S. Dryer"), ('EK', "Evgenia Korovina"), ('DB', "David Beck"), ('HG', "Helen Geyer"), ('PE', "Patience Epps"), ('AG', "Anthony Grant"), ('PS', "Paul Sidwell"), # not in citation ('KTR', "K. 
Taraka Rama"), # not in citation ('PV', "Pilar Valenzuela"), ('MD', "Mark Donohue"), # not in citation ]): id_, name = spec if id_ in transcribers: assert name == transcribers.pop(id_) contributor = data.add(common.Contributor, id_, id=id_, name=name) if id_ in ['SW', 'EH', 'CB']: DBSession.add(common.Editor( dataset=dataset, ord=i + 1, contributor=contributor)) for id_, name in transcribers.items(): data.add(common.Contributor, id_, id=id_, name=name) for id_ in sorted(models.MEANINGS_ALL.keys()): data.add( models.Meaning, id_, id=str(id_), name=models.MEANINGS_ALL[id_], core=id_ in models.MEANINGS) for n, l in enumerate(wordlists): #if n > 100: # break lang = models.Doculect.from_txt(l) if lang.classification_wals: family, genus = lang.classification_wals.split('.') lang.wals_family = wals_families.get(family) lang.wals_genus = wals_genera.get(slug(genus)) lang.code_glottolog = glottocodes.get(lang.code_iso) add_codes(lang) data.add(models.Doculect, lang.id, _obj=lang) DBSession.flush() md = meta.pop(lang.id, None) assert md # associate transcribers and sources for i, transcriber in enumerate(md.transcribers): common.ContributionContributor( contribution=lang.wordlist, contributor=data['Contributor'][transcriber], ord=i + 1) for source in md.sources: DBSession.add( common.LanguageSource(language_pk=lang.pk, source_pk=sources[source].pk)) print(list(meta.keys()))
def load(args): glottolog = args.repos fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") version = assert_release(glottolog.repos) dataset = common.Dataset( id='glottolog', name="{0} {1}".format(glottolog.publication.web.name, version), publisher_name=glottolog.publication.publisher.name, publisher_place=glottolog.publication.publisher.place, publisher_url=glottolog.publication.publisher.url, license=glottolog.publication.license.url, domain=purl.URL(glottolog.publication.web.url).domain(), contact=glottolog.publication.web.contact, jsondata={'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name}, ) data = Data() for e in glottolog.editors.values(): if e.current: ed = data.add(common.Contributor, e.id, id=e.id, name=e.name) common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord)) DBSession.add(dataset) contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog') DBSession.add(common.ContributionContributor( contribution=contrib, contributor=data['Contributor']['hammarstroem'])) # # Add Parameters: # add = functools.partial(add_parameter, data) add('fc', name='Family classification') add('sc', name='Subclassification') add('aes', args.repos.aes_status.values(), name=args.repos.aes_status.__defaults__['name'], pkw=dict( jsondata=dict( reference_id=args.repos.aes_status.__defaults__['reference_id'], sources=[attr.asdict(v) for v in args.repos.aes_sources.values()], scale=[attr.asdict(v) for v in args.repos.aes_status.values()])), dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)), ) add('med', args.repos.med_types.values(), name='Most Extensive Description', dekw=lambda de: dict( name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)), ) add('macroarea', args.repos.macroareas.values(), pkw=dict( description=args.repos.macroareas.__defaults__['description'], jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])), dekw=lambda de: dict( name=de.name, description=de.description, jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)), ), ) add('ltype', args.repos.language_types.values(), name='Language Type', dekw=lambda de: dict(name=de.category, description=de.description), delookup='category', ) add('country', args.repos.countries, dekw=lambda de: dict(name=de.id, description=de.name), ) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) # # Now load languoid data, keeping track of relations that can only be inserted later. # lgsources = defaultdict(list) # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level # nodes will precede nested nodes. 
This order must be preserved using an `OrderedDict`: nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()]) lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()} for lang in nodemap.values(): for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(glottolog, data, lang, nodemap) for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: mas = [] for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma) mas.append(ma.name) ref.macroareas = ', '.join(mas)
def main(args): # determine if we run on a machine where other databases are available for lookup # locally: data = Data() genera = get_genera(data) if astroman else {} glottocodes, lnames, geocoords = {}, {}, {} if astroman: for k, v in glottocodes_by_isocode( 'postgresql://robert@/glottolog3', cols=['id', 'name', 'latitude', 'longitude']).items(): glottocodes[k] = v[0] lnames[k] = v[1] geocoords[k] = (v[2], v[3]) refs = defaultdict(list) for row in get_rows(args, 'BibtexKey'): if row[1] == 'NO SOURCE GIVEN': refs[row[0]] = [] else: refs[row[0]].append(row[1]) add_sources(args, data) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE Online', description='PHOIBLE Online', publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", domain='phoible.org', license='http://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'}) for i, spec in enumerate([ ('moran', "Steven Moran"), ('mccloy', "Daniel McCloy"), ('wright', "Richard Wright"), ]): DBSession.add(common.Editor( dataset=dataset, ord=i + 1, contributor=common.Contributor(id=spec[0], name=spec[1]))) squibs = defaultdict(list) for row in get_rows(args, 'Squib'): squibs[row[0]].append(row[1]) source_urls = dict(get_rows(args, 'URL')) ia_urls = dict(get_rows(args, 'InternetArchive')) aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True)) inventory_names = {} for key, items in groupby( sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)), key=lambda t: (t.LanguageCode, t.Source)): items = list(items) lname = lnames.get(key[0]) if not lname: lname = items[0].LanguageName lnames[key[0]] = lname if len(items) == 1: inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1]) else: for i, item in enumerate(items): inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1]) family_map = { ("Arawakan", "arwk"): "Arawakan", ("Trans-New Guinea", "trng"): "Trans-New Guinea", ("Moklen", "anes"): "Austronesian", ("Oko", "ncon"): "Niger-Congo", ("Muniche", "saso"): "Muniche", ("Tinigua", "saso"): "Tinigua", ("Vilela", "luvi"): "Vilela", ("Ofayé", "macg"): "Kamakanan", ("Purian", "macg"): "PurianPrint", ("Mixed language", "saml"): "Mixed language", ("Tupian", "tupi"): "Tupian", ("Yuwana", "saun"): "YuwanaPrint", } family_code_map = {k[1]: v for k, v in family_map.items()} for row in aggregated: lang = data['Variety'].get(row.LanguageCode) if not lang: if row.LanguageFamilyGenus == 'UNCLASSIFIED': genus = None else: genus_id = slug(strip_quotes(row.LanguageFamilyGenus)) genus = genera.get(genus_id) if not genus: genus = genera.get(row.LanguageCode) if not genus: #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot) family = family_map.get( (row.LanguageFamilyGenus, row.LanguageFamilyRoot)) genus = genera[genus_id] = data.add( models.Genus, genus_id, id=genus_id, name=row.LanguageFamilyGenus, description=family or row.LanguageFamilyRoot, active=False, root=row.LanguageFamilyRoot) if not genus.root: genus.root = row.LanguageFamilyRoot if genus.description in family_code_map: genus.description = family_code_map[genus.description] if row.LanguageCode in geocoords: coords = geocoords[row.LanguageCode] elif row.Latitude != 'NULL' and row.Longitude != 'NULL': coords = (float(row.Latitude), float(row.Longitude)) lang = data.add( 
models.Variety, row.LanguageCode, id=row.LanguageCode, name=lnames[row.LanguageCode], genus=genus, country=strip_quotes(row.Country), area=strip_quotes(row.Area), latitude=coords[0], longitude=coords[1], jsondata=dict(inventory_id=row.InventoryID)) add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes) contributor = data['Contributor'].get(row.Source) if not contributor: contributor = data.add( common.Contributor, row.Source, id=row.Source, name=SOURCES[row.Source][0], description=SOURCES[row.Source][2]) for ref in SOURCES[row.Source][1]: DBSession.add(models.ContributorReference( source=data['Source'][ref], contributor=contributor)) contrib = data.add( models.Inventory, row.InventoryID, id=row.InventoryID, language=lang, source=row.Source, source_url=source_urls.get(row.InventoryID), internetarchive_url=ia_urls.get(row.InventoryID), name=inventory_names[row.InventoryID], description=row.LanguageName) DBSession.add(common.ContributionContributor( contribution=contrib, contributor=contributor)) for j, squib in enumerate(squibs.get(row.InventoryID, [])): f = common.Contribution_files( object=contrib, id='squib-%s-%s.pdf' % (contrib.id, j + 1), name='Phonological squib', description=squib, mime_type='application/pdf') assert f # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read()) DBSession.flush() unknown_refs = {} for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True): inventory = data['Inventory'][row.InventoryID] segment = data['Segment'].get(row.Phoneme) if not segment: unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme] description = ' - '.join([t[1] for t in unicode_desc]) segment = data.add( models.Segment, row.Phoneme, id=b16encode(md5(description).digest()), name=row.Phoneme, description=description, equivalence_class=''.join( [t[0] for t in unicode_desc if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]), segment_class=row.Class, combined_class=row.CombinedClass) DBSession.flush() vs = common.ValueSet( id=row.PhonemeID, contribution=inventory, language=inventory.language, parameter=segment) for ref in refs.get(row.InventoryID, []): if ref not in data['Source']: if ref not in unknown_refs: print('-------', ref) unknown_refs[ref] = 1 continue DBSession.add(common.ValueSetReference( source=data['Source'][ref], valueset=vs)) DBSession.add(common.Value( id=row.PhonemeID, name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name), valueset=vs)) DBSession.flush() for inventory_id in refs: for ref in refs[inventory_id]: if ref not in data['Source']: continue data.add( common.ContributionReference, '%s-%s' % (inventory_id, ref), source=data['Source'][ref], contribution=data['Inventory'][inventory_id]) for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))): if i == 0: features = list(map(feature_name, row)) continue if row[0] not in data['Segment']: # print('skipping feature vector:', row) continue for j, value in enumerate(row): if j and value != '0': DBSession.add(common.Parameter_data( key=features[j], value=value, ord=j, object_pk=data['Segment'][row[0]].pk)) DBSession.flush()
def main(args): if DBSession.bind.dialect.name == 'postgresql': Index('ducet', collkey(common.Value.name)).create(DBSession.bind) def data_file(*comps): return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps) data = Data() data.add( common.Dataset, 'tsammalex', id="tsammalex", name="Tsammalex", description="Tsammalex: A lexical database on plants and animals", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", domain='tsammalex.clld.org', license='http://creativecommons.org/licenses/by/4.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex") for rec in Database.from_file(data_file('sources.bib'), lowercase=True): data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec)) load_ecoregions(data_file, data) load_countries(data) second_languages = {} def languoid_visitor(lang, row, _): add_language_codes( data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None) second_languages[row[0]] = row[8] def habitat_visitor(cat, *_): cat.is_habitat = True def taxon_visitor(auto, taxon, *_): if auto.get(taxon.id): update_taxon_data(taxon, auto[taxon.id], data) else: print('--> missing in taxa.json:', taxon.id, taxon.name) taxon.countries_str = ' '.join([e.id for e in taxon.countries]) taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions]) auto = {s['id']: s for s in jsonload(data_file('taxa.json'))} for model, kw in [ (models.Lineage, {}), (models.Use, {}), (models.TsammalexContributor, {}), (models.Languoid, dict(visitor=languoid_visitor)), (models.Category, dict(name='categories')), (models.Category, dict(name='habitats', visitor=habitat_visitor)), (models.Taxon, dict(visitor=partial(taxon_visitor, auto))), (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])), ]: from_csv(data_file, model, data, **kw) for key, ids in second_languages.items(): target = data['Languoid'][key] for lid in models.split_ids(ids): if lid in data['Languoid']: # we ignore 2nd languages which are not yet in Tsammalex. target.second_languages.append(data['Languoid'][lid]) def image_url(source_url, type_): return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace( '/original/', '/%s/' % type_) for fname in data_files(data_file, 'images.csv'): for image in reader(fname, namedtuples=True, delimiter=","): if image.taxa__id not in data['Taxon']: continue url = URL(image.source_url) if url.host() != 'edmond.mpdl.mpg.de': continue jsondata = dict( url=image.source_url, thumbnail=image_url(image.source_url, 'thumbnail'), web=image_url(image.source_url, 'web')) f = common.Parameter_files( object=data['Taxon'][image.taxa__id], id=image.id, name=image.tags, jsondata=jsondata, mime_type=image.mime_type) for k in 'source creator date place comments permission'.split(): v = getattr(image, k) if v: models.ImageData(key=k, value=v, image=f)
def main(args): # determine if we run on a machine where other databases are available for lookup # locally: data = Data() genera = get_genera(data) if astroman else {} glottocodes, lnames, geocoords = {}, {}, {} if astroman: for k, v in glottocodes_by_isocode( 'postgresql://robert@/glottolog3', cols=['id', 'name', 'latitude', 'longitude']).items(): glottocodes[k] = v[0] lnames[k] = v[1] geocoords[k] = (v[2], v[3]) refs = defaultdict(list) for row in get_rows(args, 'BibtexKey'): if row[1] == 'NO SOURCE GIVEN': refs[row[0]] = [] else: refs[row[0]].append(row[1]) add_sources(args, data) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE Online', description='PHOIBLE Online', publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", domain='phoible.org', license='http://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, spec in enumerate([ ('moran', "Steven Moran"), ('mccloy', "Daniel McCloy"), ('wright', "Richard Wright"), ]): DBSession.add( common.Editor(dataset=dataset, ord=i + 1, contributor=common.Contributor(id=spec[0], name=spec[1]))) squibs = defaultdict(list) for row in get_rows(args, 'Squib'): squibs[row[0]].append(row[1]) source_urls = dict(get_rows(args, 'URL')) ia_urls = dict(get_rows(args, 'InternetArchive')) aggregated = list( reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True)) inventory_names = {} for key, items in groupby(sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)), key=lambda t: (t.LanguageCode, t.Source)): items = list(items) lname = lnames.get(key[0]) if not lname: lname = items[0].LanguageName lnames[key[0]] = lname if len(items) == 1: inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1]) else: for i, item in enumerate(items): inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1]) family_map = { ("Arawakan", "arwk"): "Arawakan", ("Trans-New Guinea", "trng"): "Trans-New Guinea", ("Moklen", "anes"): "Austronesian", ("Oko", "ncon"): "Niger-Congo", ("Muniche", "saso"): "Muniche", ("Tinigua", "saso"): "Tinigua", ("Vilela", "luvi"): "Vilela", ("Ofayé", "macg"): "Kamakanan", ("Purian", "macg"): "PurianPrint", ("Mixed language", "saml"): "Mixed language", ("Tupian", "tupi"): "Tupian", ("Yuwana", "saun"): "YuwanaPrint", } family_code_map = {k[1]: v for k, v in family_map.items()} for row in aggregated: lang = data['Variety'].get(row.LanguageCode) if not lang: if row.LanguageFamilyGenus == 'UNCLASSIFIED': genus = None else: genus_id = slug(strip_quotes(row.LanguageFamilyGenus)) genus = genera.get(genus_id) if not genus: genus = genera.get(row.LanguageCode) if not genus: #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot) family = family_map.get( (row.LanguageFamilyGenus, row.LanguageFamilyRoot)) genus = genera[genus_id] = data.add( models.Genus, genus_id, id=genus_id, name=row.LanguageFamilyGenus, description=family or row.LanguageFamilyRoot, active=False, root=row.LanguageFamilyRoot) if not genus.root: genus.root = row.LanguageFamilyRoot if genus.description in family_code_map: genus.description = family_code_map[genus.description] if row.LanguageCode in geocoords: coords = geocoords[row.LanguageCode] elif row.Latitude != 'NULL' and row.Longitude != 'NULL': coords = (float(row.Latitude), float(row.Longitude)) lang = 
data.add(models.Variety, row.LanguageCode, id=row.LanguageCode, name=lnames[row.LanguageCode], genus=genus, country=strip_quotes(row.Country), area=strip_quotes(row.Area), latitude=coords[0], longitude=coords[1], jsondata=dict(inventory_id=row.InventoryID)) add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes) contributor = data['Contributor'].get(row.Source) if not contributor: contributor = data.add(common.Contributor, row.Source, id=row.Source, name=SOURCES[row.Source][0], description=SOURCES[row.Source][2]) for ref in SOURCES[row.Source][1]: DBSession.add( models.ContributorReference(source=data['Source'][ref], contributor=contributor)) contrib = data.add(models.Inventory, row.InventoryID, id=row.InventoryID, language=lang, source=row.Source, source_url=source_urls.get(row.InventoryID), internetarchive_url=ia_urls.get(row.InventoryID), name=inventory_names[row.InventoryID], description=row.LanguageName) DBSession.add( common.ContributionContributor(contribution=contrib, contributor=contributor)) for j, squib in enumerate(squibs.get(row.InventoryID, [])): f = common.Contribution_files(object=contrib, id='squib-%s-%s.pdf' % (contrib.id, j + 1), name='Phonological squib', description=squib, mime_type='application/pdf') assert f # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read()) DBSession.flush() unknown_refs = {} for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True): inventory = data['Inventory'][row.InventoryID] segment = data['Segment'].get(row.Phoneme) if not segment: unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme] description = ' - '.join([t[1] for t in unicode_desc]) segment = data.add( models.Segment, row.Phoneme, id=b16encode(md5(description).digest()), name=row.Phoneme, description=description, equivalence_class=''.join([ t[0] for t in unicode_desc if t[1].split()[0] not in ['COMBINING', 'MODIFIER'] ]), segment_class=row.Class, combined_class=row.CombinedClass) DBSession.flush() vs = common.ValueSet(id=row.PhonemeID, contribution=inventory, language=inventory.language, parameter=segment) for ref in refs.get(row.InventoryID, []): if ref not in data['Source']: if ref not in unknown_refs: print('-------', ref) unknown_refs[ref] = 1 continue DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( common.Value( id=row.PhonemeID, name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name), valueset=vs)) DBSession.flush() for inventory_id in refs: for ref in refs[inventory_id]: if ref not in data['Source']: continue data.add(common.ContributionReference, '%s-%s' % (inventory_id, ref), source=data['Source'][ref], contribution=data['Inventory'][inventory_id]) for i, row in enumerate( reader(args.data_file('phoible-segments-features.tsv'))): if i == 0: features = list(map(feature_name, row)) continue if row[0] not in data['Segment']: # print('skipping feature vector:', row) continue for j, value in enumerate(row): if j and value != '0': DBSession.add( common.Parameter_data( key=features[j], value=value, ord=j, object_pk=data['Segment'][row[0]].pk)) DBSession.flush()
def main(args):
    db = create_engine('sqlite:///' + args.data_file('sqlite3.db').resolve().as_posix())
    data = Data()

    dataset = common.Dataset(
        id=tsezacp.__name__,
        name="The Tsez Annotated Corpus Project",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        contact='*****@*****.**',
        domain='tsezacp.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    #
    # TODO: add editors!
    #

    lang = data.add(common.Language, 'tsez', id='tsez', name='Tsez')

    for row in db.execute('select * from texts_data_text'):
        data.add(
            models.Text, row.id,
            id=str(row.Number),
            ord=row.Number,
            name=row.Title_in_Tsez,
            description=row.Title_in_English,
            russian=row.Title_in_Russian)

    for row in db.execute('select * from texts_data_line'):
        text = data['Text'][row.to_Text_id]
        data.add(
            models.Line, row.id,
            id='%s-%s' % (text.id, row.Line_Position),
            ord=row.Line_Position,
            language=lang,
            text=text,
            name=row.Tsez_Line,
            description=row.English_Translation,
            russian=row.Russian_Translation)

    for row in db.execute(
            'select w.id, w.to_Line_id, w.Lex_Position, w.Word_in_Phrase, w.Word_Clear, '
            'm.id, m.Position, m.Value, m.Gloss, m.Part_of_Speech '
            'from texts_data_word as w, texts_data_morpheme as m '
            'where m.to_Word_id = w.id order by w.to_Line_id, w.Lex_Position, m.Position'):
        wid, lid, wpos, wname, wclear, mid, mpos, mname, mgloss, mpartofspeech = row
        if wid in data['WordInLine']:
            w = data['WordInLine'][wid]
        else:
            w = data.add(
                models.WordInLine, wid,
                id=str(wid),
                ord=wpos,
                name=wname,
                description=wclear,
                line=data['Line'][lid])
        w.morphemes.append(models.MorphemeInWord(
            id=str(mid),
            ord=mpos,
            name=mname,
            description=mgloss,
            normgloss=mgloss[1:] if mgloss.startswith('-') else mgloss,
            pos=mpartofspeech.replace('-', '').strip()))

    for lid in sorted(data['Line'].keys()):
        line = data['Line'][lid]
        #print line.name
        #print ' '.join(w.name for w in line.words)
        line.analyzed = '\t'.join(
            '\t'.join(m.name for m in w.morphemes) for w in line.words)
        line.gloss = '\t'.join(
            '\t'.join(m.description for m in w.morphemes) for w in line.words)

    for row in db.execute('select * from texts_data_glossary'):
        data.add(
            models.Morpheme, row.id,
            id=str(row.id),
            name=row.Value,
            language=lang,
            description=row.Gloss,
            notes=row.Notes,
            pos=row.Part_of_Speech.replace('-', '').strip())
def main(args):
    data = Data()

    for rec in Database.from_file(data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for i, name in enumerate([
            'Johann-Mattis List',
            'Cormac Anderson',
            'Tiago Tresoldi',
            'Thiago Chacon',
            'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t', namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
            models.SoundSegment, key,
            id=key,
            name=line.NAME,
            grapheme=line.GRAPHEME,
            aliases=line.ALIASES,
            representation=len(line.REFLEXES.split(',')),
            reflexes=line.REFLEXES,
            generated=True if line.GENERATED else False,
            unicode=line.UNICODE,
        )
    print('')

    english = data.add(common.Language, 'eng', id='eng', name='English')

    contributions = {}
    for line in reader(data_path('datasets.tsv'), delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
            models.CLTSDataSet, line.NAME,
            id=line.NAME,
            name=line.NAME,
            description=line.DESCRIPTION,
            datatype=line.TYPE)
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                source=data['Source'][id_],
                contribution=contributions[line.NAME])

    visited = set()
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t", namedtuples=True)):
        if not i % 100:
            print('-', end='')
        key = line.DATASET + ':' + line.NAME + ':' + line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                id=key,
                description=line.NAME,
                language=english,
                contribution=contributions[line.DATASET],
                parameter=data['SoundSegment'][sound_id])
            data.add(
                models.Grapheme, key,
                id=key,
                grapheme=line.GRAPHEME,
                bipa_grapheme=line.BIPA,
                name=line.NAME,
                dataset=line.DATASET,
                datatype=line.DATATYPE,
                frequency=line.FREQUENCY or 0,
                image=line.IMAGE,
                url=line.URL,
                valueset=vs)
            visited.add(key)
    print('-')
def main(args): Index('ducet', collkey(common.Value.name)).create(DBSession.bind) data = Data() def read(table): return list(dsv.reader( args.data_file(table + '.csv'), delimiter=',', namedtuples=True)) dataset = common.Dataset( id=ids.__name__, name="IDS", description="The Intercontinental Dictionary Series", #published=date(2009, 8, 15), publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-nc-nd/2.0/de/88x31.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany License', }, domain='ids.clld.org') DBSession.add(dataset) data_desc = defaultdict(dict) for l in read('x_lg_data'): data_desc[l.lg_id][l.map_ids_data] = l.header # language lang exclude = [] for l in read('lang'): if l.status == '1': exclude.append(l.lg_id) continue lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=l.lg_name) data.add( models.Dictionary, l.lg_id, id=l.lg_id, name=l.lg_name, language=lang, default_representation=data_desc[l.lg_id].get('1'), alt_representation=data_desc[l.lg_id].get('2'), jsondata=dict(status=l.status, date=l.date)) iso_codes = {l.id: l.sil_code for l in read('sil_lang')} languages = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil') if l.lg_id not in exclude} load_families(Data(), [(v, data['IdsLanguage'][k]) for k, v in languages.items()]) contributors = defaultdict(list) sources = defaultdict(list) for l in read('lang_compilers'): if l.lg_id in exclude: continue if l.name == "BIBIKO": continue #name lg_id what_did_id if int(l.what_did_id) in models.ROLES: contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id)) else: if int(l.what_did_id) not in [4, 395]: print(l.what_did_id) raise ValueError sources[l.name].append(l.lg_id) for s, roles in contributors.items(): name = roles[0][0] c = data.add(common.Contributor, s, id=s, name=name) if name == 'Mary Ritchie Key': c.address = 'University of California, Irvine' for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]): sroles = sorted( [s[1] for s in specs], reverse=True, key=lambda what: what + 2 if what == 2 else what) what = sroles[0] DBSession.add(common.ContributionContributor( contribution=data['Dictionary'][lg], contributor=c, ord=what, primary=what == 2)) data.add( common.Contributor, 'bernardcomrie', id='bernardcomrie', name="Bernard Comrie", address="Max Planck Institute for Evolutionary Anthropology, Leipzig") for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) for i, name in enumerate(sorted(sources.keys())): c = data.add(common.Source, name, id=str(i + 1), name=name, description=name) DBSession.flush() for name, lgs in sources.items(): for lg in lgs: if lg in exclude: continue try: DBSession.add(common.LanguageSource( language_pk=data['IdsLanguage'][lg].pk, source_pk=data['Source'][name].pk)) except KeyError: print(name, lgs) continue altnames = {} for i, l in enumerate(read('alt_names')): if l.name in altnames: identifier = altnames[l.name] else: identifier = data.add( common.Identifier, l.name, id='name-%s' % i, type='name', name=l.name, description='IDS') altnames[l.name] = identifier if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name: DBSession.add(common.LanguageIdentifier( 
identifier=identifier, language=data['IdsLanguage'][l.lg_id])) # parameter chapter/entry for l in read('chapter'): data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title) entries = {} for l in read('entry'): id_ = '%s-%s' % (l.chap_id, l.entry_id) name = l.trans_english if name in entries: entries[name] += 1 name = name + ' (%s)' % entries[name] else: entries[name] = 1 kw = {'id': id_, 'name': name, 'chapter': data['Chapter'][l.chap_id]} for ll in 'french russian spanish portugese'.split(): kw[ll] = getattr(l, 'trans_' + ll) data.add(models.Entry, id_, sub_code=l.entry_id, **kw) misaligned = [] DBSession.flush() for entity in 'IdsLanguage Entry Chapter Dictionary'.split(): for k in data[entity].keys()[:]: data[entity][k] = data[entity][k].pk synsets = set() for lg_id, entries in groupby( sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id): if lg_id in exclude or not lg_id: continue # keep the memory footprint reasonable transaction.commit() transaction.begin() try: language = common.Language.get(data['IdsLanguage'][lg_id]) except KeyError: print(list(entries)) raise desc = data_desc.get(lg_id, {}) words = defaultdict(list) for l in entries: if empty.match(l.data_1): continue entry_id = '%s-%s' % (l.chap_id, l.entry_id) if entry_id not in data['Entry']: data.add( models.Entry, entry_id, id=entry_id, name=entry_id, #active=False, sub_code=l.entry_id, chapter_pk=data['Chapter'][l.chap_id]) DBSession.flush() data['Entry'][entry_id] = data['Entry'][entry_id].pk id_ = '%s-%s' % (entry_id, l.lg_id) if id_ in synsets: vs = models.Synset.get(id_) else: vs = models.Synset( id=id_, comment=get_string(l.comment or ''), alt_representation=get_string(l.data_2), language=language, contribution_pk=data['Dictionary'][l.lg_id], parameter_pk=data['Entry'][entry_id]) synsets.add(id_) trans1 = list(split_counterparts(l.data_1)) trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2)) if trans2: if len(trans2) != len(trans1): if language.id != '238': misaligned.append((l.chap_id, l.entry_id, l.lg_id)) #print('===', language.id, language.name) #print(l.data_1) #print(l.data_2) #assert language.id == '238' # Rapa Nui has problems! trans2 = None for i, word in enumerate(trans1): v = models.Counterpart( id=id_ + '-' + str(i + 1 + len(vs.values)), name=word, description=desc.get('1'), valueset=vs) words[word].append((v, trans2[i] if trans2 else None)) for i, form in enumerate(words.keys()): # Since we identify words based on their string representation, we have to # make sure a word has the same alternative transcription for all meanings. if language.id == '238': alt_names = [] else: alt_names = set(norm(w[1] or '', desc.get('2'), language.id) for w in words[form]) alt_names = nfilter(alt_names) try: assert len(alt_names) <= 1 except AssertionError: print('---', language.id, language.name) print(alt_names) word = models.Word( id='%s-%s' % (language.id, i + 1), name=form, description=desc.get('1'), language=language, alt_name=', '.join(alt_names) if alt_names else None, alt_description=desc.get('2') ) for v, _ in words[form]: word.counterparts.append(v) DBSession.add(word) DBSession.flush() with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp: fp.writerows(misaligned)
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))

    data = Data()
    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")

    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage, l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter, p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            #description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)
        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')
def import_dataset(ds, contrib, languoids, conceptsets, sources, values):
    data = Data()
    concepts = {p.id: p for p in DBSession.query(Concept)}
    langs = {l.id: l for l in DBSession.query(LexibankLanguage)}

    for i, row in enumerate(ds.rows):
        if not row['Value'] or not row['Parameter_ID'] or not row['Language_ID']:
            continue

        lid = row['Language_ID'].lower()
        if lid == 'none':
            continue

        language = langs.get(lid)
        if language is None:
            languoid = languoids.get(lid)
            if not languoid:
                continue
            langs[lid] = language = LexibankLanguage(
                id=lid,
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude)

        concept = concepts.get(row['Parameter_ID'])
        if concept is None:
            cs = conceptsets[row['Parameter_ID']]
            concepts[row['Parameter_ID']] = concept = Concept(
                # FIXME: get gloss and description from concepticon!
                id=row['Parameter_ID'],
                name=cs['GLOSS'],
                description=cs['DEFINITION'],
                semanticfield=cs['SEMANTICFIELD'])

        vsid = unique_id(contrib, '%s-%s-%s' % (ds.name, language.id, concept.id))
        vid = unique_id(contrib, row['ID'])

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=concept,
                language=language,
                contribution=contrib,
                source=None)  # FIXME: add sources!

        counterpart = values.add(
            Counterpart, row['ID'],
            id=vid,
            valueset=vs,
            name=row['Value'],
            description=row.get('Comment'),
            context=row.get('Context'),
            variety_name=row.get('Language_name'),
            loan=row.get('Loan', False),
        )

        for ref in row.refs:
            CounterpartReference(
                counterpart=counterpart,
                source=sources[ref.source.id],
                description=ref.description)
def main(args): old_db = create_engine(DB) data = Data() # # migrate contributor table: complete # for row in old_db.execute("select * from contributor"): data.add(common.Contributor, row['id'], id=row['id'], name='%(firstname)s %(lastname)s' % row, url=row['homepage'], description=row['note'], email=row['email'], address=row['address']) data.add(common.Contributor, 'haspelmathmartin', id='haspelmathmartin', name="Martin Haspelmath", url="http://email.eva.mpg.de/~haspelmt/") DBSession.flush() dataset = common.Dataset( id='wold', name='WOLD', description='World Loanword Database', domain='wold.clld.org', published=date(2009, 8, 15), license='http://creativecommons.org/licenses/by/3.0/de/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by/3.0/de/88x31.png', 'license_name': 'Creative Commons Attribution 3.0 Germany License' }) DBSession.add(dataset) for i, editor in enumerate(['haspelmathmartin', 'tadmoruri']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) # # migrate semantic_field table: complete # for row in old_db.execute("select * from semantic_field"): if row['id'] != 25: kw = dict((key, row[key]) for key in ['id', 'name', 'description']) data.add(models.SemanticField, row['id'], **kw) # # migrate language table: complete # recipient flag is replaced by vocabulary_pk! # for row in old_db.execute("select * from language order by id"): kw = dict((key, row[key]) for key in [ 'fm_dl_id', 'name', 'latitude', 'longitude', 'wals_equivalent', 'affiliation', 'family', 'genus', 'countries' ]) data.add(models.WoldLanguage, row['id'], id=str(row['id']), **kw) # # migrate language_code table: complete # for row in old_db.execute("select * from language_code"): _id = '%(type)s-%(code)s' % row data.add(common.Identifier, _id, id=_id, type=row['type'], name=row['code']) if row['type'] == 'iso639-3' and row['code'] in glottocodes: gc = glottocodes[row['code']] data.add(common.Identifier, gc, id=gc, type=common.IdentifierType.glottolog.value, name=gc) DBSession.flush() # # migrate language_code_language table: complete # for row in old_db.execute("select * from language_code_language"): _id = '%(type)s-%(code)s' % row data.add(common.LanguageIdentifier, '%s-%s' % (_id, row['language_id']), identifier_pk=data['Identifier'][_id].pk, language_pk=data['WoldLanguage'][row['language_id']].pk) if row['type'] == 'iso639-3' and row['code'] in glottocodes: gc = glottocodes[row['code']] data.add(common.LanguageIdentifier, '%s-%s' % (gc, row['language_id']), identifier_pk=data['Identifier'][gc].pk, language_pk=data['WoldLanguage'][row['language_id']].pk) DBSession.flush() # # migrate vocabulary table: complete # for row in old_db.execute("select * from vocabulary order by id"): jsondata = {} for key in row.keys(): if key.startswith('fd_') or key in [ 'other_information', 'abbreviations' ]: jsondata[key] = row[key] vocab = data.add(models.Vocabulary, row['id'], id=str(row['id']), name=row['name'], color=row['color'], jsondata=jsondata) DBSession.flush() data['WoldLanguage'][row['language_id']].vocabulary_pk = vocab.pk DBSession.flush() # # migrate contact_situation and age tables: complete # contact situations and ages are unitdomainelements! 
# contact_situation = common.UnitParameter(id='cs', name='Contact Situation') age = common.UnitParameter(id='a', name='Age') DBSession.add(contact_situation) DBSession.add(age) DBSession.flush() for row in old_db.execute("select * from contact_situation"): if row['vocabulary_id'] is None: continue kw = dict((key, row[key]) for key in ['description', 'id', 'name']) kw['id'] = 'cs-%s' % kw['id'] p = data.add(models.WoldUnitDomainElement, row['id'], **kw) p.vocabulary = data['Vocabulary'][row['vocabulary_id']] p.unitparameter_pk = contact_situation.pk for row in old_db.execute("select * from age"): id_ = '%(vocabulary_id)s-%(label)s' % row kw = dict((key, row[key]) for key in ['start_year', 'end_year']) p = data.add(models.WoldUnitDomainElement, id_, id='a-%s' % id_, name=row['label'], description=row['description'], jsondata=kw) p.vocabulary = data['Vocabulary'][row['vocabulary_id']] p.unitparameter_pk = age.pk # # migrate meaning table: complete # for row in old_db.execute("select * from meaning"): kw = dict((key, row[key]) for key in [ 'description', 'core_list', 'ids_code', 'typical_context', 'semantic_category' ]) p = data.add( models.Meaning, row['id'], id=row['id'].replace('.', '-'), name=row['label'], sub_code=row['id'].split('.')[1] if '.' in row['id'] else '', semantic_field=data['SemanticField'][row['semantic_field_id']], **kw) DBSession.flush() for field in ['french', 'spanish', 'german', 'russian']: DBSession.add( models.Translation(name=row[field], lang=field, meaning=p)) for key in data['WoldLanguage']: lang = data['WoldLanguage'][key] data.add(common.ValueSet, '%s-%s' % (key, row['id']), id='%s-%s' % (key, row['id'].replace('.', '-')), language=lang, contribution=lang.vocabulary, parameter=p) DBSession.flush() # # migrate word table: # TODO: all the other word properties!! 
# fields = [ 'age_label', 'original_script', 'grammatical_info', 'comment_on_word_form', 'gloss', "comment_on_borrowed", "calqued", "borrowed_base", "numeric_frequency", "relative_frequency", "effect", "integration", "salience", "reference", "other_comments", "register", "loan_history", 'colonial_word', 'paraphrase_in_dutch', 'word_source', 'paraphrase_in_german', 'lexical_stratum', 'comparison_with_mandarin', 'year', 'comparison_with_korean', 'czech_translation', 'hungarian_translation', 'early_romani_reconstruction', 'etymological_note', 'boretzky_and_igla_etymology', 'manuss_et_al_etymology', 'vekerdi_etymology', 'turner_etymology', 'other_etymologies', 'mayrhofer_etymology', ] word_to_vocab = {} for row in old_db.execute("select * from word"): word_to_vocab[row['id']] = row['vocabulary_id'] kw = dict((key, row[key]) for key in [ 'id', 'age_score', 'borrowed', 'borrowed_score', 'analyzability', 'simplicity_score' ]) w = data.add(models.Word, row['id'], name=row['form'], description=row['free_meaning'], jsondata={k: row[k] for k in fields}, **kw) w.language = data['Vocabulary'][row['vocabulary_id']].language if row['age_label']: DBSession.add( common.UnitValue( id='%(id)s-a' % row, unit=w, unitparameter=age, unitdomainelement=data['WoldUnitDomainElement'][ '%(vocabulary_id)s-%(age_label)s' % row], contribution=data['Vocabulary'][row['vocabulary_id']])) if row['contact_situation_id'] and row[ 'contact_situation_id'] != '9129144185487768': DBSession.add( common.UnitValue( id='%(id)s-cs' % row, unit=w, unitparameter=contact_situation, unitdomainelement=data['WoldUnitDomainElement'][ row['contact_situation_id']], contribution=data['Vocabulary'][row['vocabulary_id']])) DBSession.flush() # # migrate word_meaning table: complete # for i, row in enumerate(old_db.execute("select * from word_meaning")): data.add( models.Counterpart, i, id=i, description='%(relationship)s (%(comment_on_relationship)s)' % row, name=data['Word'][row['word_id']].name, valueset=data['ValueSet']['%s-%s' % (word_to_vocab[row['word_id']], row['meaning_id'])], word=data['Word'][row['word_id']]) DBSession.flush() # # migrate vocabulary_contributor table: complete # for row in old_db.execute("select * from vocabulary_contributor"): DBSession.add( common.ContributionContributor( ord=row['ordinal'], primary=row['primary'], contributor_pk=data['Contributor'][row['contributor_id']].pk, contribution_pk=data['Vocabulary'][row['vocabulary_id']].pk)) DBSession.flush() # # source words: we have to make sure a word does only belong to one language. # thus, we have to reassign identifier! # # loop over source_word, source_word_donor_language pairs keeping track of source_word ids: known_ids = {} for row in old_db.execute( "select sw.id, sw.meaning, sw.form, dl.language_id from source_word as sw, source_word_donor_language as dl where sw.id = dl.source_word_id" ): if row['id'] in known_ids: # source_word was already seen associated to a different donor language! assert row['language_id'] not in known_ids[row['id']] known_ids[row['id']].append(row['language_id']) id_ = '%s-%s' % (row['id'], len(known_ids[row['id']])) else: id_ = '%s-%s' % (row['id'], 1) known_ids[row['id']] = [row['language_id']] new = data.add(models.Word, id_, id=id_, name=row['form'], description=row['meaning']) new.language = data['WoldLanguage'][row['language_id']] # source words may end up as words without language! 
for row in old_db.execute( "select id, meaning, form from source_word where id not in (select source_word_id from source_word_donor_language)" ): id_ = '%s-%s' % (row['id'], 1) new = data.add(models.Word, id_, id=id_, name=row['form'], description=row['meaning']) DBSession.flush() # # migrate word_source_word relations # TODO: should be modelled as UnitParameter! # j = 0 for row in old_db.execute("select * from word_source_word"): # there may be more than one word associated with a source_word_id (see above) source_words = [] for i in range(4): # but we guess no more than 4 :) id_ = '%s-%s' % (row['source_word_id'], i + 1) if id_ in data['Word']: source_words.append(data['Word'][id_]) if not source_words: j += 1 #print(row['source_word_id']) #raise ValueError(row['source_word_id']) for sw in source_words: DBSession.add( models.Loan(source_word=sw, target_word=data['Word'][row['word_id']], relation=row['relationship'], certain=len(source_words) == 1)) print('%s source words not migrated because they have no donor language!' % j)
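# The source-word handling above re-suffixes identifiers so that one source_word row
# can yield one Word per donor language ('<id>-1', '<id>-2', ...), tracked via the
# known_ids dict. A stripped-down sketch of that numbering scheme, pure Python and
# with made-up sample pairs instead of database rows:
def resuffix(pairs):
    """Map (source_word_id, language_id) pairs to unique '<id>-<n>' identifiers."""
    known_ids = {}
    out = {}
    for sw_id, lang_id in pairs:
        if sw_id in known_ids:
            assert lang_id not in known_ids[sw_id]  # the same pairing must not repeat
            known_ids[sw_id].append(lang_id)
        else:
            known_ids[sw_id] = [lang_id]
        out[(sw_id, lang_id)] = '%s-%s' % (sw_id, len(known_ids[sw_id]))
    return out

assert resuffix([('w1', 'eng'), ('w1', 'deu'), ('w2', 'fra')]) == {
    ('w1', 'eng'): 'w1-1', ('w1', 'deu'): 'w1-2', ('w2', 'fra'): 'w2-1'}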
def main(args): data = Data() dataset = common.Dataset( id=abvd.__name__, name='ABVD', description='', domain='abvd.clld.org', published=date.today(), license='https://creativecommons.org/licenses/by/4.0/', contact='', jsondata={ 'doi': args.doi, 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for name in ['Simon Greenhill', 'Robert Blust', 'Russell Gray']: common.Editor(contributor=contributor(data, name), dataset=dataset) cnames = Counter() families = Counter([l['Family'] for l in args.cldf['LanguageTable']]) colors = dict( zip([i[0] for i in families.most_common()], color.qualitative_colors(len(families)))) cid2l = {} for lang in args.cldf['LanguageTable']: lid = (lang['Name'], lang['Glottocode']) l = data['Language'].get(lid) if not l: l = data.add( common.Language, lid, id=lang['ID'], name=lang['Name'], latitude=lang['Latitude'], longitude=lang['Longitude'], jsondata=dict( family=lang['Family'], icon='{0}{1}'.format('c' if lang['Family'] else 't', colors[lang['Family']]), ), ) if lang['Glottocode'] or lang['ISO639P3code']: add_language_codes( data, l, isocode=lang['ISO639P3code'], glottocode=lang['Glottocode']) cid2l[lang['ID']] = l cname = '{0} ({1})'.format(lang['Name'], lang['author']) cnames.update([cname]) if cnames[cname] > 1: cname += ' {0}'.format(cnames[cname]) c = data.add( models.Wordlist, lang['ID'], id=lang['ID'], name=cname, description=lang['author'], language=l, notes=lang['notes'], ) i = 0 typers = (lang['typedby'] or '').split(' and ') checkers = (lang['checkedby'] or '').split(' and ') for name in typers: i += 1 DBSession.add(common.ContributionContributor( contribution=c, contributor=contributor(data, name), ord=i, jsondata=dict(type='typedby and checkedby' if name in checkers else 'typedby'), )) for name in checkers: if name in typers: continue i += 1 DBSession.add(common.ContributionContributor( contribution=c, contributor=contributor(data, name), ord=i, jsondata=dict(type='checkedby'), )) for param in args.cldf['ParameterTable']: data.add( common.Parameter, param['ID'], id=param['ID'], name=param['Name'], ) # # FIXME: add sources! # vsrs = set() for row in args.cldf['FormTable']: vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID'])) if not vs: vs = data.add( common.ValueSet, (row['Language_ID'], row['Parameter_ID']), id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']), language=cid2l[row['Language_ID']], parameter=data['Parameter'][row['Parameter_ID']], contribution=data['Wordlist'][row['Language_ID']], ) v = data.add( common.Value, row['ID'], id=row['ID'], name=row['Form'], valueset=vs ) for row in args.cldf['CognateTable']: cc = data['Cognateset'].get(row['Cognateset_ID']) if not cc: cc = data.add(Cognateset, row['Cognateset_ID'], id=row['Cognateset_ID']) data.add( Cognate, row['ID'], cognateset=cc, counterpart=data['Value'][row['Form_ID']], doubt=row['Doubt'], )
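# Wordlist names in the ABVD loader above are disambiguated with a Counter: the first
# "Name (author)" keeps the plain label, later duplicates get a running number
# appended. A minimal sketch of that rule with invented labels:
from collections import Counter

def disambiguate(labels):
    seen = Counter()
    out = []
    for label in labels:
        seen.update([label])
        out.append(label if seen[label] == 1 else '{0} {1}'.format(label, seen[label]))
    return out

assert disambiguate(['Bali (Smith)', 'Bali (Smith)', 'Fiji (Lee)']) == [
    'Bali (Smith)', 'Bali (Smith) 2', 'Fiji (Lee)']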
def main(args): meta = parse_meta(args) print(len(meta)) print(sum(len(m.sources) for m in meta.values())) sources = {} for m in meta.values(): for s in m.sources: sources[s] = None print(len(sources), 'distinct') for i, s in enumerate(sources): sources[s] = get_source(s, i + 1) glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3') data = Data() wals = create_engine('postgresql://robert@/wals3') wals_families = {} for row in wals.execute('select name, id from family'): wals_families[row[0]] = row[1] wals_families[row[1]] = row[1] #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'): # name = item.FAMILY # if name not in wals_families: # name = slug(name) # if name not in wals_families: # print('missing wals family:', item.FAMILY) # name = None # if name: # wals_families[item.ABBREVIATION] = wals_families[name] wals_genera = { row[0]: row[0] for row in wals.execute('select id from genus') } with args.data_file('listss17.txt').open(encoding='latin1') as fp: wordlists = ['\n'.join(lines) for lines in parse(fp)] dataset = common.Dataset( id=asjp.__name__, name="The ASJP Database", contact="*****@*****.**", description="The Automated Similarity Judgment Program", domain='asjp.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.shh.mpg.de", license='http://creativecommons.org/licenses/by/4.0/', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) DBSession.add(dataset) transcribers = get_transcriber_map(args) for i, spec in enumerate([ ('SW', "Søren Wichmann"), ('AM', "André Müller"), ('AKW', "Annkathrin Wett"), ('VV', "Viveka Velupillai"), ('JB', "Julia Bischoffberger"), ('CB', "Cecil H. Brown"), ('EH', "Eric W. Holman"), ('SS', "Sebastian Sauppe"), ('ZM', "Zarina Molochieva"), ('PB', "Pamela Brown"), ('HH', "Harald Hammarström"), ('OB', "Oleg Belyaev"), ('JML', "Johann-Mattis List"), ('DBA', "Dik Bakker"), ('DE', "Dmitry Egorov"), ('MU', "Matthias Urban"), ('RM', "Robert Mailhammer"), ('AC', "Agustina Carrizo"), ('MSD', "Matthew S. Dryer"), ('EK', "Evgenia Korovina"), ('DB', "David Beck"), ('HG', "Helen Geyer"), ('PE', "Patience Epps"), ('AG', "Anthony Grant"), ('PS', "Paul Sidwell"), # not in citation ('KTR', "K. 
Taraka Rama"), # not in citation ('PV', "Pilar Valenzuela"), ('MD', "Mark Donohue"), # not in citation ]): id_, name = spec if id_ in transcribers: assert name == transcribers.pop(id_) contributor = data.add(common.Contributor, id_, id=id_, name=name) if id_ in ['SW', 'CB', 'EH']: DBSession.add( common.Editor(dataset=dataset, ord=i + 1, contributor=contributor)) for id_, name in transcribers.items(): data.add(common.Contributor, id_, id=id_, name=name) for id_ in sorted(models.MEANINGS_ALL.keys()): data.add(models.Meaning, id_, id=str(id_), name=models.MEANINGS_ALL[id_], core=id_ in models.MEANINGS) for n, l in enumerate(wordlists): #if n > 100: # break lang = models.Doculect.from_txt(l) if lang.classification_wals: family, genus = lang.classification_wals.split('.') lang.wals_family = wals_families.get(family) lang.wals_genus = wals_genera.get(slug(genus)) lang.code_glottolog = glottocodes.get(lang.code_iso) add_codes(lang) data.add(models.Doculect, lang.id, _obj=lang) DBSession.flush() md = meta.pop(lang.id, None) assert md # associate transcribers and sources for i, transcriber in enumerate(md.transcribers): common.ContributionContributor( contribution=lang.wordlist, contributor=data['Contributor'][transcriber], ord=i + 1) for source in md.sources: DBSession.add( common.LanguageSource(language_pk=lang.pk, source_pk=sources[source].pk)) assert not list(meta.keys())
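# The doculect loop above resolves WALS classification strings of the form
# "Family.Genus" against lookup tables keyed by family name/id and by genus slug.
# The sketch below is self-contained: simple_slug is a crude stand-in for the clld
# slug() helper used in the loader, and the table contents are invented.
import re

def simple_slug(s):
    # crude stand-in: lowercase and strip non-alphanumerics
    return re.sub('[^a-z0-9]', '', s.lower())

def resolve_wals(classification, families, genera):
    if not classification:
        return None, None
    family, genus = classification.split('.')
    return families.get(family), genera.get(simple_slug(genus))

families = {'Austronesian': 'austronesian', 'austronesian': 'austronesian'}
genera = {'oceanic': 'oceanic'}
assert resolve_wals('Austronesian.Oceanic', families, genera) == ('austronesian', 'oceanic')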
def main(args): Index('ducet', collkey(common.Value.name)).create(DBSession.bind) data = Data() concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310') def concepticon_id(ids_code): for item in concept_list: if item['IDS_ID'] == ids_code: return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None def read(table): fname = args.data_file(table + '.all.csv') if not fname.exists(): fname = args.data_file(table + '.csv') return list(dsv.reader(fname, namedtuples=True)) dataset = common.Dataset( id=ids.__name__, name="IDS", description="The Intercontinental Dictionary Series", published=date(2015, 5, 25), publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license='http://creativecommons.org/licenses/by/4.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License', }, domain='ids.clld.org') DBSession.add(dataset) for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True): if rec.id not in data['Source']: data.add(common.Source, rec.id, _obj=bibtex2source(rec)) DBSession.flush() data_desc = defaultdict(dict) for l in read('x_lg_data'): data_desc[l.lg_id][l.map_ids_data] = l.header # language lang iso_codes = {l.id: l.sil_code for l in read('sil_lang')} iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')} languages = [] exclude = [] for l in read('lang'): if l.status == '1': exclude.append(l.lg_id) continue lang_changed = LANGS.get(int(l.lg_id), {}) code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id) lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name)) if code: languages.append((code, lang)) data.add( models.Dictionary, l.lg_id, id=l.lg_id, name=l.lg_name, language=lang, default_representation=data_desc[l.lg_id].get('1'), alt_representation=data_desc[l.lg_id].get('2'), jsondata=dict(status=l.status, date=l.date)) iso2glotto = {} for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)): if l.iso: iso2glotto[l.iso] = l.id load_families( Data(), [(iso2glotto.get(c, c), l) for c, l in languages], glottolog=Glottolog(GLOTTOLOG_REPOS), isolates_icon='tcccccc') contributors = defaultdict(list) sources = defaultdict(list) for l in read('lang_compilers'): if l.lg_id in exclude: continue if l.name == "BIBIKO": continue #name lg_id what_did_id if int(l.what_did_id) in models.ROLES: contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id)) else: assert int(l.what_did_id) in [4, 395] sources[l.name].append(l.lg_id) for s, roles in contributors.items(): name = roles[0][0] c = data.add(common.Contributor, s, id=s, name=name) if name == 'Mary Ritchie Key': c.address = 'University of California, Irvine' for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]): sroles = sorted( [s[1] for s in specs], reverse=True, key=lambda what: what + 2 if what == 2 else what) what = sroles[0] DBSession.add(common.ContributionContributor( contribution=data['Dictionary'][lg], contributor=c, ord=what, primary=what == 2)) data.add( common.Contributor, 'bernardcomrie', id='bernardcomrie', name="Bernard Comrie", address="University of California, Santa Barbara") for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) #for i, name in enumerate(sorted(sources.keys())): # c = 
data.add(common.Source, name, id=str(i + 1), name=name, description=name) DBSession.flush() for name, lgs in sources.items(): for _src in name.split(';'): src = data['Source'].get(_src.strip()) if not src: print('-- missing source --', _src) raise ValueError for lg in lgs: if lg in exclude: continue assert lg in data['Dictionary'] DBSession.add(common.ContributionReference( contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk)) altnames = {} for i, l in enumerate(read('alt_names')): if l.name in altnames: identifier = altnames[l.name] else: identifier = data.add( common.Identifier, l.name, id='name-%s' % i, type='name', name=l.name, description='IDS') altnames[l.name] = identifier if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name: DBSession.add(common.LanguageIdentifier( identifier=identifier, language=data['IdsLanguage'][l.lg_id])) # parameter chapter/entry for l in read('chapter'): data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title) entries = {} for l in read('entry'): id_ = '%s-%s' % (l.chap_id, l.entry_id) name = l.trans_english if name in entries: entries[name] += 1 name = name + ' (%s)' % entries[name] else: entries[name] = 1 kw = { 'id': id_, 'name': name, 'concepticon_id': concepticon_id(id_), 'chapter': data['Chapter'][l.chap_id]} for ll in 'french russian spanish portugese'.split(): kw[ll] = getattr(l, 'trans_' + ll) data.add(models.Entry, id_, sub_code=l.entry_id, **kw) misaligned = [] DBSession.flush() for entity in 'IdsLanguage Entry Chapter Dictionary'.split(): for k in data[entity].keys()[:]: data[entity][k] = data[entity][k].pk synsets = set() counterparts = set() problems = defaultdict(list) for lg_id, entries in groupby( sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id): if lg_id in exclude or not lg_id: continue # keep the memory footprint reasonable transaction.commit() transaction.begin() language = common.Language.get(data['IdsLanguage'][lg_id]) desc = data_desc.get(lg_id, {}) words = defaultdict(list) for l in entries: if empty.match(l.data_1): continue entry_id = '%s-%s' % (l.chap_id, l.entry_id) if entry_id not in data['Entry']: continue #data.add( # models.Entry, entry_id, # id=entry_id, # name=entry_id, # concepticon_id=concepticon_id(entry_id), # sub_code=l.entry_id, # chapter_pk=data['Chapter'][l.chap_id]) #DBSession.flush() #data['Entry'][entry_id] = data['Entry'][entry_id].pk id_ = '%s-%s' % (entry_id, l.lg_id) if id_ in synsets: vs = models.Synset.get(id_) else: vs = models.Synset( id=id_, comment=get_string(l.comment or ''), alt_representation=get_string(l.data_2), language=language, contribution_pk=data['Dictionary'][l.lg_id], parameter_pk=data['Entry'][entry_id]) synsets.add(id_) trans1 = list(split_counterparts(l.data_1)) trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2)) if trans2: if len(trans2) != len(trans1): if language.id != '238': misaligned.append((l.chap_id, l.entry_id, l.lg_id)) #print('===', language.id, language.name) #print(l.data_1) #print(l.data_2) # 83 cases of misaligned transcriptions trans2 = None for i, word in enumerate(trans1): cid = id_ + '-' + str(i + 1 + len(vs.values)) if cid not in counterparts: v = models.Counterpart( id=cid, name=word, description=desc.get('1'), valueset=vs) words[word].append((v, trans2[i] if trans2 else None)) counterparts.add(cid) else: print(cid) #12 - 420 - 811 - 3 #5 - 390 - 818 - 3 #2 - 930 - 819 - 3 #2 - 930 - 819 - 3 #3 - 120 - 819 - 3 #10 - 140 - 822 - 3 #9 - 160 - 825 - 3 #2 - 430 - 829 - 4 for i, form in 
enumerate(words.keys()): # Since we identify words based on their string representation, we have to # make sure a word has the same alternative transcription for all meanings. if language.id == '238': alt_names = [] else: alt_names = set(norm(w[1] or '', desc.get('2'), language.id) for w in words[form]) alt_names = nfilter(alt_names) try: assert len(alt_names) <= 1 except AssertionError: problems[(language.id, language.name)].append(alt_names) word = models.Word( id='%s-%s' % (language.id, i + 1), name=form, description=desc.get('1'), language=language, alt_name=', '.join(alt_names) if alt_names else None, alt_description=desc.get('2') ) for v, _ in words[form]: word.counterparts.append(v) DBSession.add(word) DBSession.flush() with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp: fp.writerows(misaligned) # about 250 cases where alternative transcriotions do not covary across meanings. for k, v in problems.items(): print(k, len(v))
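# The IDS counterpart import above pairs the default transcription (data_1) with the
# alternative one (data_2) only when both split into the same number of forms;
# otherwise the alternative transcription is dropped and the row is recorded as
# misaligned. A small sketch of that pairing rule, splitting on ';' for brevity
# instead of the split_counterparts helper used in the loader:
def pair_transcriptions(data_1, data_2, sep=';'):
    trans1 = [w.strip() for w in data_1.split(sep) if w.strip()]
    trans2 = [w.strip() for w in data_2.split(sep) if w.strip()] if data_2 else None
    if trans2 and len(trans2) != len(trans1):
        return [(w, None) for w in trans1], True  # misaligned: keep only data_1
    return [(w, trans2[i] if trans2 else None) for i, w in enumerate(trans1)], False

assert pair_transcriptions('kana; mata', 'ka.na; ma.ta') == (
    [('kana', 'ka.na'), ('mata', 'ma.ta')], False)
assert pair_transcriptions('kana; mata', 'ka.na') == (
    [('kana', None), ('mata', None)], True)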
def main(args): data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2015, 10, 1), contact='*****@*****.**', domain='dictionaria.clld.org', license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) ed = data.add( common.Contributor, 'hartmanniren', id='hartmanniren', name='Iren Hartmann') common.Editor(dataset=dataset, contributor=ed) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} comparison_meanings_alt_labels = {} print('loading concepts ...') concepticon = Concepticon() for i, concept_set in enumerate(concepticon.resources('parameter').members): concept_set = concepticon.resource(concept_set) cm = ComparisonMeaning( id=concept_set.id, name=concept_set.name.lower(), description=concept_set.description, concepticon_url='%s' % concept_set.uriref) DBSession.add(cm) comparison_meanings[cm.name] = cm for label in concept_set.alt_labels: comparison_meanings_alt_labels.setdefault(label.lower(), cm) DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} comparison_meanings_alt_labels = { k: v.pk for k, v in comparison_meanings_alt_labels.items()} submissions = [] for submission in REPOS.joinpath('submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md id_ = submission.id lmd = md['language'] language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) dictionary = data.add( Dictionary, id_, id=id_, name=lmd['name'] + ' Dictionary', language=language, published=date(*map(int, md['published'].split('-')))) for i, cname in enumerate(md['authors']): name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = data['Contributor'].get(cid) if not contrib: contrib = data.add(common.Contributor, cid, id=cid, name=cname) DBSession.add(common.ContributionContributor( ord=i + 1, primary=True, contributor=contrib, contribution=dictionary)) submissions.append((dictionary.id, language.id, submission)) transaction.commit() for did, lid, submission in submissions: try: mod = __import__( 'dictionaria.loader.' + submission.id, fromlist=['MARKER_MAP']) marker_map = mod.MARKER_MAP except ImportError: marker_map = {} transaction.begin() print('loading %s ...' % submission.id) submission.load( did, lid, comparison_meanings, comparison_meanings_alt_labels, marker_map) transaction.commit() print('... done') #('hoocak', 'Hooca\u0328k', 43.5, -88.5, [('hartmanniren', 'Iren Hartmann')]), #('yakkha', 'Yakkha', 27.37, 87.93, [('schackowdiana', 'Diana Schackow')]), #('palula', 'Palula', 35.51, 71.84, [('liljegrenhenrik', 'Henrik Liljegren')], {}), #('daakaka', 'Daakaka', -16.27, 168.01, [('vonprincekilu', 'Kilu von Prince')], # {'published': date(2015, 9, 30), 'iso': 'bpa', 'glottocode': 'daka1243'}), #('teop', 'Teop', -5.67, 154.97, [('moselulrike', 'Ulrike Mosel')], # {'published': date(2015, 9, 30), 'iso': 'tio', 'glottocode': 'teop1238', 'encoding': 'latin1'}), transaction.begin() load_families(Data(), DBSession.query(Variety))
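# The Dictionaria loader above builds two lookups from Concepticon: one keyed by the
# lower-cased primary gloss, one by alternative labels filled with setdefault, so the
# first concept to claim an alternative label wins and primary glosses always take
# precedence. A minimal sketch of that resolution order with toy concept dicts:
def build_lookup(concepts):
    primary, alternative = {}, {}
    for concept in concepts:
        primary[concept['name'].lower()] = concept['id']
        for label in concept.get('alt_labels', []):
            alternative.setdefault(label.lower(), concept['id'])
    return primary, alternative

def resolve(gloss, primary, alternative):
    gloss = gloss.lower()
    return primary.get(gloss) or alternative.get(gloss)

primary, alternative = build_lookup([
    {'id': 1, 'name': 'water', 'alt_labels': ['aqua']},
    {'id': 2, 'name': 'hand', 'alt_labels': ['aqua', 'palm']},
])
assert resolve('Water', primary, alternative) == 1
assert resolve('aqua', primary, alternative) == 1  # first claimant wins
assert resolve('palm', primary, alternative) == 2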
def import_dataset(ds, contrib, languoids, conceptsets, sources, values):
    data = Data()
    concepts = {p.id: p for p in DBSession.query(Concept)}
    langs = {l.id: l for l in DBSession.query(LexiRumahLanguage)}

    for i, row in enumerate(ds.rows):
        if not row['Value'] or not row['Parameter_ID'] or not row['Language_ID']:
            continue

        lid = row['Language_ID'].lower()
        if lid == 'none':
            continue

        if not row['Parameter_ID'].strip():
            continue

        language = langs.get(lid)
        if language is None:
            languoid = languoids.get(lid)
            if not languoid:
                continue
            langs[lid] = language = LexiRumahLanguage(
                id=lid,
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude if languoid.id != 'plau1238' else -10,
                longitude=languoid.longitude)

        concept = concepts.get(row['Parameter_ID'])
        if concept is None:
            cs = conceptsets[row['Parameter_ID']]
            concepts[row['Parameter_ID']] = concept = Concept(
                id=row['Parameter_ID'],
                name=cs.gloss,
                description=cs.definition,
                semanticfield=cs.semanticfield)

        vsid = unique_id(contrib, '%s-%s-%s' % (ds.name, language.id, concept.id))
        vid = unique_id(contrib, row['ID'])

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(
                ValueSet, vsid,
                id=vsid,
                parameter=concept,
                language=language,
                contribution=contrib,
                source=None)  # FIXME: add sources!

        counterpart = values.add(
            Counterpart, row['ID'],
            id=vid,
            valueset=vs,
            name=row['Form'],
            description=row.get('Comment'),
            context=row['Value'],
            variety_name=row.get('Language_name'),
            loan=row.get('Loan', False),
        )

        for ref in row.refs:
            CounterpartReference(
                counterpart=counterpart,
                source=sources[ref.source.id],
                description=ref.description)
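# Both import_dataset variants namespace dataset-local row ids by their contribution
# via the unique_id() helper, so rows from different source datasets cannot collide
# once merged into one database. The helper itself is not shown in this file and in
# the code above it takes the contribution object, not a string; the sketch below is
# only a guess at its general shape, for illustration:
def unique_id_sketch(contrib_id, local_id):
    """Namespace a dataset-local identifier by its contribution (hypothetical)."""
    return '{0}-{1}'.format(contrib_id, local_id)

assert unique_id_sketch('lexirumah-abui', '42') == 'lexirumah-abui-42'
assert unique_id_sketch('lexirumah-alor', '42') != unique_id_sketch('lexirumah-abui', '42')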
def main(args): data = Data() files_dir.rmtree() files_dir.mkdir() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read('People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 8, 15), # # TODO: switch license! # license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict((row['ID'], row['RGB_code']) for row in read('Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 with open(data_dir.joinpath('non-lgr-gloss-abbrs.csv'), 'rb') as csvfile: for row in csv.reader(csvfile): for match in GLOSS_ABBR_PATTERN.finditer(row[1]): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row[0])) non_bibs = {} for row in read('References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add( common.Source, row['Reference_ID'], id=row['Reference_ID'], name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() 
DBSession.flush() gt = {} p = re.compile('[0-9]+\_(?P<name>[^\_]+)\_(GT|Text)') for d in data_dir.joinpath('gt').files(): m = p.search(unicode(d.basename())) if m: for part in m.group('name').split('&'): # make sure we prefer files named "Text_for_soundfile" if slug(unicode(part)) not in gt or 'Text_for_' in d.basename(): gt[slug(unicode(part))] = d gt_audio = {} p = re.compile('(?P<name>[^\.]+)\.mp3') for d in data_dir.joinpath('gt', 'audio').files(): m = p.search(unicode(d.basename())) assert m for part in m.group('name').split('&'): gt_audio[slug(unicode(part))] = d with open(args.data_file('infobox.json')) as fp: infobox = json.load(fp) for row in read('Languages', 'Order_number'): lon, lat = [float(c.strip()) for c in row['map_coordinates'].split(',')] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], #base_language=row['Category_base_language'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add(common.Language_data( object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] == "Checked": desc = row.get('Languages_contribution_documentation::Lect description', '') else: desc = '' c = data.add( models.ApicsContribution, row['Language_ID'], id=row['Order_number'], name=row['Language_name'], description=desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) if slug(row['Language_name']) in gt: f = common.Contribution_files( object=c, id='%s-gt.pdf' % c.id, name='Glossed text', mime_type='application/pdf') f.create(files_dir, file(gt[slug(row['Language_name'])]).read()) else: print '--- no glossed text for:', row['Language_name'] if slug(row['Language_name']) in gt_audio: f = common.Contribution_files( object=c, id='%s-gt.mp3' % c.id, name='Glossed text audio', mime_type='audio/mpeg') f.create(files_dir, file(gt_audio[slug(row['Language_name'])]).read()) else: print '--- no audio for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add( common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type='iso639-3') DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add( common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add(common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][row['Language_name_ethnologue']])) example_count = {} soundfiles = {} for p in data_dir.joinpath('Soundfiles_Examples').files(): soundfiles[p.namebase] = p for row in read('Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max([example_count.get(row['Language_ID'], 1), row['Example_number']]) p = data.add( common.Sentence, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or 
row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={'sort': row['Order_number']}, language=lang) if id_ in soundfiles: #print '---> sound', id_ f = common.Sentence_files( object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg') f.create(files_dir, file(soundfiles[id_]).read()) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add(common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'], )) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read('Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add(common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read('Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add( models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=row['Feature_annotation_publication'], markup_description=normalize_markup(row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append(models.FeatureAuthor( ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict(zip( primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read('Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue 
number_map[row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]] = str(feature_count) p = data.add( models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if truth(row['Vowel']) else ( 'Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add( common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read('Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add( models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 7): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row['Value%s' % i] and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: name = '%s - %s' % (row['Sociolinguistic_feature_name'], i) kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) de = data.add( common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={'color': colors.values()[i]}) sd = {} soundfiles = {} for p in data_dir.joinpath('Soundfiles_Segments').files(): soundfiles[p.namebase] = p for row in read('Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] #Language_ID,Segment_feature_number,Comments,Audio_file_name,Example_word, #Example_word_gloss,Presence_in_the_language,Refers_to_references_Reference_ID if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement']['%s-%s' % ( number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = data.add( common.Sentence, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) sid = '%(Language_ID)s-%(Segment_feature_number)s' % row if sid in soundfiles: print '---> sound', sid f = common.Sentence_files( object=p, id='%s.mp3' % p.id, name='Audio', mime_type='audio/mpeg') f.create(files_dir, file(soundfiles[sid]).read()) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = 
data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add(common.ValueSetReference( valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read('wals'): if row['z_calc_WALS_value_number']: wals_value_number[row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr, num_values in [ ('', '', 10), ('Sociolinguistic', 'sl', 7), ]: for row in read(prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print 'no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)] continue lid = row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] #if row[prefix('feature_code', _prefix)] not in data['Feature']: # print row[prefix('feature_code', _prefix)] # print str(row[prefix('data_record_id', _prefix)]) # raise ValueError language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup(row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), #valueset=valueset, domainelement=data['DomainElement']['%s-%s' % ( row[prefix('feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = {'wals_value_number': wals_value_number.pop(row[prefix('data_record_id', _prefix)])} valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for 
corresponding primary features! # if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int(parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add(common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add(common.ValueSetReference( valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add(common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: print('Reference for unknown dataset: %s' % row[prefix + 'ata_record_id']) continue DBSession.flush() missing = 0 for row in read('Value_examples'): try: DBSession.add(common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence']['%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read('Contributors')): kw = dict( contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']] ) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
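# The APiCS data records above are "wide": each row carries a series of
# Value<i>_true_false flags plus per-value frequencies, and only the columns flagged
# 'true' become Value objects. A compact sketch of that unpivoting step with invented
# column values (the real loop also tracks confidence and normalises missing
# frequencies to 100 for sociolinguistic features):
def collect_values(row, num_values=10):
    found = []
    for i in range(1, num_values):
        flag = (row.get('Value%s_true_false' % i) or '').strip().lower()
        if flag != 'true':
            continue
        freq = row.get('c_V%s_frequency_normalised' % i)
        found.append((i, float(freq) if freq else 100.0))
    return found

row = {'Value1_true_false': 'True', 'c_V1_frequency_normalised': '80',
       'Value3_true_false': 'True', 'c_V3_frequency_normalised': '20',
       'Value4_true_false': 'False'}
assert collect_values(row) == [(1, 80.0), (3, 20.0)]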
def main(args): data = Data(created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() #print icons DBSession.execute("delete from Language") DBSession.execute("delete from Unit") DBSession.execute("delete from featuredomain") DBSession.execute("delete from family") DBSession.execute("delete from source") DBSession.execute("delete from parameter") DBSession.execute("delete from feature") DBSession.execute("delete from domainelement") DBSession.execute("delete from valueset") DBSession.execute("delete from value") DBSession.execute("delete from lsivalue") DBSession.execute("delete from dataset") DBSession.execute("delete from contributor") DBSession.execute("delete from lsilanguage") DBSession.execute("delete from contribution") DBSession.execute("delete from designer") DBSession.flush() dtab = partial(_dtab, args.data_file()) #Languages #print args.data_file() #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')] #tabfns = ['nts_18.tab'] ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:] tabfns = os.listdir( '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data' )[1:] #print tabfns args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr' or ld['language_id'] == '---' or ld[ 'language_id'] == '': # to exclude languages which do not have an iso-code continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) ##print ld lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) #lgs["ygr"] = "Hua" for lgid, lgname in lgs.items(): data.add(models.lsiLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() #print "I am here" #print data['ntsLanguage'].values()[1].id load_families( data, ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()], [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values() if l.id != '---' and l.id != ''], isolates_icon='tcccccc') #print 'family' #print data['Family'].get('sino1245').jsondata #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): for i, info in enumerate([{ 'designer': 'shafqat', 'domain': '', 'pdflink': '', 'citation': '' }, { 'designer': '-', 'domain': '', 'pdflink': '', 'citation': '' }]): designer_id = str(i + 1) data.add(models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources '''for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush()''' #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [ (ld.get("feature_possible_values"), 
ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values") ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for _, dfsids in groupby(sorted( (f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids)) assert len(list(dfsids)) == 1 #print 'here is nlgs' for fid, f in fs: #print "lang name" #print ldps #print f.get('feature_possible_values', ""), if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get( "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])} ##vicons['?'] = 'c00ffff' ##vicons['N/A'] = 'c00ffff' ##vicons = icons.iconize(vi.keys()) for (v, desc) in vdesc.items(): #print v,vicons[v] data.add(common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": Colors[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn("Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() glottolog = Glottolog() for ld in ldps: ############################### for printing different map markers for different familys for features:shafqat #print data['Family'] language = data['lsiLanguage'][ld['language_id']] if isinstance(language, (tuple, list)) and len(language) == 2: code, language = language else: code = language.id if code != '-': gl_language = glottolog.languoid(code) if gl_language: gl_family = gl_language.family if gl_family: family = data['Family'].get(gl_family.id) ##ld['value'] = ld['value']+'-'+str(family) ##ld['value'] = combineValueFamily(ld['value'],str(family)) #print family ##################################### parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in 
data['DomainElement']: if not ld["value"].strip(): continue info = (ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info args.log.error( msg.format( sorted([ y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid'] ]))) ##print msg.format(sorted( ## [y for (x, y) in data['DomainElement'].keys() ## if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, ##contribution=parameter.designer ) #print #print "this one" #print ld['value'],family models.lsiValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={ "icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata, "family": FamilyCodes[str(family)] }, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) '''if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k])''' DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['lsiLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "lsi.cldf") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="LSI", name='Linguistic Survey of India', publisher_name="Sprakbanken", publisher_place="Gothenburg", publisher_url="to be given", description="this is to be followed", domain='http://lsi.clld.org', published=date(2016, 05, 16), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany' }) # disabled for experimental purposes, names were appearing multiple times for i, contributor in enumerate([ common.Contributor(id="Lars Borin", name="Lars Borin", email="*****@*****.**"), common.Contributor(id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**"), common.Contributor(id="Anju Saxena", name="Anju Saxena", email="*****@*****.**"), common.Contributor(id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") ]): #print i common.Editor(dataset=dataset, contributor=contributor, ord=i) '''cont1 = common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**") cont2= common.Contributor( id="Shafqat Mumtaz Virk", name="Shafqat Mumtaz Virk", email="*****@*****.**") cont3 = common.Contributor( id="Lars Borin", name="Lars Borin", email="*****@*****.**") for contributor in [cont1,cont2,cont3]: common.Editor(dataset=dataset, contributor=contributor,ord=1)''' DBSession.add(dataset) DBSession.flush()
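# The feature loop above turns the possible-values string (feature.vdoc) of the form
# "1==description||2==description" into a value->description dict, adds the '?'
# default (and 'N/A' for features that depend on another), and numbers the values by
# sorted key for the domain elements. A self-contained sketch of that parsing step:
def parse_value_doc(vdoc, depends_on=False):
    vdesc = {v.replace('.', '-'): desc
             for v, desc in (veq.split('==') for veq in vdoc.split('||'))}
    vdesc.setdefault('?', 'Not known')
    if depends_on and 'N/A' not in vdesc:
        vdesc['N/A'] = 'Not Applicable'
    numbers = {v: i for i, v in enumerate(sorted(vdesc))}
    return vdesc, numbers

vdesc, numbers = parse_value_doc('1==present||2==absent', depends_on=True)
assert vdesc == {'1': 'present', '2': 'absent', '?': 'Not known', 'N/A': 'Not Applicable'}
assert numbers == {'1': 0, '2': 1, '?': 2, 'N/A': 3}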
def create(args): args.log.info('starting migration ...') data = Data() db = create_engine('postgresql://robert@/glottolog2') with transaction.manager: sn = data.add(common.Contributor, 'sn', id='sn', name='Sebastian Nordhoff') hh = data.add(common.Contributor, 'hh', id='hh', name='Harald Hammarström') rf = data.add(common.Contributor, 'rf', id='rf', name='Robert Forkel', url="https://github.com/xrotwang") mh = data.add(common.Contributor, 'mh', id='mh', name='Martin Haspelmath') contrib = data.add(common.Contribution, 'c', id='classification', name='Classification') data.add(common.ContributionContributor, 'hh', contribution=contrib, contributor=hh) params = dict( fc=data.add(common.Parameter, 'fc', id='fc', name='Family classification'), sc=data.add(common.Parameter, 'sc', id='sc', name='Subclassification'), ) dataset = data.add( common.Dataset, 'd', id='glottolog', name='Glottolog 2.0', description='', published=datetime.date(2013, 8, 15), domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, ed in enumerate([sn, hh, rf, mh]): DBSession.add( common.Editor(dataset=dataset, contributor=ed, ord=i + 1)) valuesets = {} def create_languoid(row, father_pk=None): glottocode = { 'akun1242': 'akun1241' }.get(row['alnumcode'], row['alnumcode']) attrs = dict( pk=row['id'], id=glottocode, name=row['primaryname'], description=row['globalclassificationcomment'], level=getattr(models2.LanguoidLevel, row['level']), status=getattr(models2.LanguoidStatus, (row['status'] or '').replace(' ', '_'), None), father_pk=father_pk, created=row['updated'], jsondata={} if not row['hname'] else {'hname': row['hname']}, ) for attr in [ 'active', 'updated', 'hid', 'latitude', 'longitude', ]: attrs[attr] = row[attr] l = data.add(models2.Languoid, row['id'], **attrs) for type_ in params: id_ = '%s%s' % (type_, row['id']) vs = data.add(common.ValueSet, id_, id=id_, description=row['classificationcomment'] if type_ == 'fc' else row['subclassificationcomment'], language=l, parameter=params[type_], contribution=contrib) data.add(common.Value, id_, id=id_, name='%s - %s' % (row['level'], row['status']), valueset=vs) DBSession.flush() valuesets[id_] = vs.pk return str(row['id']) level = 0 parents = [ create_languoid(row) for row in db.execute( 'select * from languoidbase where father_id is null') ] while parents: args.log.info('level: %s' % level) level += 1 parents = [ create_languoid( row, father_pk=data['Languoid'][row['father_id']].pk) for row in db.execute( 'select * from languoidbase where father_id in (%s)' % ','.join(parents)) ] def handler(offset, batch): svalues = [] rvalues = [] for row in batch: jsondata = json.loads(row['jsondata'] or "{}") jsondata['bibtexkey'] = row['bibtexkey'] dicts = { 's': dict(pk=row['id'], polymorphic_type='base', id=str(row['id']), name='%(author)s %(year)s' % row, description=row['title'], bibtex_type=getattr(EntryType, row['type']), jsondata=jsondata), 'r': dict(pk=row['id']), } for model, map_ in { 's': { 'author': None, 'yearstring': 'year', 'year': 'year_int', 'startpage': 'startpage_int', 'numberofpages': 'pages_int', 'pages': None, 'edition': None, 'school': None, 'address': None, 'url': None, 'note': None, 'number': None, 'series': None, 'editor': None, 'booktitle': None, 'journal': None, 'volume': None, 'publisher': None, }, 'r': { 'endpage': 'endpage_int', 'inlg': None, 'inlg_code': None, 'subject': None, 
'subject_headings': None, 'keywords': None, 'normalizedauthorstring': None, 'normalizededitorstring': None, 'ozbib_id': None, } }.items(): for okey, nkey in map_.items(): dicts[model][nkey or okey] = row[okey] svalues.append(dicts['s']) rvalues.append(dicts['r']) DBSession.execute(common.Source.__table__.insert(), svalues) DBSession.execute(models2.Ref.__table__.insert(), rvalues) select(db, 'select * from refbase order by id', handler) DBSession.execute('COMMIT') for table, model, value, order in [ ('macroarea', models2.Macroarea, lambda i, row: dict(pk=row['id'], id=slug(row['name']), name=row['name'], description=row['description']), None), ('country', models2.Country, lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']), None), ('provider', models2.Provider, lambda i, row: dict(pk=row['id'], id=slug(row['name']), name=row['description'], description=row['comment'], abbr=row['abbr'], url=row['url'], refurl=row['refurl'], bibfield=row['bibfield']), None), ('doctype', models2.Doctype, lambda i, row: dict(pk=row['id'], id=slug(row['name']), abbr=row['abbr'], name=row['name'], description=row['description']), None), ('refprovider', models2.Refprovider, lambda i, row: dict( pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']), ('provider_id', 'refbase_id')), ('refdoctype', models2.Refdoctype, lambda i, row: dict( pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']), ('doctype_id', 'refbase_id')), ]: insert(db, table, model, value, order=order) names = dict( (int(d['id']), d['pk']) for d in insert(db, 'namebase', common.Identifier, lambda i, row: dict(pk=i, id=str(row['id']), name=row['namestring'], type='name', description=row['nameprovider'], lang=row['inlg'] if row['inlg'] and len(row['inlg']) <= 3 else 'en'), order='id')) codes = dict( (int(d['id']), d['pk']) for d in insert(db, 'codebase', common.Identifier, lambda i, row: dict(pk=i, id=str(row['id']), name=row['codestring'], type=common.IdentifierType.iso. value if row['codeprovider'] == 'ISO' else row['codeprovider']), start=len(names), order='id')) res = insert( db, 'nodecodes', common.LanguageIdentifier, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], identifier_pk=codes[row['codebase_id']])) insert(db, 'nodenames', common.LanguageIdentifier, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], identifier_pk=names[row['namebase_id']]), start=len(res)) for table, model, value in [ ('languoidmacroarea', models2.Languoidmacroarea, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], macroarea_pk=row['macroarea_id'])), ('languoidcountry', models2.Languoidcountry, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], country_pk=row['country_id'])), ('noderefs', common.LanguageSource, lambda i, row: dict(pk=i, language_pk=row['languoidbase_id'], source_pk=row['refbase_id'])), ('refmacroarea', models2.Refmacroarea, lambda i, row: dict( pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])), ('refcountry', models2.Refcountry, lambda i, row: dict( pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])), ('spuriousreplacements', models2.Superseded, lambda i, row: dict(pk=i, languoid_pk=row['languoidbase_id'], replacement_pk=row['replacement_id'], description=row['relation'])), ('justification', common.ValueSetReference, lambda i, row: dict( pk=i, valueset_pk=valuesets['%s%s' % ('fc' if row[ 'type'] == 'family' else 'sc', row['languoidbase_id'])], source_pk=row['refbase_id'], description=row['pages'])), ]: insert(db, table, model, value)
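# The glottolog migration above drives its bulk copying through two helpers,
# `select` and `insert`, which are not included in this excerpt. The sketches
# below are assumptions inferred from the call sites only: select(db, sql,
# handler) feeds source rows to handler(offset, batch) in chunks, and
# insert(db, table, model, value, start=0, order=None) bulk-inserts rows built
# by value(i, row) and returns those row dicts (callers read 'id' and 'pk' back
# from them). The real helpers may differ.
def select(db, sql, handler, batch_size=1000):
    # Sketch: stream rows from the legacy database and hand them over in batches.
    rows = list(db.execute(sql))
    offset = 0
    while rows[offset:offset + batch_size]:
        handler(offset, rows[offset:offset + batch_size])
        offset += batch_size

def insert(db, table, model, value, start=0, order=None):
    # Sketch: bulk-insert `model` rows built from every row of a legacy table.
    sql = 'select * from %s' % table
    if order:
        order = list(order) if isinstance(order, (list, tuple)) else [order]
        sql += ' order by %s' % ', '.join(order)
    values = [value(start + i, row) for i, row in enumerate(db.execute(sql))]
    DBSession.execute(model.__table__.insert(), values)
    return values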
def main(args): """ The case is we have to codings for two different dialects (called hua and yagaria) of the same iso "qgr", both of which we want to keep and keep separately. I had missed that when making NTS, rigging everything so that the iso would be the id, which is not sufficient. Glottocodes in Grambank would have taken care of it except the dialect division for yaga1260 is wrong, having yagaria as overarching and Hua under it (reality has it that Hua and Yagaria are two dialects of the same language, which has no name). So a solution with glottocodes would have to wait until we fix that or need another fix later. So I guess, for now, let's ignore qgr (and its datapoints) and I'll fix on my end later. """ data = Data( created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() dtab = partial(_dtab, args.data_file()) #Languages tabfns = ['%s' % fn.name for fn in args.data_file().glob('nts_*.tab')] args.log.info("Sheets found: %s" % tabfns) ldps = [] lgs = {} nfeatures = Counter() nlgs = Counter() for fn in tabfns: for ld in dtab(fn): if ld['language_id'] == 'qgr': continue if "feature_alphanumid" not in ld: args.log.info("NO FEATUREID %s %s" % (len(ld), ld)) if not ld["feature_alphanumid"].startswith("DRS") \ and ld["feature_alphanumid"].find(".") == -1: ldps.append(dp_dict(ld)) lgs[ld['language_id']] = unescape(ld['language_name']) if ld["value"] != "?": nfeatures.update([ld['language_id']]) nlgs.update([ld['feature_alphanumid']]) ldps = sorted(ldps, key=lambda d: d['feature_alphanumid']) lgs["ygr"] = "Hua" for lgid, lgname in lgs.items(): data.add( models.ntsLanguage, lgid, id=lgid, name=lgname, representation=nfeatures.get(lgid, 0)) DBSession.flush() load_families(data, [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['ntsLanguage'].values()], isolates_icon='tcccccc') #glottolog = Glottolog() #for lg in data['ntsLanguage'].values(): # print lg.id, NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id) # gl_language = glottolog.languoid(NOCODE_TO_GLOTTOCODE.get(lg.id, lg.id)) # if not gl_language.family: # family = data.add(Family, gl_language.id, id = gl_language.id, name = gl_language.name, description=common.Identifier(name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'}) # lg.family = family #Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, name=domain) DBSession.flush() #Designers for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")): designer_id = str(i + 1) data.add( models.Designer, info['designer'], id=designer_id, name=designer_id, domain=info["domain"], contributor=info['designer'], pdflink=info["pdflink"], citation=info["citation"]) DBSession.flush() #Sources for k, (typ, bibdata) in [ ktfbib(bibsource) for ld in ldps if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,") ]: if k not in data["Source"]: data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata))) DBSession.flush() #Features fs = [(fid, mergeds(lds)) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdesc = [(fid, [(ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds if ld.get("feature_possible_values")]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])] fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc] fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1] for (fid, vdescs) in fvmis: print fid, "DIFF VDESC" for (vd, fromf) 
in vdescs: print vd, set(fromf) for _, dfsids in groupby( sorted((f.get('feature_name', fid), fid) for fid, f in fs), key=lambda t: t[0]): assert len(list(dfsids)) == 1 for fid, f in fs: if not fid.isdigit(): args.log.info("NO INT FID %s" % f) feature = data.add( models.Feature, fid, id=fid, name=f.get('feature_name', f['feature_alphanumid']), doc=f.get('feature_information', ""), vdoc=f.get('feature_possible_values', ""), representation=nlgs.get(fid, 0), designer=data["Designer"][f['designer']], dependson=f.get("depends_on", ""), abbreviation=f.get("abbreviation", ""), featuredomain=data['FeatureDomain'][f["feature_domain"]], name_french=f.get('francais', ""), clarification=f.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", ""), alternative_id=f.get("old feature number", ""), jl_relevant_unit=f.get("relevant unit(s)", ""), jl_function=f.get("function", ""), jl_formal_means=f.get("formal means", ""), sortkey_str="", sortkey_int=int(fid)) vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")] vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist} vdesc.setdefault('?', 'Not known') if 'N/A' not in vdesc and feature.dependson: vdesc["N/A"] = "Not Applicable" vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))} vicons = icons.iconize(vi.keys()) for v, desc in vdesc.items(): data.add( common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": vicons[v]}, number=vi[v], parameter=feature) DBSession.flush() for ((f, lg), ixs) in grp2( [((ld['feature_alphanumid'], ld['language_id']), i) for i, ld in enumerate(ldps)]): ixvs = set([ldps[ix]['value'] for ix in ixs]) if len(ixvs) == 1: continue args.log.warn( "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile']) for ix in ixs])) print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs]) errors = {} done = set() for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if id_ in done: continue if (ld['feature_alphanumid'], ld['value']) not in data['DomainElement']: if not ld["value"].strip(): continue info = ( ld['feature_alphanumid'], ld.get('feature_name', "[Feature Name Lacking]"), ld['language_id'], ld['value'], ld['fromfile']) msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info args.log.error(msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']]))) print msg.format(sorted( [y for (x, y) in data['DomainElement'].keys() if x == ld['feature_alphanumid']])) errors[(ld['feature_alphanumid'], ld['language_id'])] = info continue vs = common.ValueSet( id=id_, language=language, parameter=parameter, source=ld["source"] or None, contribution=parameter.designer) models.ntsValue( id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata}, comment=ld["comment"], valueset=vs, contributed_datapoint=ld["contributor"]) done.add(id_) if not ld.get('bibsources'): if 'bibsources' not in ld: args.log.warn("no bibsource %s" % ld) continue for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]: common.ValueSetReference(valueset=vs, source=data['Source'][k]) DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = 
data['ntsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"]) #, ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value")] + cldf.keys()), "nts.cldf", encoding = "utf-8") #utf-16 "Comment", "Source", "Bibliographical Details" #cldf = {} #for ld in ldps: # parameter = data['Feature'][ld['feature_alphanumid']] # language = data['ntsLanguage'][ld['language_id']] # id_ = '%s-%s' % (parameter.id, language.id) # if not id_ in done: # continue # dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"], ld["source"], bibliographical_details(ld.get('bibsources', "").split(",,,")), ld.get("feature_information", ""), ld.get('feature_possible_values', ""), ld["designer"], ld.get("abbreviation", ""), ld["feature_domain"], ld.get('francais', ""), ld.get("dependencies", ""), ld.get("draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)", "")) # cldf[dt] = None #savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment", "Source", "Bibliographical Details", "Feature Information", "Feature Possible Values", "Feature Designer", "Feature Abbreviation", "Feature Domain", "Feature (French)", "Feature Dependencies", "Feature Clarifying Comments")] + cldf.keys()), "nts-with-metadata.tsv", encoding="utf-16") args.log.info('%s Errors' % len(errors)) dataset = common.Dataset( id="NTS", name='Nijmegen Typological Survey', publisher_name="Max Planck Institute for Psycholinguistics", publisher_place="Nijmegen", publisher_url="http://www.mpi.nl", description="""Dataset on Typological Features, collected 2013-2014 in the Language and Cognition Department at the Max Planck Institute for Psycholinguistics, Max-Planck Gesellschaft, and a European Research Council's Advanced Grant (269484 "INTERACT") to Stephen C. Levinson.""", domain='http://nts.clld.org', published=date(2014, 2, 20), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'}) for i, contributor in enumerate([ common.Contributor( id="Harald Hammarstrom", name="Harald Hammarstrom", email="*****@*****.**"), common.Contributor( id="Suzanne van der Meer", name="Suzanne van der Meer", email="*****@*****.**"), common.Contributor( id="Hedvig Skirgard", name="Hedvig Skirgard", email="*****@*****.**") ]): common.Editor(dataset=dataset, contributor=contributor, ord=i) DBSession.add(dataset)
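# Worked example of the `feature_possible_values` parsing used for the NTS
# features above; the value string is hypothetical, but its
# "value==description||value==description" shape is exactly what the
# split("||") / split("==") calls assume.
vdoc_example = u"1==Marked by a prefix||2==Marked by a suffix||0==No marking"
vdesclist_example = [veq.split("==") for veq in vdoc_example.split("||")]
vdesc_example = {v.replace(".", "-"): desc for [v, desc] in vdesclist_example}
assert vdesc_example == {
    u"1": u"Marked by a prefix", u"2": u"Marked by a suffix", u"0": u"No marking"}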
def main(args): citations.main(args) data = Data() pairs = {} languages = {} coords = {} for lang in dsv.rows( args.data_file('MB_Map_Data_Aug13WLabels'), namedtuples=True, newline='\n', encoding='latin1' ): coords[slug(lang.Label.split('<')[0].strip())] = ( float(lang.y), float(lang.x)) xls = xlrd.open_workbook(args.data_file('MB_BoCatSum_AFBO.xlsx')) matrix = xls.sheet_by_name('MB_BoCatSum_AFBO.txt') md = "area\trecipient language iso\trecipient language genus\tdonor language iso\tdonor language genus".split('\t') fields = [] params = [] for i in range(matrix.ncols): colname = xlrd.colname(i) if len(colname) == 2 and colname > 'BE': break colval = matrix.cell(0, i).value.strip() if (len(colname) == 1 and colname > 'G') or (len(colname) == 2 and colname < 'AY'): params.append(colval) fields.append(colval) else: fields.append(colval.lower()) for f in fields: if fields.count(f) > 1: print(f) assert len(fields) == len(set(fields)) for j in range(1, matrix.nrows): values = dict(zip(fields, [matrix.cell(j, i).value for i in range(matrix.ncols)])) try: id_ = int(values['perm.id']) except: continue pairs[id_] = values for type_ in ['recipient', 'donor']: languages[values[type_ + ' language'].strip()] = { 'macroarea': values['area']} for md in ['iso', 'genus']: languages[values[type_ + ' language'].strip()][md] \ = values['%s language %s' % (type_, md)] for name in COORDS: assert name in languages sources = {} with open(args.data_file('MB_Case_List_with_links.html')) as fp: worddoc = fp.read() for m in re.finditer('\"__(?P<recid>[^_]+)__\"', worddoc): sources[m.group('recid').decode('utf8')] = 1 soup = bs(worddoc) doc = {} cols = [] table = soup.find('table') for tr in table.children: if tr.name != 'tr': continue tds = filter(lambda n: n.name == 'td', tr.children) if not cols: cols = map(text, tds) else: values = dict(zip(cols, tds)) try: id_ = int(text(values['perm.id'])) doc[id_] = values if id_ in pairs: assert doc['Recipient lg.'] == pairs[id_][1]['recipient language'] assert doc['Don'] == pairs[id_][1]['donor language'] except: continue dataset = common.Dataset( id='afbo', name="AfBo: A world-wide survey of affix borrowing", contact="*****@*****.**", domain="afbo.info", license='http://creativecommons.org/licenses/by/3.0/', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) for i, spec in enumerate([('seifart', "Frank Seifart")]): DBSession.add(common.Editor( dataset=dataset, ord=i + 1, contributor=common.Contributor(id=spec[0], name=spec[1]))) contrib = data.add(common.Contribution, 'afbo', name="AfBo", id="afbo") iso_map = { ('ron', 'Meglenite Romanian'): ('ruq', None), ('fra', 'Norman French'): ('xno', None), ('tur', 'Turkic'): (None, 'turk1311'), ('xuu', 'Kxoe languages'): (None, 'khoe1241'), ('zoc', 'Zoquean languages'): (None, 'zoqu1261'), ('tzm', 'Moroccan Berber languages'): (None, 'atla1275'), ('cvn', 'Quechua'): ('qvn', None), ('rop', 'Gurindji Kriol'): (None, 'guri1249'), ('ita', 'Sicilian Italian'): ('scn', None), ('srp', 'Croatian'): ('hrv', None), ('eme', 'Wayampi‑Emerillon‑Zo’é'): (None, 'waya1271'), ('ale', 'Copper Island Aleut'): ('mud', None), ('car', 'intermediate Proto‑Carib'): (None, 'cari1283'), ('ell', 'Cappadocian Greek'): ('cpg', None), ('eng', 'Middle English'): ('enm', None), ('als', 'Arvanitic Albanian'): ('aat', None), ('nys', 'Northern Nyungic'): (None, 'dese1234'), ('ron', 'Istro‑Romanian'): ('ruo', None), ('chf', 'Cho’ol'): ('ctu', None), ('tuo', 'Eastern Tucanoan languages'): 
(None, 'east2698'), ('ceb', 'Visayan'): (None, 'bisa1268'), ('por', 'Sri Lanka Portuguese'): (None, 'mala1544'), ('brx', 'Tibeto-Burman languages'): (None, 'brah1260'), } with open('name_conflicts.tab', 'w') as fp: fp.write('iso\tafbo\tglottolog\tproposed iso\n') for i, name in enumerate(languages.keys()): md = languages[name] iso = md.pop('iso') if iso == 'cvn' and name == 'Quechua': iso = 'qvn' kw = dict(name=name, id=str(i+1), jsondata=md) if name in COORDS: kw['latitude'], kw['longitude'] = COORDS[name] elif slug(name) in coords: kw['latitude'], kw['longitude'] = coords[slug(name)] elif glottocoords.get(iso): kw['latitude'], kw['longitude'] = glottocoords[iso] if glottonames.get(iso) and slug(glottonames.get(iso)) != slug(name): fp.write(('%s\t%s\t%s\t%s\n' % ( iso, name, glottonames.get(iso), rglottonames.get(slug(name), ''))).encode('utf8')) if name == 'Meglenite Romanian': kw['name'] = 'Megleno Romanian' if not 'latitude' in kw: print(name) l = data.add(common.Language, name, **kw) iso, gc = iso_map.get((iso, name), (iso, None)) for code, type_ in [ (iso, common.IdentifierType.iso), (gc or glottocodes.get(iso), common.IdentifierType.glottolog) ]: if code: identifier = data.add( common.Identifier, code, id=code, name=code, type=type_.value) data.add( common.LanguageIdentifier, '%s-%s' % (code, l.id), identifier=identifier, language=l) include = sources.keys() + [ 'myersscottoncontact2002', 'myersscottonlanguage2007', 'meakinsborrowing2011', 'seifartprinciple2012', ] refdb = bibtex.Database.from_file(args.data_file('FSeifartZoteroLibrary14Nov2013.bib')) for rec in refdb: if slug(rec.id) in include: data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec)) for i, name in enumerate(params): data.add(models.AffixFunction, name, id=str(i + 1), name=name) for id_, vd in pairs.items(): assert id_ in doc donor = data['Language'][vd['donor language'].strip()] recipient = data['Language'][vd['recipient language'].strip()] p = data.add( models.Pair, id_, id=str(id_), name=vd['pairs'].replace('Meglenite', 'Megleno'), area=recipient.jsondata['macroarea'], description=unicode(doc[id_]['comment']).replace('<h1', '<p').replace('</h1>', '</p>').replace('Meglenite', 'Megleno'), reliability=vd['reliability'], int_reliability=['high', 'mid', 'low'].index(vd['reliability']), count_interrel=int(vd[u'number of interrelated affixes']), count_borrowed=int(vd['number of borrowed affixes']), donor=donor, recipient=recipient) DBSession.flush() for i, param in enumerate(params): param_id = i + 1 value = vd[param] if value != '': vsid = '%s-%s' % (recipient.id, param_id) if vsid in data['ValueSet']: vs = data['ValueSet'][vsid] else: vs = data.add( common.ValueSet, vsid, id=vsid, parameter=data['AffixFunction'][param], language=recipient, contribution=contrib) data.add( models.waabValue, '%s-%s' % (id_, param_id), id='%s-%s' % (id_, param_id), pair=p, name='%s' % int(value), numeric=int(value), description='%s' % p, valueset=vs)
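# The HTML-table scraping in the AfBo loader above applies a `text` helper to
# BeautifulSoup <td> nodes; that helper is not shown in this excerpt. A minimal
# sketch, assuming it only collapses a node's text to a single whitespace-
# normalized string; the real helper may do more.
def text(node):
    # Sketch: whitespace-normalized text content of a BeautifulSoup node.
    return ' '.join(node.get_text().split())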
def main(args): data = Data() glottocodes, bibtex_keys = {}, defaultdict(set) for d in reader( args.data_file('repos', 'mappings', 'InventoryID-ISO-gcode-Bibkey-Source.tsv')): glottocodes[d['InventoryID']] = d['Glottocode'] bibtex_keys[d['InventoryID']].add(d['BibtexKey']) glottolog = Glottolog( Path(phoible.__file__).parent.parent.parent.parent.joinpath( 'glottolog3', 'glottolog')) languoids = {l.id: l for l in glottolog.languoids()} phonemes = sorted(list( reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))), key=lambda r: (r['InventoryID'], r['GlyphID'])) inventories = defaultdict(set) for p in phonemes: if p['InventoryID'] in glottocodes: inventories[(languoids[glottocodes[p['InventoryID']]].name, p['SpecificDialect'], p['Source'].upper())].add( (p['InventoryID'], p['LanguageName'])) inventory_names = {} for (glname, dname, source), invids in inventories.items(): if len(invids) == 1: invid, lname = invids.pop() inventory_names[invid] = name_in_source(glname, dname) + ' [%s]' % source else: use_lname = len(set(r[1] for r in invids)) == len(invids) for i, (invid, lname) in enumerate(sorted(invids, key=lambda j: int(j[0]))): disambiguation = ' %s' % (i + 1, ) if use_lname: disambiguation = ' (%s)' % lname inventory_names[invid] = name_in_source( glname, dname) + '%s [%s]' % (disambiguation, source) for (invid, lname, dname, source), ps in groupby( phonemes, lambda p: (p['InventoryID'], p['LanguageName'], p[ 'SpecificDialect'], p['Source'])): if invid not in glottocodes: continue ps = list(ps) gc = glottocodes[invid] lang = data['Variety'].get(gc) if not lang: languoid = languoids[gc] lang = data.add( models.Variety, gc, id=gc, language_code=ps[0]['LanguageCode'], name=languoid.name, level=text_type(languoid.level.name), latitude=languoid.latitude, longitude=languoid.longitude, ) if lang.latitude is None and languoid.level == Level.dialect: ll = get_language(languoid) lang.latitude = ll.latitude lang.longitude = ll.longitude contrib = data.add( models.Inventory, invid, id=invid, #language=lang, source=source, #source_url=source_urls.get(row.InventoryID), #internetarchive_url=ia_urls.get(row.InventoryID), name=inventory_names[invid], description=name_in_source(lname, dname)) return # FIXME: read from mappings file! refs = defaultdict(list) for row in get_rows(args, 'BibtexKey'): if row[1] == 'NO SOURCE GIVEN': refs[row[0]] = [] else: refs[row[0]].append(row[1]) add_sources(args, data) dataset = data.add( common.Dataset, 'phoible', id='phoible', name='PHOIBLE Online', description='PHOIBLE Online', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://www.shh.mpg.de", domain='phoible.org', license='http://creativecommons.org/licenses/by-sa/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png', 'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License' }) for i, spec in enumerate([ ('moran', "Steven Moran"), ('mccloy', "Daniel McCloy"), ('wright', "Richard Wright"), ]): DBSession.add( common.Editor(dataset=dataset, ord=i + 1, contributor=common.Contributor(id=spec[0], name=spec[1]))) #squibs = defaultdict(list) #for row in get_rows(args, 'Squib'): # squibs[row[0]].append(row[1]) source_urls = dict(get_rows(args, 'URL')) ia_urls = dict(get_rows(args, 'InternetArchive')) # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!) 
aggregated = list( reader(args.data_file('phoible-aggregated.tsv'), delimiter='\t', namedtuples=True)) inventory_names = {} for key, items in groupby(sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)), key=lambda t: (t.LanguageCode, t.Source)): items = list(items) lname = lnames.get(key[0]) if not lname: lname = items[0].LanguageName lnames[key[0]] = lname if len(items) == 1: inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1]) else: for i, item in enumerate(items): inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1]) # pull in Glottolog families instead? or in addition? family_map = { ("Arawakan", "arwk"): "Arawakan", ("Trans-New Guinea", "trng"): "Trans-New Guinea", ("Moklen", "anes"): "Austronesian", ("Oko", "ncon"): "Niger-Congo", ("Muniche", "saso"): "Muniche", ("Tinigua", "saso"): "Tinigua", ("Vilela", "luvi"): "Vilela", ("Ofayé", "macg"): "Kamakanan", ("Purian", "macg"): "PurianPrint", ("Mixed language", "saml"): "Mixed language", ("Tupian", "tupi"): "Tupian", ("Yuwana", "saun"): "YuwanaPrint", } family_code_map = {k[1]: v for k, v in family_map.items()} for row in aggregated: lang = data['Variety'].get(row.LanguageCode) if not lang: if row.LanguageFamilyGenus == 'UNCLASSIFIED': genus = None else: genus_id = slug(strip_quotes(row.LanguageFamilyGenus)) genus = genera.get(genus_id) if not genus: genus = genera.get(row.LanguageCode) if not genus: #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot) family = family_map.get( (row.LanguageFamilyGenus, row.LanguageFamilyRoot)) genus = genera[genus_id] = data.add( models.Genus, genus_id, id=genus_id, name=row.LanguageFamilyGenus, description=family or row.LanguageFamilyRoot, active=False, root=row.LanguageFamilyRoot) if not genus.root: genus.root = row.LanguageFamilyRoot if genus.description in family_code_map: genus.description = family_code_map[genus.description] if row.LanguageCode in geocoords: coords = geocoords[row.LanguageCode] elif row.Latitude != 'NULL' and row.Longitude != 'NULL': coords = (float(row.Latitude), float(row.Longitude)) lang = data.add(models.Variety, row.LanguageCode, id=row.LanguageCode, name=lnames[row.LanguageCode], genus=genus, country=strip_quotes(row.Country), area=strip_quotes(row.Area), latitude=coords[0], longitude=coords[1], jsondata=dict(inventory_id=row.InventoryID)) add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes) contributor = data['Contributor'].get(row.Source) if not contributor: contributor = data.add(common.Contributor, row.Source, id=row.Source, name=SOURCES[row.Source][0], description=SOURCES[row.Source][2]) for ref in SOURCES[row.Source][1]: DBSession.add( models.ContributorReference(source=data['Source'][ref], contributor=contributor)) contrib = data.add(models.Inventory, row.InventoryID, id=row.InventoryID, language=lang, source=row.Source, source_url=source_urls.get(row.InventoryID), internetarchive_url=ia_urls.get(row.InventoryID), name=inventory_names[row.InventoryID], description=row.LanguageName) DBSession.add( common.ContributionContributor(contribution=contrib, contributor=contributor)) #for j, squib in enumerate(squibs.get(row.InventoryID, [])): # f = common.Contribution_files( # object=contrib, # id='squib-%s-%s.pdf' % (contrib.id, j + 1), # name='Phonological squib', # description=squib, # mime_type='application/pdf') # assert f # # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read()) DBSession.flush() unknown_refs = {} for row in reader(args.data_file('phoible-phonemes.tsv'), 
namedtuples=True): inventory = data['Inventory'][row.InventoryID] segment = data['Segment'].get(row.Phoneme) if not segment: unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme] description = ' - '.join([t[1] for t in unicode_desc]) segment = data.add( models.Segment, row.Phoneme, id=b16encode(md5(description).digest()), name=row.Phoneme, description=description, equivalence_class=''.join([ t[0] for t in unicode_desc if t[1].split()[0] not in ['COMBINING', 'MODIFIER'] ]), segment_class=row.Class, combined_class=row.CombinedClass) DBSession.flush() vs = common.ValueSet(id=row.PhonemeID, contribution=inventory, language=inventory.language, parameter=segment) for ref in refs.get(row.InventoryID, []): if ref not in data['Source']: if ref not in unknown_refs: print('-------', ref) unknown_refs[ref] = 1 continue DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( common.Value( id=row.PhonemeID, name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name), valueset=vs)) DBSession.flush() for inventory_id in refs: for ref in refs[inventory_id]: if ref not in data['Source']: continue data.add(common.ContributionReference, '%s-%s' % (inventory_id, ref), source=data['Source'][ref], contribution=data['Inventory'][inventory_id]) for i, row in enumerate( reader(args.data_file('phoible-segments-features.tsv'))): if i == 0: features = list(map(feature_name, row)) continue if row[0] not in data['Segment']: # print('skipping feature vector:', row) continue for j, value in enumerate(row): if j and value != '0': DBSession.add( common.Parameter_data( key=features[j], value=value, ord=j, object_pk=data['Segment'][row[0]].pk)) # FIXME: add allophones! DBSession.flush()
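# Worked example of how the PHOIBLE loader above derives a segment's id,
# description and equivalence class from Unicode character names. The phoneme
# string is just an illustration; .encode('utf-8') is added so the snippet also
# runs where md5 requires bytes.
import unicodedata
from base64 import b16encode
from hashlib import md5

phoneme = u'k\u02b7'  # 'kʷ'
unicode_desc = [(c, unicodedata.name(c)) for c in phoneme]
description = ' - '.join([t[1] for t in unicode_desc])
# description == 'LATIN SMALL LETTER K - MODIFIER LETTER SMALL W'
equivalence_class = ''.join(
    [t[0] for t in unicode_desc if t[1].split()[0] not in ['COMBINING', 'MODIFIER']])
# equivalence_class == u'k', i.e. the MODIFIER letter is stripped
segment_id = b16encode(md5(description.encode('utf-8')).digest())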
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))
    ext = [
        Record.from_string('@' + s, lowercase=True) for s in nfilter(BIB.split('@'))]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={'readme': contrib['Readme'], 'contents': contrib['Contents']},
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src], contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name': 'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(
        data,
        [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8],
        glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)
    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']])
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
for model, repls in load( Path(phoible.__file__).parent.parent / 'replacements.json').items(): if model == 'Language': languoids = {l.id: l for l in glottolog.languoids()} iso_languoids = {l.iso: l for l in languoids.values() if l.iso} gl_in_phoible = set(data['Variety'].keys()) for oid, nid in repls.items(): gls = descendants_from_nodemap( iso_languoids.get(oid), languoids).intersection(gl_in_phoible) if gls: nid = gls.pop() if len(gls) > 1: print('+++', oid, gls) else: print('---', oid) common.Config.add_replacement(oid, nid, common.Language) elif model == 'Parameter': segments_in_phoible = set(data['Segment'].keys()) for oid, nid in repls.items(): id_ = nid if nid in segments_in_phoible else None common.Config.add_replacement(oid, id_, common.Parameter) for segment in ds['ParameterTable']: for i, (k, v) in enumerate(sorted(segment.items())): if k not in ['ID', 'Name', 'Description', 'SegmentClass']: DBSession.add( common.Parameter_data( key=feature_name(k), value=v, ord=i, object_pk=data['Segment'][segment['ID']].pk)) for inventory in ds['contributions.csv']: inv = data.add( models.Inventory, inventory['ID'], id=inventory['ID'], name='{0} ({1} {2})'.format( inventory['Name'], inventory['Contributor_ID'].upper(), inventory['ID'], ), source_url=inventory['URL'], count_tone=inventory['count_tones'], count_vowel=inventory['count_vowels'], count_consonant=inventory['count_consonants'], ) DBSession.add( common.ContributionContributor( contribution=inv, contributor=data['Contributor'][ inventory['Contributor_ID'].upper()])) for src in inventory['Source']: DBSession.add( common.ContributionReference(contribution=inv, source=data['Source'][src])) for phoneme in ds['ValueTable']: lang = data['Variety'][phoneme['Language_ID']] inv = data['Inventory'][phoneme['Contribution_ID']] if not inv.language: inv.language = lang vs = common.ValueSet( id=phoneme['ID'], contribution=inv, language=lang, parameter=data['Segment'][phoneme['Parameter_ID']]) for ref in phoneme['Source']: DBSession.add( common.ValueSetReference(source=data['Source'][ref], valueset=vs)) DBSession.add( models.Phoneme( id=phoneme['ID'], name='%s %s' % (phoneme['Value'], data['Inventory'][phoneme['Contribution_ID']].name), allophones=' '.join(phoneme['Allophones']), marginal=phoneme['Marginal'], valueset=vs)) return
def main(args): fts.index('fts_index', Word.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") if DBSession.bind.dialect.name == 'postgresql': Index('ducet', collkey(common.Unit.name)).create(DBSession.bind) data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2017, 3, 30), contact='*****@*****.**', domain='dictionaria.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) for i, (id_, name) in enumerate([ ('haspelmathmartin', 'Martin Haspelmath'), ('stiebelsbarbara', 'Barbara Stiebels') ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} print('loading concepts ...') glosses = set() concepticon = Concepticon( REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data')) if not args.no_concepts: for conceptset in concepticon.conceptsets.values(): if conceptset.gloss in glosses: continue glosses.add(conceptset.gloss) cm = data.add( ComparisonMeaning, conceptset.id, id=conceptset.id, name=conceptset.gloss.lower(), description=conceptset.definition, concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id) comparison_meanings[cm.id] = cm DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} submissions = [] for submission in REPOS.joinpath( 'submissions-internal' if args.internal else 'submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md if md is None: print('no md', submission.id) continue if not md['date_published']: print('no date', submission.id) continue id_ = submission.id if args.dict and args.dict != id_ and args.dict != 'all': print('not selected', submission.id) continue lmd = md['language'] props = md.get('properties', {}) props.setdefault('custom_fields', []) props['metalanguage_styles'] = {} for v, s in zip(props.get('metalanguages', {}).values(), ['success', 'info', 'warning', 'important']): props['metalanguage_styles'][v] = s props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f for f in props['custom_fields']] props.setdefault('choices', {}) language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) md['date_published'] = md['date_published'] or date.today().isoformat() if '-' not in md['date_published']: md['date_published'] = md['date_published'] + '-01-01' dictionary = data.add( Dictionary, id_, id=id_, number=md.get('number'), name=props.get('title', lmd['name'] + ' dictionary'), description=submission.description, language=language, published=date(*map(int, md['date_published'].split('-'))), doi=md.get('doi'), jsondata=props) for i, spec in enumerate(md['authors']): if not isinstance(spec, dict): cname, address = spec, None spec = {} else: cname, address = spec['name'], spec.get('affiliation') name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = 
data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))
        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')
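# Worked example of the contributor-id convention used in the dictionaria
# loader above: nameparser's HumanName splits the display name, and slug joins
# last + first name into the id. 'Martin Haspelmath' is reused from the editor
# list at the top of this loader.
from nameparser import HumanName

example_name = HumanName('Martin Haspelmath')
# example_name.first == 'Martin', example_name.last == 'Haspelmath'
example_cid = slug('%s%s' % (example_name.last, example_name.first))
# example_cid == 'haspelmathmartin', matching the editor id created above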
def main(args): data = Data() data.add( common.Contributor, 'barthwolfgang', id='barthwolfgang', name="Wolfgang Barth", url="http://www.dynamicsoflanguage.edu.au/") dataset = common.Dataset( id='parabank', name='Parabank', description='Database of kinship terminology', domain='parabank.clld.org', publisher_name="CoEDL Centre of Excellence for the Dynamics of Language", publisher_place="Canberra, Australia", publisher_url="http://www.dynamicsoflanguage.edu.au/", license='http://creativecommons.org/licenses/by/4.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by.png', 'license_name': 'Creative Commons Attribution 4.0'}) DBSession.add(dataset) for i, editor in enumerate(['barthwolfgang']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) contrib = common.Contribution(id='contrib', name='the contribution') for langu in reader(DATA_DIR.joinpath('data_basics', 'all_languages.txt'), delimiter=';', dicts=True): #print(langu) data.add(common.Language, langu['glotto'], id=langu['glotto'], name=langu['language name'], latitude=float(langu['latitude']), longitude=float(langu['longitude']), description=langu['comment'],) lang_dict = defaultdict(dict) for fname in DATA_DIR.joinpath('data_open_office').glob('*.txt'): for item in reader(fname, delimiter=';', dicts=True, encoding='utf-8-sig'): if item['parameter'] in lang_dict[item['glottocode']]: if lang_dict[item['glottocode']][item['parameter']] != item['word']: print(fname, item['glottocode'], item['parameter']) #print(lang_dict[item['glottocode']][item['parameter']], item['word']) continue continue lang_dict[item['glottocode']][item['parameter']] = item['word'] #print(fname) #print(fname, item['glottocode'], item['parameter']) lang = data['Language'][item['glottocode']] param = data['Parameter'].get(item['parameter']) if not param: param = data.add( common.Parameter, item['parameter'], id=item['parameter'], name=item['parameter'], description=item['description']) id_ = item['parameter'] + "-" + item['glottocode'] vs = models.ParabankValueSet( id=id_, language=lang, parameter=param, contribution=contrib) DBSession.add(models.Word( id=id_, name=item['word'], ipa=item['ipa'], alternative=item.get('alternative'), comment=item.get('comment'), valueset=vs)) for i, (name, desc, params) in enumerate([ ("patrichild", "male speaker's children and children of brothers", "mS mD meBS meBD myBS myBD " "feBS feBD fyBS fyBD"), ("matrichild", "female speaker's children and children of sisters", "fS fD feZS feZD fyZS fyZD " "meZS meZD myZS myZD"), ("grandfathers", "all grandparents have the same address term", "mFF mMF fFF fMF"), ("sisters", "all sisters have the same address term", "meZ myZ feZ fyZ"), ("brothers", "all brothers have the same address term", "meB myB feB fyB"), ("father-in-law", "all fathers-in-law have the same address term", "fHF mWF"), ]): params = params.split() syncretism = models.Syncretism( id='%s' % (i + 1,), name=name, description=desc, notation='(%s)' % ', '.join(params)) for lang, words in lang_dict.items(): if has_syncretism(words, *params): syncretism.languages.append(data['Language'][lang]) for i, (name, desc, partition) in enumerate([ [ "sons vs. 
daughters", "Children are in two groups depending on their gender", "(mS, fS) (mD, fD)"], [ "parents by gender and one term for all siblings of parents", "simple", "(mF, fF) " "(mM, fM) " "(mFeB, mFyB, mFeZ, mFyZ, mMeB, mMyB, mMeZ, mMyZ, mFeB, fFyB, fFeZ, fFyZ, fMeB, fMyB, fMeZ, fMyZ)"], [ "Hawaiian Kinship System", "Differences are distinguished by generation and by gender", "(meB, myB, mFBS, mFZS, mMBS, mMZS, feZ, fyZ, fFBD, fFZD, fMBD, fMZD) " "(meZ, myZ, mFBD, mFZD, mMBD, mMZD) (feB, fyB, fFBS, fFZS, fMBS, fMZS) " "(mF, fF, mFeB, mFyB, mMeB, mMyB, fFeB, fFyB, fMeB, fMyB) " "(mM, fM, mFeZ, mFyZ, mMeZ, mMyZ, fFeZ, fFyZ, fMeZ, fMyZ)"], [ "#3 siblings: gender division", "Siblings are in two groups depending on the gender", "(meZ, myZ, feZ, fyZ) (meB, myB, feB, fyB)"], [ "#5 siblings: age division", "Siblings are in two groups depending on the relative age to the speaker", "(meB, meZ, feB, feZ) (myB, myZ, fyB, fyZ)"], [ "#4 siblings: male speaker / female speaker", "Siblings are in two groups depending on the gender of speaker", "(meB, myB, meZ, myZ) (feB, fyB, feZ, fyZ)"], [ "#2 siblings: gender division plus meB and fyZ", "Siblings are in two groups: male/female, where male elder brother and female younger sister have own term", "(meB) (myB, feB, fyB) (meZ, myZ, feZ) (fyZ)"], [ "#1 siblings: elder brother / younger brother / elder sister / younger sister", "Siblings are in four groups distinguished by age and gender", "(meB, feB) (myB, fyB) (meZ, feZ) (myZ, fyZ)"], [ "#6 siblings: four groups: gender of speaker, gender of sibling", "Siblings are in four groups distinguished by gender of speaker and gender of sibling", "(meB, myB) (feB, fyB) (meZ, myZ) (feZ, fyZ)"], [ "#7 siblings: eight terms", "Each sibling is addressed differently", "(meB) (myB) (feB) (fyB) (meZ) (myZ) (feZ) (fyZ)"], [ "#8 siblings: gender of sibling plus distinction of male/female speaker for brothers", "distinction by gender and brother distinction by gender of speaker", "(meB, myB) (feB, fyB) (meZ, myZ, feZ, fyZ)"], [ "#9 siblings: gender of sibling plus distinction of male/female speaker for sisters", "distinction by gender and sister distinction by gender of speaker", "(meB, myB, feB, fyB) (meZ, myZ) (feZ, fyZ)"], [ "#10 siblings: gender distinction plus same gender by age", "distinction by gender and same gender siblings are distinguished by age", "(meB) (myB, fyZ) (feB, fyB) (meZ, myZ) (feZ)"], [ "#11 siblings: age distinction plus older siblings by gender", "distinction by gender and same gender siblings are distinguished by age", "(meB, feB) (myB, fyB, myZ, fyZ) (meZ, feZ)"], [ "#12 siblings: gender division plus age division for brothers", "gender division plus age division for brothers", "(meB, feB) (myB, fyB) (myZ, fyZ, meZ, feZ)"], [ "#13 siblings: age division plus sex of speaker division for older siblings", "gender division plus age division for brothers", "(meB, meB) (feB, feZ) (myB, myZ, fyB, fyZ)"], [ "#14 siblings: cross gender distinction plus age distinction in same gender", "cross gender plus age in same gender", "(meB, feZ) (myB, fyZ) (feB, fyB, meZ, myZ)"], [ "#15 siblings: one term for same gender sibling, cross gender divided in male and female", "one term for same gender sibling cross gender divided in male and female", "(meB, myB, feZ, fyZ) (feB, fyB) (meZ, myZ)"], [ "#16 siblings: one term for cross gender sibling same gender divided in male and female", "one term for cross gender sibling same gender divided in male and female", "(meB, myB) (feZ, fyZ) (feB, fyB, meZ, myZ)"], [ "#17 
siblings", "complex", "(meB) (myB, fyB) (feZ) (fyZ) (feB) (meZ) (myZ)"], [ "#18 siblings", "complex", "(meB) (myB, fyB, fyZ) (feZ) (feB) (meZ) (myZ)"], [ "#19 siblings", "complex", "(meB) (myB, fyB, feB) (feZ, fyZ) (meZ, myZ)"], [ "#20 siblings", "complex", "(meB, feB) (myB, fyB, myZ, fyZ) (meZ) (feZ)"], [ "#21 siblings: age and gender distinction plus younger sister by sex of speaker", "age and gender distinction plus younger sister by sex of speaker", "(meB, feB) (myB, fyB) (meZ, feZ) (myZ) (fyZ)"], [ "#22 siblings", "complex", "(meB, feB) (myB) (fyB) (meZ, feZ) (myZ) (fyZ)"], [ "#23 siblings", "complex", "(meB) (feB) (myB) (fyB) (meZ) (feZ) (myZ, fyZ)"], [ "#24 siblings: brothers and age distinction in sisters", "brothers and age distinction in sisters", "(meB, feB, myB, fyB) (meZ, feZ) (myZ, fyZ)"], [ "#25 siblings: one term for all", "one term for all siblings", "(meB, feB, myB, fyB, meZ, feZ, myZ, fyZ)"], [ "#26 siblings: cross vs. same sex sibling", "two terms: one for cross one for same sex sibling", "(meB, myB, feZ, fyZ) (meZ, myZ, feB, fyB)"], [ "#27 siblings: age distinction plus gender in older siblings plus speaker distinction for older brother", "age and gender plus speaker distinction for older brother", "(meB) (myB, fyB, myZ, fyZ) (feB) (meZ, feZ)"], ]): pattern = models.Pattern( id='%s' % (i + 1,), name=name, description=desc, notation=partition) param_groups = [ re.split(',\s*', group) for group in re.split('\s*\)\s*\(\s*', partition.strip()[1:-1])] for lang, words in lang_dict.items(): if has_pattern(words, *param_groups): pattern.languages.append(data['Language'][lang]) for i, (name, desc, params) in enumerate([ [ "parents, aunts & uncles", "all kinship terms for father, mother and their siblings", ["mF", "mM", "fF", "fM", "mFeB", "mFyB", "mFeZ", "mFyZ", "mMeB", "mMyB", "mMeZ", "mMyZ", "fFeB", "fFyB", "fFeZ", "fFyZ", "fMeB", "fMyB", "fMeZ", "fMyZ"]], [ "siblings", "all brothers and sisters", ["meB", "myB", "meZ", "myZ", "feB", "fyB", "feZ", "fyZ"]], [ "cousins - no age distinction", "all children of parent's siblings", ["mFBS", "mFBD", "mFZS", "mFZD", "mMBS", "mMBD", "mMZS", "mMZD", "fFBS", "fFBD", "fFZS", "fFZD", "fMBS", "fMBD", "fMZS", "fMZD"]], [ "cousins - age distinction depends on relative age between ego and cousin", "all children of parent's siblings by relative age between ego and cousin", ["mFBeS", "mFBeD", "mFZeS", "mFZeD", "mMBeS", "mMBeD", "mMZeS", "mMZeD", "mFByS", "mFByD", "mFZyS", "mFZyD", "mMByS", "mMByD", "mMZyS", "mMZyD", "fFBeS", "fFBeD", "fFZeS", "fFZeD", "fMBeS", "fMBeD", "fMZeS", "fMZeD", "fFByS", "fFByD", "fFZyS", "fFZyD", "fMByS", "fMByD", "fMZyS", "fMZyD"]], [ "cousins - age distinction depends on relative age between parent of ego and parent of cousin", "all children of parent's siblings by relative age of parents", ["mFeBS", "mFeBD", "mFeZS", "mFeZD", "mMeBS", "mMeBD", "mMeZS", "mMeZD", "mFyBS", "mFyBD", "mFyZS", "mFyZD", "mMyBS", "mMyBD", "mMyZS", "mMyZD", "fFeBS", "fFeBD", "fFeZS", "fFeZD", "fMeBS", "fMeBD", "fMeZS", "fMeZD", "fFyBS", "fFyBD", "fFyZS", "fFyZD", "fMyBS", "fMyBD", "fMyZS", "fMyZD"]], [ "grandparents & grandchildren", "all direct ancestors of the grandparent generation & all direct " "descendants of the grandchildren generation", ["mFF", "mFM", "mMF", "mMM", "fFF", "fFM", "fMF", "fMM", "mSS", "mSD", "mDS", "mDD", "fSS", "fSD", "fDS", "fDD"]], [ "sons & daughters, nieces & nephews", "all children of ego and ego's siblings", ["mS", "mD", "fS", "fD", "meBS", "meBD", "myBS", "myBD", "meZS", "meZD", "myZS", "myZD", "feBS", "feBD", 
"fyBS", "fyBD", "feZS", "feZD", "fyZS", "fyZD"]], [ "In-laws & affines", "relatives through marriage", ["mW", "fH", "fHF", "fHM", "mWF", "mWM", "mSW", "mDH", "fSW", "fDH"]], ]): paradigm = models.Paradigm(id='%s' % (i + 1,), name=name, description=desc) for param in params: paradigm.parameters.append(data['Parameter'][param])
def main(args): data = Data() data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs) dataset = common.Dataset( id=concepticon.__name__, name="Concepticon 1.0", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='concepticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) DBSession.add(dataset) for i, name in enumerate( ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) english = data.add(common.Language, 'eng', id='eng', name='English') files = {} for fname in data_path('sources').iterdir(): files[fname.stem] = \ "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name for rec in Database.from_file(data_path('references', 'references.bib'), lowercase=True): source = data.add(common.Source, rec.id, _obj=bibtex2source(rec)) if rec.id in files: DBSession.flush() DBSession.add( common.Source_files(mime_type='application/pdf', object_pk=source.pk, jsondata=dict(url=files[rec.id]))) for concept in reader(data_path('concepticon.tsv'), namedtuples=True): data.add(models.ConceptSet, concept.ID, id=concept.ID, name=concept.GLOSS, description=concept.DEFINITION, semanticfield=concept.SEMANTICFIELD, ontological_category=concept.ONTOLOGICAL_CATEGORY) for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True): DBSession.add( models.Relation(source=data['ConceptSet'][rel.SOURCE], target=data['ConceptSet'][rel.TARGET], description=rel.RELATION)) unmapped = Counter() number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)') for cl in reader(data_path('conceptlists.tsv'), dicts=True): concepts = data_path('conceptlists', '%(ID)s.tsv' % cl) if not concepts.exists(): continue langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])] conceptlist = data.add( models.Conceptlist, cl['ID'], id=cl['ID'], name=' '.join(cl['ID'].split('-')), description=cl['NOTE'], target_languages=cl['TARGET_LANGUAGE'], source_languages=' '.join(langs), year=int(cl['YEAR']) if cl['YEAR'] else None, ) for id_ in split(cl['REFS']): common.ContributionReference(source=data['Source'][id_], contribution=conceptlist) for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')): name = strip_braces(name) contrib = data['Contributor'].get(name) if not contrib: contrib = data.add(common.Contributor, name, id=slug(name), name=name) DBSession.add( common.ContributionContributor(ord=i, contribution=conceptlist, contributor=contrib)) for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split( ): del cl[k] DBSession.flush() for k, v in cl.items(): DBSession.add( common.Contribution_data(object_pk=conceptlist.pk, key=k, value=v)) for concept in reader(concepts, namedtuples=True): if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN': #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None)) unmapped.update([conceptlist.id]) continue lgs = {} for lang in langs: v = getattr(concept, lang.upper()) if v: lgs[lang] = v match = number_pattern.match(concept.NUMBER) if not match: print(concept.ID) raise ValueError vs = common.ValueSet( id=concept.ID, description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)), language=english, 
contribution=conceptlist, parameter=data['ConceptSet'][concept.CONCEPTICON_ID]) d = {} for key, value in concept.__dict__.items(): if not key.startswith('CONCEPTICON_') and \ key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]: d[key.lower()] = value v = models.Concept( id=concept.ID, valueset=vs, description=getattr(concept, 'GLOSS', None), # our own gloss, if available name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())), number=int(match.group('number')), number_suffix=match.group('suffix'), jsondata=d) DBSession.flush() for key, value in lgs.items(): DBSession.add( common.Value_data(key='lang_' + key, value=value, object_pk=v.pk)) print('Unmapped concepts:') for clid, no in unmapped.most_common(): print(clid, no) for fname in data_path('concept_set_meta').iterdir(): if fname.suffix == '.tsv': md = load(fname.parent.joinpath(fname.name + '-metadata.json')) provider = models.MetaProvider(id=fname.stem, name=md['dc:title'], description=md['dc:description'], url=md['dc:source'], jsondata=md) for meta in reader(fname, dicts=True): try: for k, v in meta.items(): if v and k != 'CONCEPTICON_ID': models.ConceptSetMeta(metaprovider=provider, conceptset=data['ConceptSet'] [meta['CONCEPTICON_ID']], key=k, value=v) except: print(fname) print(meta) raise
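# Worked example of the NUMBER parsing in the concepticon loader above; '77a'
# is a hypothetical concept number with a letter suffix.
import re

example_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')  # same regex as number_pattern above
example_match = example_pattern.match('77a')
assert int(example_match.group('number')) == 77
assert example_match.group('suffix') == 'a'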
def main(args): data = Data() lotw_conn = sqlite3.connect("lotw_base.sqlite") lotw_base = lotw_conn.cursor() contrib = common.Contribution(id="initial_contrib", name="Initial contribution") dataset = common.Dataset(id=lotw_dev.__name__, domain='lotw_dev.clld.org', name="Languages of the World", publisher_name="IL RAS", publisher_place="Moscow", publisher_url="http://iling-ran.ru/main/", jsondata={ 'license_name': 'Creative Commons Attribution 4.0 International License'} ) DBSession.add(dataset) feature_dict = {} unnamed_feature_count = 0 features = lotw_base.execute("SELECT * FROM Feature").fetchall() names = [y[2] for y in features] feat_name_counts = {x[2]: [names.count(x[2]), 0] for x in features if names.count(x[2]) > 1} # features = [convert_feature(x) for x in features] for feature in features: name = feature[2] # if name == ".О": # continue if name in feat_name_counts.keys(): temp_name = name name += ("_" + str(feat_name_counts[name][1])) feat_name_counts[temp_name][1] += 1 feature_dict[feature[0]] = TreeFeature(pk=feature[0], id=feature[0], name=name, father_pk=feature[5]) print("Added feature %s" % feature[2]) langs = lotw_base.execute("SELECT * FROM Language").fetchall() assert len(set([lang[0] for lang in langs])) == len([lang[0] for lang in langs]) for language in langs: value_sets = [] geodata = lotw_base.execute("SELECT * FROM Geographical_index WHERE Lang_id=?", (str(language[0]), )).fetchone() famdata = lotw_base.execute("SELECT * FROM Genealogical_index WHERE Lang_id=?", (str(language[0]), )).fetchone() famname = lotw_base.execute("SELECT * FROM Family where Id=?", (famdata[2], )).fetchone()[1] branchname =lotw_base.execute("SELECT * FROM Branch where Id=?", (famdata[3], )).fetchone()[1] if not geodata: geodata = [0.0 for x in range(7)] data.add(lotw_devLanguage, language[0], id=str(language[0]), iso=language[3], family=famname, branch=branchname, name=language[1], latitude=geodata[5], longitude=geodata[6]) print("Added language %s" % language[3]) # Lang_id=language["Lang_id"], Order_of_addition=language["Order_of_addition"], # Sorting_number=language["Sorting_number"], Code_ISO_639_3=language["Code_ISO_639_3"] language_features = lotw_base.execute("SELECT * FROM Binary_data WHERE Lang_id=? AND Feature_value=1", (str(language[0]), )) for l_feat in language_features.fetchall(): feat_id = l_feat[0] try: feat_name = feature_dict[l_feat[2]].name except KeyError: continue vs = common.ValueSet(id=feat_id, language=data["lotw_devLanguage"][language[0]], parameter=feature_dict[l_feat[2]], contribution=contrib) DBSession.add(common.Value(id=feat_id, name=feat_name, valueset=vs)) print("Added value %s" % feat_id) lotw_conn.close()
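# Small self-contained illustration of the duplicate-feature-name handling in
# the loader above: names that occur more than once get a numeric suffix, while
# unique names stay as they are. The names here are toy examples, not actual
# LotW features.
example_names = ['Case', 'Case', 'Gender']
example_counts = {
    n: [example_names.count(n), 0] for n in example_names if example_names.count(n) > 1}
deduped = []
for name in example_names:
    if name in example_counts:
        temp_name = name
        name += '_' + str(example_counts[name][1])
        example_counts[temp_name][1] += 1
    deduped.append(name)
assert deduped == ['Case_0', 'Case_1', 'Gender']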
def main(args): # http://clld.readthedocs.org/en/latest/extending.html data = Data( created=utc.localize(datetime(2013, 11, 15)), updated=utc.localize(datetime(2013, 12, 12))) icons = issues.Icons() languoids = list(Glottolog(GLOTTOLOG_REPOS).languoids()) iso_to_gc = dict([(l.iso, l.id) for l in languoids]) #glottocodes = glottocodes_by_isocode(args.glottolog_dburi) iso_to_name = {l.iso: l.name for l in languoids} #Languages dp = dtab("dp.tab") lons = dict([(d['iso-639-3'], d['lon']) for d in dp]) lats = dict([(d['iso-639-3'], d['lat']) for d in dp]) tabfns = [fn.name for fn in DATA_DIR.glob('sails_*.tab')] print "Sheets found", tabfns ldps = [ld for fn in tabfns for ld in dtab(fn)] ldps = [dict([(k, v.replace(".", "-") if k in ['feature_alphanumid', 'value'] else v) for (k, v) in ld.iteritems()]) for ld in ldps] ldcps = dtab("constructions_data.tab") dedup = opv(grp2([((ld['construction_id'], ld['feature_alphanumid'].replace('.', "-")), (ld["value"],) + tuple(ld.items())) for ld in ldcps]), max) dldps = [dict(dld[1:]) for dld in dedup.itervalues()] lgs = dict([(ld['language_id'], ld['language_name'] if ld.has_key('language_name') else iso_to_name[ld['language_id']]) for ld in ldps + ldcps]) nfeatures = opv(grp2([(ld['language_id'], ld['feature_alphanumid']) for ld in ldps + ldcps if ld["value"] != "?"]), len) # Families fp = treetxt(loadunicode('lff.txt') + loadunicode('lof.txt')) ps = paths(fp) lg_to_fam = dict([(p[-1], p[0]) for p in ps]) families = grp2([(lg_to_fam[lg], lg) for lg in lgs.keys()]) ficons = dict(icons.iconizeall([ f for (f, sailslgs) in families.iteritems() if len(sailslgs) != 1]).items() + [(f, icons.graytriangle) for (f, sailslgs) in families.iteritems() if len(sailslgs) == 1]) for family in families.iterkeys(): data.add( models.Family, family, id=family, name=family, jsondata={"icon": ficons[family]}) DBSession.flush() # Lgs for lgid in lgs.iterkeys(): lang = data.add( models.sailsLanguage, lgid, id=lgid, name=lgs[lgid], family=data["Family"][lg_to_fam[lgid]], representation=nfeatures[lgid], latitude=float(lats[lgid]), longitude=float(lons[lgid])) if not lgid.startswith('NOCODE'): iso = data.add( common.Identifier, lgid, id=lgid, name=lgid, type=common.IdentifierType.iso.value, description=lgs[lgid]) data.add(common.LanguageIdentifier, lgid, language=lang, identifier=iso) if lgid in iso_to_gc: gc = iso_to_gc[lgid] gc = data.add( common.Identifier, 'gc' + lgid, id=gc, name=gc, type=common.IdentifierType.glottolog.value, description=lgs[lgid]) data.add(common.LanguageIdentifier, lgid, language=lang, identifier=gc) DBSession.flush() # Domains for domain in set(ld['feature_domain'] for ld in ldps): data.add(models.FeatureDomain, domain, id=slug(domain), name=domain) DBSession.flush() designer_to_id = {} for dd in dtab("sailscontributions.tab"): contributionid = slug("%s-%s" % (dd["designer"], dd["domain"])) if dd["domain"].find("Construction-Based") == -1: designer_to_id[dd["designer"]] = contributionid contribution_statistics = {} contribution_statistics["nfeatures"] = opv(grp2([(designer_to_id[ld["designer"]], ld['feature_alphanumid']) for ld in ldps]), len) contribution_statistics["nlanguages"] = opv(grp2([(designer_to_id[ld["designer"]], ld['language_id']) for ld in ldps]), len) contribution_statistics["ndatapoints"] = opv(grp2([(designer_to_id[ld["designer"]], (ld['feature_alphanumid'], ld['language_id'])) for ld in ldps if ld["value"] != "?"]), len) contributionid = slug("%s-%s" % ("Rik van Gijn", "Construction-Based Subordination Data (SUB)")) 
contribution_statistics["nfeatures"][contributionid] = len(set([(ld['feature_alphanumid']) for ld in dldps])) contribution_statistics["nlanguages"][contributionid] = len(set([(ld['language_id']) for ld in dldps])) contribution_statistics["ndatapoints"][contributionid] = len(set([(ld['feature_alphanumid'], ld['language_id']) for ld in ldps if ld["value"] != "?"])) # Designers citation_template = "%s. 2014. %s. In Muysken, Pieter et al. (eds.) "\ "South American Indian Language Structures (SAILS) Online. Leipzig: Online "\ "Max Planck Institute of Evolutionary Anthropology. "\ "(Available at http://sails.clld.org)" #for (designer_id, (designer, domain)) in enumerate(designers.iteritems()): designer_to_id = {} for dd in dtab("sailscontributions.tab"): contributionid = slug("%s-%s" % (dd["designer"], dd["domain"])) orientation = "Language-Based" if dd["domain"].find("Construction-Based") == -1: designer_to_id[dd["designer"]] = contributionid orientation = "Construction-Based" data.add( models.Designer, contributionid, id=contributionid, name=contributionid, domain=dd["domain"], orientation=orientation, contributor=dd["designer"], nlanguages=contribution_statistics["nlanguages"][contributionid], nfeatures=contribution_statistics["nfeatures"][contributionid], ndatapoints=contribution_statistics["ndatapoints"][contributionid], citation=citation_template % (dd["designer"], dd["domain"]), more_information=dd["citation"], pdflink=dd["pdflink"]) DBSession.flush() # Features fs = dict([(ld['feature_alphanumid'], ld) for ld in ldps]) nameclash_fs = grp2([(ld['feature_name'], ld['feature_alphanumid']) for ld in ldps]) fnamefix = {} for (dfeature, dfsids) in nameclash_fs.iteritems(): if len(dfsids) != 1: print "Feature name clash", dfeature, dfsids for dfsid in dfsids: fnamefix[dfsid] = dfeature + " [%s]" % dfsid nlgs = opv(grp2([(ld['feature_alphanumid'], ld['language_id']) for ld in ldps if ld["value"] != "?"]), len) (fidstr, fidint) = sortinfo(fs.keys()) for (fid, f) in fs.iteritems(): if nlgs[fid] == 0: continue data.add( models.Feature, fid, id=fid, name=fnamefix.get(fid, f['feature_name']), description=f['feature_information'], jsondata=dict(vdoc=f['feature_possible_values']), representation=nlgs[fid], designer=data["Designer"][designer_to_id[f['designer']]], dependson=f["depends_on"], featuredomain=data['FeatureDomain'][f["feature_domain"]], sortkey_str=fidstr[fid], sortkey_int=fidint[fid]) DBSession.flush() fvs = dict([(ld['feature_alphanumid'], ld['feature_possible_values']) for ld in ldps]) fvdesc = {} for (fid, vs) in fvs.iteritems(): vdesclist = [veq.split("==") for veq in vs.split("||")] try: vdesc = dict([(v.replace(".", "-"), desc) for [v, desc] in vdesclist]) except ValueError: print "Faulty value desc", vdesclist, vs if not vdesc.has_key("?"): vdesc["?"] = "Not known" if not vdesc.has_key("N/A") and fs[fid]["depends_on"]: vdesc["N/A"] = "Not Applicable" vi = dict([(v, i) for (i, v) in enumerate(sorted(vdesc.keys()))]) vicons = icons.iconize(vi.keys()) if len(vdesc) == 0: print "VDESC missing", vs, fid, v for (v, desc) in vdesc.iteritems(): fvdesc[(fid, v)] = desc data.add( common.DomainElement, (fid, v), id='%s-%s' % (fid, v), name=v, description=desc, jsondata={"icon": vicons[v]}, number=vi[v], parameter=data['Feature'][fid]) DBSession.flush() done = set() for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['sailsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if (ld['feature_alphanumid'], ld['value']) not in 
data['DomainElement']: print ld['feature_alphanumid'], ld['feature_name'], ld['language_id'], ld['value'], "not in the set of legal values" continue valueset = data.add( common.ValueSet, id_, id=id_, language=language, parameter=parameter, contribution=parameter.designer, source=ld["source"].strip() or None, ) data.add( models.sailsValue, id_, id=id_, domainelement=data['DomainElement'][(ld['feature_alphanumid'], ld['value'])], jsondata={"icon": data['DomainElement'][(ld['feature_alphanumid'], ld['value'])].jsondata}, description=fvdesc[(ld['feature_alphanumid'], ld['value'])], comment=ld["comment"], example=ld["example"], valueset=valueset, contributed_datapoint=ld["contributor"] ) done.add(id_) cdatapts = [dict(dld[1:]) for dld in dedup.itervalues() if dld[0].strip() and dld[0] != "?"] fccl = grp2([(ld['feature_alphanumid'].replace('.', "-"), (ld['construction_id'], ld['language_id'])) for ld in cdatapts]) fcstats = opv(fccl, lambda cls: (len(set([c for (c, l) in cls])), len(set([l for (c, l) in cls])))) fcstrs = dict([(ld['feature_alphanumid'].replace('.', "-"), ld) for ld in dtab("constructions_features.tab")]) # Construction Feature Domains for domain in set(ld['feature_domain'] for ld in fcstrs.values()): data.add(models.ConstructionFeatureDomain, domain, id=slug(domain), name=domain) DBSession.flush() (fidstr, fidint) = sortinfo(fcstrs.keys()) for (fid, ld) in fcstrs.iteritems(): (ncs, nlgs) = fcstats[fid] data.add( models.sailsUnitParameter, fid, id=fid, name=ld['feature_name'], description=ld['feature_information'], jsondata=dict(vdoc=ld['feature_possible_values']), designer=data["Designer"][slug("%s-%s" % (ld['designer'], "Construction-Based Subordination Data (SUB)"))], dependson=f["depends_on"], constructionfeaturedomain=data['ConstructionFeatureDomain'][ld["feature_domain"]], nconstructions=ncs, nlanguages=nlgs, sortkey_str=fidstr[fid], sortkey_int=fidint[fid]) DBSession.flush() #ldcps = dtab("constructions_data.tab") cs = set([(ld['construction_id'], ld['language_id']) for ld in ldcps]) for (cid, lid) in cs: language = data['sailsLanguage'][lid] data.add( models.sailsConstruction, cid, id=cid, name=cid, language = language) DBSession.flush() for ld in dldps: #dld in dedup.itervalues(): #ld = dict(dld[1:]) #print fid, ld['language_id'], ld['construction_id'], "HEJ" fid = ld['feature_alphanumid'].replace('.', "-") language = data['sailsLanguage'][ld['language_id']] construction = data['sailsConstruction'][ld['construction_id']] construction_feature = data['sailsUnitParameter'][fid] id_ = '%s-%s' % (construction.id, construction_feature.id) print ld data.add(models.sailsUnitValue, id_, id=id_, name=ld['value'], unit=construction, unitparameter=construction_feature, contribution=construction_feature.designer, source = ld["source"].strip(), comment = ld["comment"], provenance = ld["provenance"], contributed_datapoint = "Rik van Gijn") #1xf vs #contribution??!?! 
#TODO fixa unitvalues #fs v #lgs #cstrs TODO fixa snippet #Constrction Features #(unit-)values #done.add(id_) DBSession.flush() # Sources sources = [ktfbib(bibsource) for ld in ldps if ld.get('bibsources') for bibsource in ld['bibsources'].split(",,,")] + [ktfbib(bibsource) for dld in dldps if dld.get('bibsources') for bibsource in dld['bibsources'].split(",,,")] for (k, (typ, bibdata)) in sources: rec = Record(typ, k, **bibdata) if not data["Source"].has_key(k): data.add(common.Source, k, _obj=bibtex2source(rec)) DBSession.flush() for ld in ldps: sources = [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,") if ld.get('bibsources')] for (k, (typ, bibdata)) in sources: parameter = data['Feature'][ld['feature_alphanumid']] language = data['sailsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) data.add( common.ValueSetReference, "%s-%s" % (id_, k), valueset=data["ValueSet"][id_], source=data['Source'][k]) DBSession.flush() dataset = common.Dataset( id="SAILS", name='SAILS Online', publisher_name="Max Planck Institute for the Science of Human History", publisher_url="http://shh.mpg.de", publisher_place="Jena", description="Dataset on Typological Features for South American Languages, collected 2009-2013 in the Traces of Contact Project (ERC Advanced Grant 230310) awarded to Pieter Muysken, Radboud Universiteit, Nijmegen, the Netherlands.", domain='sails.clld.org', published=date(2014, 2, 20), contact='*****@*****.**', license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en', jsondata={ 'license_icon': 'http://wals.info/static/images/cc_by_nc_nd.png', 'license_name': 'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'}) DBSession.add(dataset) DBSession.flush() editor = data.add(common.Contributor, "Harald Hammarstrom", id="Harald Hammarstrom", name="Harald Hammarstrom", email = "*****@*****.**") common.Editor(dataset=dataset, contributor=editor, ord=0) DBSession.flush() #To CLDF cldf = {} for ld in ldps: parameter = data['Feature'][ld['feature_alphanumid']] language = data['sailsLanguage'][ld['language_id']] id_ = '%s-%s' % (parameter.id, language.id) if not id_ in done: continue dt = (lgs[ld['language_id']], ld['language_id'], ld['feature_alphanumid'] + ". " + ld['feature_name'], ld["value"], ld["comment"]) cldf[dt] = None tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows]) savu(tab([("Language", "iso-639-3", "Feature", "Value", "Comment")] + cldf.keys()), "sails.cldf")
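# The SAILS loader above leans heavily on two small helpers, grp2 and opv, whose
# definitions are not part of this snippet. Their assumed semantics, inferred
# from how they are used here: grp2 groups (key, value) pairs into a dict of
# lists, and opv maps a function over the values of a dict. A plain-Python
# equivalent for illustration:
from collections import defaultdict

def grp2(pairs):
    grouped = defaultdict(list)
    for key, value in pairs:
        grouped[key].append(value)
    return dict(grouped)

def opv(d, f):
    return {key: f(values) for key, values in d.items()}

# e.g. number of attested datapoints per language:
# nfeatures = opv(grp2([(ld['language_id'], ld['feature_alphanumid'])
#                       for ld in ldps if ld['value'] != '?']), len)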
def prime_cache(args): """If data needs to be denormalized for lookup, do that here. This procedure should be separate from the db initialization, because it will have to be run periodically whenever data has been updated. """ compute_language_sources() return from time import time _s = time() def checkpoint(s, msg=None): n = time() print(n - s, msg or '') return n sql = """ select p.id, l.id, v.name from value as v, valueset as vs, language as l, parameter as p where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk """ datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)] _s = checkpoint(_s, '%s values loaded' % len(datatriples)) flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()]) _s = checkpoint(_s, 'triples grouped') clfps = list(get_clf_paths([row[0] for row in DBSession.execute("select id from language")])) _s = checkpoint(_s, '%s clfps loaded' % len(clfps)) features = {f.id: f for f in DBSession.query(Feature)} for (f, lv) in flv.items(): features[f].representation = len(lv) DBSession.flush() _s = checkpoint(_s, 'representation assigned') families = {f.id: f for f in DBSession.query(Family)} if False: fs = feature_stability(datatriples, clfps) _s = checkpoint(_s, 'feature_stability computed') for (f, (s, transitions, stationarity_p, synchronic_p)) in fs: print(f) stability = Stability( id=f.replace("GB", "S"), feature=features[f], parsimony_stability_value=s["stability"], parsimony_retentions=s["retentions"], parsimony_transitions=s["transitions"], jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p}) DBSession.add(stability) for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions): DBSession.add(Transition( id="%s: %s->%s" % (f, fromnode, tonode), stability=stability, fromnode=get_name(fromnode), tonode=get_name(tonode), fromvalue=ft, tovalue=tt, family=families[fam], retention_innovation="Retention" if ft == tt else "Innovation")) DBSession.flush() _s = checkpoint(_s, 'stability and transitions loaded') imps = feature_dependencies(datatriples) _s = checkpoint(_s, 'feature_dependencies computed') if True: (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps]) _s = checkpoint(_s, 'dependencies_graph written') for (i, ((v, dstats), f1, f2)) in enumerate(imps): combinatory_status = ("primary" if (f1, f2) in H else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A" DBSession.add(Dependency( id="%s->%s" % (f1, f2), strength=v, feature1=features[f1], feature2=features[f2], representation=dstats["representation"], combinatory_status=combinatory_status, jsondata=dstats)) DBSession.flush() _s = checkpoint(_s, 'dependencies loaded') coordinates = { lg.id: (lg.longitude, lg.latitude) for lg in DBSession.query(common.Language) .filter(common.Language.longitude != None) .filter(common.Language.latitude != None)} deepfams = deep_families(datatriples, clfps, coordinates=coordinates) _s = checkpoint(_s, '%s deep_families computed' % len(deepfams)) missing_families = set() data = Data() for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams: dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2]) kmdistance = havdist(f1c, f2c) (f1lon, f1lat) = f1c if f1c else (None, None) (f2lon, f2lat) = f2c if f2c else (None, None) for li in [l1, l2]: if li not in families: missing_families.add(li) deepfam = DeepFamily( id=dname, support_value=support_value, significance=significance, family1=families.get(l1), family2=families.get(l2), 
family1_latitude = f1lat, family1_longitude = f1lon, family2_latitude = f2lat, family2_longitude = f2lon, geographic_plausibility = kmdistance) DBSession.add(deepfam) for (f, v1, v2, historical_score, independent_score, support_score) in supports: vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "") #vname = ("%s|%s" % (v1, v2)).replace(".", "") #print vid, vname if vid not in data["Support"]: data.add( Support, vid, id = vid, historical_score = historical_score, independent_score = independent_score, support_score = support_score, value1= v1, value2 = v2, feature=features[f]) DBSession.add(HasSupport( id=dname + "-" + vid, deepfamily = deepfam, support = data["Support"][vid])) print('missing_families:') print(missing_families) DBSession.flush() _s = checkpoint(_s, 'deep_families loaded') compute_language_sources()
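# Note that prime_cache() returns right after the first compute_language_sources()
# call, so the timing, stability and dependency blocks below that early return
# only run once the return is removed. For orientation, the raw-SQL step pulls
# (parameter_id, language_id, value) triples and groups them per parameter; a
# minimal sketch of that grouping (the grp helper itself is not shown in this
# snippet):
from collections import defaultdict

def group_by_feature(datatriples):
    """Return {feature_id: {language_id: value}} from (feature, language, value) triples."""
    flv = defaultdict(dict)
    for feature, language, value in datatriples:
        flv[feature][language] = value
    return dict(flv)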
def main(args): old_db = create_engine(DB) data = Data() # # migrate contributor table: complete # for row in old_db.execute("select * from contributor"): data.add( common.Contributor, row['id'], id=row['id'], name='%(firstname)s %(lastname)s' % row, url=row['homepage'], description=row['note'], email=row['email'], address=row['address']) data.add( common.Contributor, 'haspelmathmartin', id='haspelmathmartin', name="Martin Haspelmath", url="http://email.eva.mpg.de/~haspelmt/") DBSession.flush() dataset = common.Dataset( id='wold', name='WOLD', description='World Loanword Database', domain='wold.clld.org', published=date(2009, 8, 15), license='http://creativecommons.org/licenses/by/3.0/de/', contact='*****@*****.**', jsondata={ 'license_icon': 'http://i.creativecommons.org/l/by/3.0/de/88x31.png', 'license_name': 'Creative Commons Attribution 3.0 Germany License'}) DBSession.add(dataset) for i, editor in enumerate(['haspelmathmartin', 'tadmoruri']): common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1) # # migrate semantic_field table: complete # for row in old_db.execute("select * from semantic_field"): if row['id'] != 25: kw = dict((key, row[key]) for key in ['id', 'name', 'description']) data.add(models.SemanticField, row['id'], **kw) # # migrate language table: complete # recipient flag is replaced by vocabulary_pk! # for row in old_db.execute("select * from language order by id"): kw = dict((key, row[key]) for key in [ 'fm_dl_id', 'name', 'latitude', 'longitude', 'wals_equivalent', 'affiliation', 'family', 'genus', 'countries']) data.add(models.WoldLanguage, row['id'], id=str(row['id']), **kw) # # migrate language_code table: complete # for row in old_db.execute("select * from language_code"): _id = '%(type)s-%(code)s' % row data.add(common.Identifier, _id, id=_id, type=row['type'], name=row['code']) if row['type'] == 'iso639-3' and row['code'] in glottocodes: gc = glottocodes[row['code']] data.add(common.Identifier, gc, id=gc, type=common.IdentifierType.glottolog.value, name=gc) DBSession.flush() # # migrate language_code_language table: complete # for row in old_db.execute("select * from language_code_language"): _id = '%(type)s-%(code)s' % row data.add(common.LanguageIdentifier, '%s-%s' % (_id, row['language_id']), identifier_pk=data['Identifier'][_id].pk, language_pk=data['WoldLanguage'][row['language_id']].pk) if row['type'] == 'iso639-3' and row['code'] in glottocodes: gc = glottocodes[row['code']] data.add( common.LanguageIdentifier, '%s-%s' % (gc, row['language_id']), identifier_pk=data['Identifier'][gc].pk, language_pk=data['WoldLanguage'][row['language_id']].pk) DBSession.flush() # # migrate vocabulary table: complete # for row in old_db.execute("select * from vocabulary order by id"): jsondata = {} for key in row.keys(): if key.startswith('fd_') or key in ['other_information', 'abbreviations']: jsondata[key] = row[key] vocab = data.add( models.Vocabulary, row['id'], id=str(row['id']), name=row['name'], color=row['color'], jsondata=jsondata) DBSession.flush() data['WoldLanguage'][row['language_id']].vocabulary_pk = vocab.pk DBSession.flush() # # migrate contact_situation and age tables: complete # contact situations and ages are unitdomainelements! 
# contact_situation = common.UnitParameter(id='cs', name='Contact Situation') age = common.UnitParameter(id='a', name='Age') DBSession.add(contact_situation) DBSession.add(age) DBSession.flush() for row in old_db.execute("select * from contact_situation"): if row['vocabulary_id'] is None: continue kw = dict((key, row[key]) for key in ['description', 'id', 'name']) kw['id'] = 'cs-%s' % kw['id'] p = data.add(models.WoldUnitDomainElement, row['id'], **kw) p.vocabulary = data['Vocabulary'][row['vocabulary_id']] p.unitparameter_pk = contact_situation.pk for row in old_db.execute("select * from age"): id_ = '%(vocabulary_id)s-%(label)s' % row kw = dict((key, row[key]) for key in ['start_year', 'end_year']) p = data.add(models.WoldUnitDomainElement, id_, id='a-%s' % id_, name=row['label'], description=row['description'], jsondata=kw) p.vocabulary = data['Vocabulary'][row['vocabulary_id']] p.unitparameter_pk = age.pk # # migrate meaning table: complete # for row in old_db.execute("select * from meaning"): kw = dict((key, row[key]) for key in [ 'description', 'core_list', 'ids_code', 'typical_context', 'semantic_category']) p = data.add( models.Meaning, row['id'], id=row['id'].replace('.', '-'), name=row['label'], sub_code=row['id'].split('.')[1] if '.' in row['id'] else '', semantic_field=data['SemanticField'][row['semantic_field_id']], **kw) DBSession.flush() for field in ['french', 'spanish', 'german', 'russian']: DBSession.add(models.Translation(name=row[field], lang=field, meaning=p)) for key in data['WoldLanguage']: lang = data['WoldLanguage'][key] data.add( common.ValueSet, '%s-%s' % (key, row['id']), id='%s-%s' % (key, row['id'].replace('.', '-')), language=lang, contribution=lang.vocabulary, parameter=p) DBSession.flush() # # migrate word table: # TODO: all the other word properties!! 
# fields = [ 'age_label', 'original_script', 'grammatical_info', 'comment_on_word_form', 'gloss', "comment_on_borrowed", "calqued", "borrowed_base", "numeric_frequency", "relative_frequency", "effect", "integration", "salience", "reference", "other_comments", "register", "loan_history", 'colonial_word', 'paraphrase_in_dutch', 'word_source', 'paraphrase_in_german', 'lexical_stratum', 'comparison_with_mandarin', 'year', 'comparison_with_korean', 'czech_translation', 'hungarian_translation', 'early_romani_reconstruction', 'etymological_note', 'boretzky_and_igla_etymology', 'manuss_et_al_etymology', 'vekerdi_etymology', 'turner_etymology', 'other_etymologies', 'mayrhofer_etymology', ] word_to_vocab = {} for row in old_db.execute("select * from word"): word_to_vocab[row['id']] = row['vocabulary_id'] kw = dict((key, row[key]) for key in ['id', 'age_score', 'borrowed', 'borrowed_score', 'analyzability', 'simplicity_score']) w = data.add( models.Word, row['id'], name=row['form'], description=row['free_meaning'], jsondata={k: row[k] for k in fields}, **kw) w.language = data['Vocabulary'][row['vocabulary_id']].language if row['age_label']: DBSession.add(common.UnitValue( id='%(id)s-a' % row, unit=w, unitparameter=age, unitdomainelement=data['WoldUnitDomainElement']['%(vocabulary_id)s-%(age_label)s' % row], contribution=data['Vocabulary'][row['vocabulary_id']])) if row['contact_situation_id'] and row['contact_situation_id'] != '9129144185487768': DBSession.add(common.UnitValue( id='%(id)s-cs' % row, unit=w, unitparameter=contact_situation, unitdomainelement=data['WoldUnitDomainElement'][row['contact_situation_id']], contribution=data['Vocabulary'][row['vocabulary_id']])) DBSession.flush() # # migrate word_meaning table: complete # for i, row in enumerate(old_db.execute("select * from word_meaning")): data.add( models.Counterpart, i, id=i, description='%(relationship)s (%(comment_on_relationship)s)' % row, name=data['Word'][row['word_id']].name, valueset=data['ValueSet']['%s-%s' % (word_to_vocab[row['word_id']], row['meaning_id'])], word = data['Word'][row['word_id']]) DBSession.flush() # # migrate vocabulary_contributor table: complete # for row in old_db.execute("select * from vocabulary_contributor"): DBSession.add(common.ContributionContributor( ord=row['ordinal'], primary=row['primary'], contributor_pk=data['Contributor'][row['contributor_id']].pk, contribution_pk=data['Vocabulary'][row['vocabulary_id']].pk)) DBSession.flush() # # source words: we have to make sure a word does only belong to one language. # thus, we have to reassign identifier! # # loop over source_word, source_word_donor_language pairs keeping track of source_word ids: known_ids = {} for row in old_db.execute("select sw.id, sw.meaning, sw.form, dl.language_id from source_word as sw, source_word_donor_language as dl where sw.id = dl.source_word_id"): if row['id'] in known_ids: # source_word was already seen associated to a different donor language! assert row['language_id'] not in known_ids[row['id']] known_ids[row['id']].append(row['language_id']) id_ = '%s-%s' % (row['id'], len(known_ids[row['id']])) else: id_ = '%s-%s' % (row['id'], 1) known_ids[row['id']] = [row['language_id']] new = data.add(models.Word, id_, id=id_, name=row['form'], description=row['meaning']) new.language = data['WoldLanguage'][row['language_id']] # source words may end up as words without language! 
for row in old_db.execute("select id, meaning, form from source_word where id not in (select source_word_id from source_word_donor_language)"): id_ = '%s-%s' % (row['id'], 1) new = data.add(models.Word, id_, id=id_, name=row['form'], description=row['meaning']) DBSession.flush() # # migrate word_source_word relations # TODO: should be modelled as UnitParameter! # j = 0 for row in old_db.execute("select * from word_source_word"): # there may be more than one word associated with a source_word_id (see above) source_words = [] for i in range(4): # but we guess no more than 4 :) id_ = '%s-%s' % (row['source_word_id'], i+1) if id_ in data['Word']: source_words.append(data['Word'][id_]) if not source_words: j += 1 #print(row['source_word_id']) #raise ValueError(row['source_word_id']) for sw in source_words: DBSession.add(models.Loan( source_word=sw, target_word=data['Word'][row['word_id']], relation=row['relationship'], certain=len(source_words) == 1)) print('%s source words not migrated because they have no donor language!' % j)
def main(args): data = Data() data_path = lambda *cs: args.data_file('concepticon-data', 'concepticondata', *cs) dataset = common.Dataset( id=concepticon.__name__, name="Concepticon 1.0", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='concepticon.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, name in enumerate(['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) english = data.add( common.Language, 'eng', id='eng', name='English') files = {} for fname in data_path('sources').iterdir(): files[fname.stem] = \ "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name for rec in Database.from_file( data_path('references', 'references.bib'), lowercase=True): source = data.add(common.Source, rec.id, _obj=bibtex2source(rec)) if rec.id in files: DBSession.flush() DBSession.add(common.Source_files( mime_type='application/pdf', object_pk=source.pk, jsondata=dict(url=files[rec.id]))) for concept in reader(data_path('concepticon.tsv'), namedtuples=True): data.add( models.ConceptSet, concept.ID, id=concept.ID, name=concept.GLOSS, description=concept.DEFINITION, semanticfield=concept.SEMANTICFIELD, ontological_category=concept.ONTOLOGICAL_CATEGORY) for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True): DBSession.add(models.Relation( source=data['ConceptSet'][rel.SOURCE], target=data['ConceptSet'][rel.TARGET], description=rel.RELATION)) unmapped = Counter() number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)') for cl in reader(data_path('conceptlists.tsv'), dicts=True): concepts = data_path('conceptlists', '%(ID)s.tsv' % cl) if not concepts.exists(): continue langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])] conceptlist = data.add( models.Conceptlist, cl['ID'], id=cl['ID'], name=' '.join(cl['ID'].split('-')), description=cl['NOTE'], target_languages=cl['TARGET_LANGUAGE'], source_languages=' '.join(langs), year=int(cl['YEAR']) if cl['YEAR'] else None, ) for id_ in split(cl['REFS']): common.ContributionReference( source=data['Source'][id_], contribution=conceptlist) for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')): name = strip_braces(name) contrib = data['Contributor'].get(name) if not contrib: contrib = data.add( common.Contributor, name, id=slug(name), name=name) DBSession.add(common.ContributionContributor( ord=i, contribution=conceptlist, contributor=contrib)) for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split(): del cl[k] DBSession.flush() for k, v in cl.items(): DBSession.add(common.Contribution_data( object_pk=conceptlist.pk, key=k, value=v)) for concept in reader(concepts, namedtuples=True): if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN': #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None)) unmapped.update([conceptlist.id]) continue lgs = {} for lang in langs: v = getattr(concept, lang.upper()) if v: lgs[lang] = v match = number_pattern.match(concept.NUMBER) if not match: print(concept.ID) raise ValueError vs = common.ValueSet( id=concept.ID, description=getattr(concept, 'GLOSS', getattr(concept, 'ENGLISH', None)), language=english, 
contribution=conceptlist, parameter=data['ConceptSet'][concept.CONCEPTICON_ID]) d = {} for key, value in concept.__dict__.items(): if not key.startswith('CONCEPTICON_') and \ key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]: d[key.lower()] = value v = models.Concept( id=concept.ID, valueset=vs, description=getattr(concept, 'GLOSS', None), # our own gloss, if available name='; '.join('%s [%s]' % (lgs[l], l) for l in sorted(lgs.keys())), number=int(match.group('number')), number_suffix=match.group('suffix'), jsondata=d) DBSession.flush() for key, value in lgs.items(): DBSession.add( common.Value_data(key='lang_' + key, value=value, object_pk=v.pk)) print('Unmapped concepts:') for clid, no in unmapped.most_common(): print(clid, no) for fname in data_path('concept_set_meta').iterdir(): if fname.suffix == '.tsv': md = load(fname.parent.joinpath(fname.name + '-metadata.json')) provider = models.MetaProvider( id=fname.stem, name=md['dc:title'], description=md['dc:description'], url=md['dc:source'], jsondata=md) for meta in reader(fname, dicts=True): try: for k, v in meta.items(): if v and k != 'CONCEPTICON_ID': models.ConceptSetMeta( metaprovider=provider, conceptset=data['ConceptSet'][meta['CONCEPTICON_ID']], key=k, value=v) except: print(fname) print(meta) raise
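# The NUMBER column of each concept list is split into a sortable integer and an
# optional suffix via number_pattern above (e.g. '17b' -> 17, 'b'). A standalone
# illustration of that split, using the same regex:
import re

number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

def split_number(number):
    match = number_pattern.match(number)
    if not match:
        raise ValueError('unparseable concept number: %r' % number)
    return int(match.group('number')), match.group('suffix')

assert split_number('17b') == (17, 'b')
assert split_number('3') == (3, '')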
def main(args): data = Data() dataset = common.Dataset( id=plld_app.__name__, name=plld_app.__name__, domain='plld.clld.org', description="Database of Papuan Language and Culture", ) DBSession.add(dataset) # Load the list of languages languages = pandas.ExcelFile( os.path.join(DBPATH, "Languages and Coordinates.xlsx")).parse(0) parameters = {} for i, language in languages.iterrows(): # Generate the database object for each language print("\nCreating language", language['Language name (-dialect)']) lang = models.Lect( id=((language['Language name (-dialect)'].lower()[:4] + "x" + newid()) if pandas.isnull(language['Glottolog']) else language['Glottolog'].strip()), region=language['Region'], family=language['Family'], name=language['Language name (-dialect)'].strip(), latitude=language['Lat'], longitude=language['Lon']) # Check what data files we have that say they are about that language. if pandas.isnull(language['ISO_code']): # The convention for file-names of varieties without iso # code is ad-hoc, skip those until we have established a # good convention. files_concerning = [ file for file in os.listdir(DBPATH) if file.lower().startswith(language['Internal'].lower() + '_') ] else: # Otherwise, the convention is that languages are # described by files starting with their iso code and an # underscore. files_concerning = [ file for file in os.listdir(DBPATH) if file.lower().startswith(language['ISO_code'].lower() + '_') ] # For each such language, we might have typological, sociolinguistic and lexical (small vocabulary or big vocabulary or kinship terms) data. Deal with them in order. # Try to load the corresponding typology questionnaire typology_files = [ file for file in files_concerning if 'typolog' in file.lower() ] if len(typology_files) == 1: try: add_typological_data(typology_files[0], parameters, lang) print("Typological data read.") except UnexpectedTypologyFormatError: print( "File", typology_files[0], "had an unexpected format for a typology questionnaire!") else: print("There were not one, but", len(typology_files), "possible questionnaires.") # Try to load the corresponding cultural features questionnaire culture_files = [ file for file in files_concerning if 'cult' in file.lower() ] if len(culture_files) == 1: try: add_cultural_data(culture_files[0], parameters, lang) print("Cultural data read.") except UnexpectedCultureFormatError: print("File", culture_files[0], "had an unexpected format for a culture questionnaire!") else: print("There were not one, but", len(culture_files), "possible questionnaires.")
        if len(res) > 100:
            break
    return res.strip()


def get_vs2008(args):  # pragma: no cover
    vs2008 = {}
    for row in reader(args.data_file('datapoints_2008.csv'), delimiter=','):
        vs2008[(row[0], '%sA' % row[1])] = int(row[2])
    return vs2008


E2008 = utc.localize(datetime(2008, 4, 21))
E2011 = utc.localize(datetime(2011, 4, 28))
E2013 = utc.localize(datetime(2013, 11, 15))
data = Data(created=E2008, updated=E2008)


def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()


def main(args):  # pragma: no cover
def main(args): # # order of init: # - villages # - files # - movies # videos = defaultdict(list) for f in util.iter_files(args): obj = models.File(**attr.asdict(f)) if obj.mime_type.startswith('video'): videos[slug(obj.name.split('.')[0])].append(obj) DBSession.add(obj) lexicon = list(util.iter_lexicon(args)) villages = util.get_villages(args) ff_images = list(util.ff_images(args)) bib = list(util.get_bib(args)) data = Data() dataset = common.Dataset( id=dogonlanguages.__name__, name="Dogon and Bangime Linguistics", contact="*****@*****.**", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='dogonlanguages.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'} ) DBSession.add(dataset) if Glottolog: if socket.gethostname() == 'dlt5502178l': glottolog = Glottolog( Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath( 'glottolog3', 'glottolog')) else: glottolog = Glottolog( Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath( 'glottolog')) languoids = {l.id: l for l in glottolog.languoids()} else: languoids = {} print('got glottolog') for c in util.CONTRIBUTORS: id_ = slug(c.name.split()[-1]) data.add(models.Member, id_, id=id_, **attr.asdict(c)) data.add( models.Member, 'forkel', id='forkel', name='Robert Forkel', email='*****@*****.**', in_project=False) for i, id_ in enumerate(['moran', 'forkel', 'heath']): DBSession.add(common.Editor( dataset=dataset, ord=i + 1, contributor=data['Member'][id_])) contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages') for doc in bib: obj = data.add( models.Document, doc.rec.id, _obj=bibtex2source(doc.rec, cls=models.Document)) keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')]) for dt in 'grammar lexicon typology texts'.split(): if dt in keywords: obj.doctype = dt break obj.project_doc = ('DLP' in keywords) or bool(doc.files) if obj.project_doc: for i, cid in enumerate(util.get_contributors(doc.rec, data)): models.DocumentContributor( document=obj, contributor=data['Member'][cid], ord=i) for i, (path, cdstar) in enumerate(doc.files): common.Source_files( id='%s-%s' % (obj.id, i + 1), name=path, object=obj, mime_type=guess_type(path)[0], jsondata=cdstar, ) print('got bib') for name, (gc, desc) in LANGUAGES.items(): gl_lang = languoids[gc] lat, lon = gl_lang.latitude, gl_lang.longitude lang = data.add( models.Languoid, gc, id=gc, name=name, description=desc, latitude=lat, longitude=lon, family=gl_lang.family.name if gl_lang and gl_lang.family else name, ) if name == 'Penange' and lang.longitude > 0: lang.longitude = -lang.longitude if name == 'Bankan Tey': lang.latitude, lang.longitude = 15.07, -2.91 if name == 'Ben Tey': lang.latitude, lang.longitude = 14.85, -2.95 if name == 'Togo Kan': lang.latitude, lang.longitude = 14.00, -3.25 add_language_codes(data, lang, gl_lang.iso, glottocode=gc) villages_by_name = defaultdict(list) contrib_by_initial = {c.abbr: c for c in data['Member'].values()} for i, village in enumerate(villages): lang = None if village.glottocode: lang = data['Languoid'].get(village.glottocode) if not lang: gl_lang = languoids[village.glottocode] lang = data.add( models.Languoid, gl_lang.id, id=gl_lang.id, name=gl_lang.name, in_project=False, family=gl_lang.family.name if gl_lang.family else gl_lang.name) v = data.add( models.Village, str(i + 1), id=str(i + 
1), name=village.name, description=village.data.pop('social info'), surnames=village.data.pop('surnames'), major_city=village.data['MajorCity'] == 'Y', transcribed_name=village.data.pop('Transcribed Village Name'), source_of_coordinates=village.data.pop('sourceOfCoordinates'), latitude=village.lat, longitude=village.lon, languoid=lang, jsondata=village.data, ) villages_by_name[village.name].append(v) for img in village.images: mimetype = guess_type(img.name)[0] if mimetype: f = models.Village_files( id=img.id, name=img.name, description=img.description, date_created=img.date, latitude=img.coords[0] if img.coords else None, longitude=-img.coords[1] if img.coords else None, object=v, mime_type=mimetype, jsondata=img.cdstar, ) for initial in img.creators: if initial in contrib_by_initial: models.Fotographer( foto=f, contributor=contrib_by_initial[initial]) for cat, desc, place, name in MOVIES: s = slug(name) m = models.Movie( id=s, name=desc, description=cat, place=place, ) if place in villages_by_name and len(villages_by_name[place]) == 1: m.village = villages_by_name[place][0] #print('found village: %s' % name) for v in videos[s]: #print('found video: %s' % name) v.movie = m m.duration = v.duration names = defaultdict(int) for concept in lexicon: add(concept, data, names, contrib) count = set() for img in ff_images: if img.id in count: continue count.add(img.id) if img.ref: if img.ref in data['Concept']: concept = data['Concept'][img.ref] if img.tsammalex_taxon and not concept.tsammalex_taxon: concept.tsammalex_taxon = img.tsammalex_taxon #print(concept.tsammalex_taxon) common.Parameter_files( object=concept, id=img.id, name=img.name.decode('utf8'), mime_type=guess_type(img.name)[0], jsondata=img.cdstar) else: print('missing ref: %s' % img.ref)
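# Movies and video files above are linked purely by name: video files are keyed
# by slug(basename) when read, and each MOVIES entry is looked up under
# slug(name). A minimal sketch of that keying; the slug() below is only a crude
# stand-in for the project's slug helper (lowercase alphanumerics):
from collections import defaultdict

def slug(s):
    # stand-in: keep lowercase alphanumeric characters only
    return ''.join(c for c in s.lower() if c.isalnum())

def index_videos(filenames):
    videos = defaultdict(list)
    for name in filenames:
        videos[slug(name.split('.')[0])].append(name)
    return videos

# index_videos(['Harvest_Dance.mp4']) -> {'harvestdance': ['Harvest_Dance.mp4']}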
def load(args): glottolog = args.repos fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") version = assert_release(glottolog.repos) dataset = common.Dataset( id='glottolog', name="Glottolog {0}".format(version), publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data = Data() for i, (id_, name) in enumerate([ ('hammarstroem', 'Harald Hammarström'), ('forkel', 'Robert Forkel'), ('haspelmath', 'Martin Haspelmath'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) clf = data.add(common.Contribution, 'clf', id='clf', name='Classification') DBSession.add(common.ContributionContributor( contribution=clf, contributor=data['Contributor']['hammarstroem'])) for pid, pname in [ ('fc', 'Family classification'), ('sc', 'Subclassification'), ('vitality', 'Degree of endangerment'), ]: data.add(common.Parameter, pid, id=pid, name=pname) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) for ma in Macroarea: data.add( models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description) for country in glottolog.countries: data.add(models.Country, country.id, id=country.id, name=country.name) lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list) languoids = list(glottolog.languoids()) nodemap = {l.id: l for l in languoids} for lang in languoids: for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(data, lang, nodemap) mas[lang.id] = [ma.name for ma in lang.macroareas] countries[lang.id] = [c.id for c in lang.countries] lgcodes[lang.id] = lang.id if lang.hid: lgcodes[lang.hid] = lang.id if lang.iso: lgcodes[lang.iso] = lang.id for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for lid, maids in mas.items(): for ma in maids: DBSession.add(models.Languoidmacroarea( languoid_pk=data['Languoid'][lid].pk, macroarea_pk=data['Macroarea'][ma].pk)) for lid, cids in countries.items(): for cid in cids: DBSession.add(models.Languoidcountry( languoid_pk=data['Languoid'][lid].pk, country_pk=data['Country'][cid].pk)) for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: for ma in split_text(entry.fields['macro_area'], 
separators=',;', strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma) DBSession.add(models.Refmacroarea( ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
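# Legacy macro_area values found in the bib entries are normalized before the
# Macroarea lookup: 'Middle America' is folded into 'North America' and 'Papua'
# into 'Papunesia'. The same mapping as a small standalone helper:
LEGACY_MACROAREAS = {
    'Middle America': 'North America',
    'Papua': 'Papunesia',
}

def normalize_macroarea(name):
    return LEGACY_MACROAREAS.get(name, name)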
def main(args): fts.index('fts_index', Word.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") data = Data() dataset = common.Dataset( id=dictionaria.__name__, name="Dictionaria", description="The Dictionary Journal", published=date(2017, 3, 30), contact='*****@*****.**', domain='dictionaria.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) for i, (id_, name) in enumerate([ ('haspelmathmartin', 'Martin Haspelmath'), ('moselulrike', 'Ulrike Mosel'), ('stiebelsbarbara', 'Barbara Stiebels') ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) comparison_meanings = {} print('loading concepts ...') glosses = set() concepticon = Concepticon( REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data')) if not args.no_concepts: for conceptset in concepticon.conceptsets.values(): if conceptset.gloss in glosses: continue glosses.add(conceptset.gloss) cm = data.add( ComparisonMeaning, conceptset.id, id=conceptset.id, name=conceptset.gloss.lower(), description=conceptset.definition, concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id) comparison_meanings[cm.id] = cm DBSession.flush() print('... done') comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()} submissions = [] for submission in REPOS.joinpath( 'submissions-internal' if args.internal else 'submissions').glob('*'): if not submission.is_dir(): continue try: submission = Submission(submission) except ValueError: continue md = submission.md if md is None: continue if not md['date_published']: continue id_ = submission.id if args.dict and args.dict != id_ and args.dict != 'all': continue lmd = md['language'] props = md.get('properties', {}) props.setdefault('custom_fields', []) props['metalanguage_styles'] = {} for v, s in zip(props.get('metalanguages', {}).values(), ['success', 'info', 'warning', 'important']): props['metalanguage_styles'][v] = s props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f for f in props['custom_fields']] language = data['Variety'].get(lmd['glottocode']) if not language: language = data.add( Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name']) md['date_published'] = md['date_published'] or date.today().isoformat() if '-' not in md['date_published']: md['date_published'] = md['date_published'] + '-01-01' dictionary = data.add( Dictionary, id_, id=id_, number=md.get('number'), name=props.get('title', lmd['name'] + ' dictionary'), description=submission.description, language=language, published=date(*map(int, md['date_published'].split('-'))), jsondata=props) for i, spec in enumerate(md['authors']): if not isinstance(spec, dict): cname, address = spec, None spec = {} else: cname, address = spec['name'], spec.get('affiliation') name = HumanName(cname) cid = slug('%s%s' % (name.last, name.first)) contrib = data['Contributor'].get(cid) if not contrib: contrib = data.add( common.Contributor, cid, id=cid, name=cname, address=address, url=spec.get('url'), email=spec.get('email')) DBSession.add(common.ContributionContributor( ord=i + 1, primary=True, 
contributor=contrib, contribution=dictionary)) submissions.append((dictionary.id, language.id, submission)) transaction.commit() for did, lid, submission in submissions: #if submission.id != 'sidaama': # continue transaction.begin() print('loading %s ...' % submission.id) dictdata = Data() lang = Variety.get(lid) submission.load_examples(Dictionary.get(did), dictdata, lang) submission.dictionary.load( submission, dictdata, Dictionary.get(did), lang, comparison_meanings, OrderedDict(submission.md.get('properties', {}).get('labels', []))) transaction.commit() print('... done') transaction.begin() load_families( Data(), [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)], glottolog_repos='../../glottolog3/glottolog')
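# Each dictionary above is loaded inside its own transaction: the shared objects
# (dataset, editors, comparison meanings) are committed first, then every
# submission gets a begin/commit pair of its own, so a failing dictionary does
# not roll back the ones already loaded. Schematically, assuming the same
# `transaction` package used above:
import transaction

def load_all(submissions, load_one):
    transaction.commit()      # persist the shared objects
    for submission in submissions:
        transaction.begin()
        load_one(submission)  # per-dictionary work, as in the loop above
        transaction.commit()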
def load(args): fts.index('fts_index', models.Ref.fts, DBSession.bind) DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;") dataset = common.Dataset( id='glottolog', name="Glottolog {0}".format(args.args[0]), publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="https://shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='glottolog.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) data = Data() for i, (id_, name) in enumerate([ ('hammarstroem', 'Harald Hammarström'), ('bank', 'Sebastian Bank'), ('forkel', 'Robert Forkel'), ('haspelmath', 'Martin Haspelmath'), ]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) DBSession.add(dataset) clf = data.add(common.Contribution, 'clf', id='clf', name='Classification') DBSession.add(common.ContributionContributor( contribution=clf, contributor=data['Contributor']['hammarstroem'])) for pid, pname in [ ('fc', 'Family classification'), ('sc', 'Subclassification'), ('vitality', 'Degree of endangerment'), ]: data.add(common.Parameter, pid, id=pid, name=pname) legacy = jsonlib.load(gc2version(args)) for gc, version in legacy.items(): data.add(models.LegacyCode, gc, id=gc, version=version) glottolog = args.repos for ma in Macroarea: data.add( models.Macroarea, ma.name, id=ma.name, name=ma.value, description=ma.description) for country in glottolog.countries: data.add(models.Country, country.id, id=country.id, name=country.name) lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list) languoids = list(glottolog.languoids()) nodemap = {l.id: l for l in languoids} for lang in languoids: for ref in lang.sources: lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id) load_languoid(data, lang, nodemap) mas[lang.id] = [ma.name for ma in lang.macroareas] countries[lang.id] = [c.id for c in lang.countries] lgcodes[lang.id] = lang.id if lang.hid: lgcodes[lang.hid] = lang.id if lang.iso: lgcodes[lang.iso] = lang.id for gc in glottolog.glottocodes: if gc not in data['Languoid'] and gc not in legacy: common.Config.add_replacement(gc, None, model=common.Language) for obj in jsonlib.load(glottolog.references_path('replacements.json')): common.Config.add_replacement( '{0}'.format(obj['id']), '{0}'.format(obj['replacement']) if obj['replacement'] else None, model=common.Source) DBSession.flush() for lid, maids in mas.items(): for ma in maids: DBSession.add(models.Languoidmacroarea( languoid_pk=data['Languoid'][lid].pk, macroarea_pk=data['Macroarea'][ma].pk)) for lid, cids in countries.items(): for cid in cids: DBSession.add(models.Languoidcountry( languoid_pk=data['Languoid'][lid].pk, country_pk=data['Country'][cid].pk)) for doctype in glottolog.hhtypes: data.add( models.Doctype, doctype.id, id=doctype.id, name=doctype.name, description=doctype.description, abbr=doctype.abbv, ord=doctype.rank) for bib in glottolog.bibfiles: data.add( models.Provider, bib.id, id=bib.id, name=bib.title, description=bib.description, abbr=bib.abbr, url=bib.url) DBSession.flush() s = time() for i, entry in enumerate( BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()): if i % 10000 == 0: args.log.info('{0}: {1:.3}'.format(i, time() - s)) s = time() ref = load_ref(data, entry, lgcodes, lgsources) if 'macro_area' in entry.fields: for ma in split_text(entry.fields['macro_area'], separators=',;', 
strip=True): ma = 'North America' if ma == 'Middle America' else ma ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma) DBSession.add(models.Refmacroarea( ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
def test_from_csv(self):
    from tsammalex import models as m

    data = Data()
    data.add(common.Dataset, 'tsammalex', id='tsammalex')
    data.add(common.Contribution, 'tsammalex', id='tsammalex')
    data.add(m.Ecoregion, 'AT1309', id='AT1309')
    data.add(m.Bibrec, 'ref1', id='ref1')
    data.add(m.Bibrec, 'ref2', id='ref2')
    for name, cls in [
        ('contributors', m.TsammalexContributor),
        ('taxa', m.Taxon),
        ('lineages', m.Lineage),
        ('languages', m.Languoid),
        ('categories', m.Category),
        ('names', m.Name),
    ]:
        for row in DATA[name]:
            obj = cls.from_csv(row.split(','), data=data)
            data.add(cls, obj.id, _obj=obj)
def main(args): """Fills the database with data retrieved from tabular files. 'filltables()' iterates over each row of each table. 'typ' is the name of the table, 'name' a key column and 'tupl' the other columns as a dict {column_name,cell_value}.""" data = Data() count = 0 # dataset dataset = _addDataset(data) # load languages for typ, name, tupl in filltables(): if not name or name == "na": continue #TODO we exclude non core language if typ == "languages" and tupl.get('Id').startswith('L_'): #print(name, tupl) lang = _addLang([ name, tupl.get('Language', "na"), tupl.get('Family', "na"), tupl.get('fam_glottocode', ""), tupl.get('Area', "na"), tupl.get('Creator', "na"), tupl.get('Date', "na"), tupl.get('Archive', "na"), tupl.get('Archive_link', "na"), tupl.get('Translation', "na"), tupl.get('License', "na"), tupl.get('Audio license', "na"), tupl.get('NAKALA', "na"), tupl.get('Gloss', "na"), tupl.get('Words', 0), tupl.get('Speakers', 0), tupl.get('Texts', 0), tupl.get('Core words', 0), tupl.get('Core speakers', 0), tupl.get('Core texts', 0), tupl.get('Latitude', 0.0), tupl.get('Longitude', 0.0), tupl.get('Extended', "no") ]) add_language_codes(data, lang, tupl.get('iso-639-3'), glottocode=name) elif typ == "editors": dataset, count = _addEditor(dataset, count, [ name, tupl.get('url', "na"), tupl.get('email', "na"), tupl.get('address', "na"), tupl.get('team', "na"), tupl.get('function', "na") ]) elif typ == "sources": _addSource([ name, tupl.get('bibtex_type', "na"), tupl.get('author', "na"), tupl.get('year', "na"), tupl.get('title', "na"), tupl.get('url', "na"), tupl.get('note', "na") ]) else: #TODO for texts, we exclude delete and so on in column extended if tupl.get('extended') in ['no', 'yes']: #if typ=='dolg1241' : print(tupl):'' _addText([ typ, name, tupl.get('name', "na"), tupl.get('spk_code', "na"), tupl.get('spk_age', '0'), tupl.get('spk_age_c', "na"), tupl.get('spk_sex', "na"), tupl.get('rec_date', "na"), tupl.get('rec_date_c', "na"), tupl.get('genre', "na"), tupl.get('genre_stim', "na"), tupl.get('gloss', "na"), tupl.get('transl', "na"), tupl.get('sound', "na"), tupl.get('overlap', "na"), tupl.get('processed', "na"), tupl.get('nakala', "na"), tupl.get('words', 0), tupl.get('extended', "no") ]) # dataset # Note: needs to run after loading (for editors) DBSession.add(dataset) DBSession.flush()
def main(args): data = Data() dataset = common.Dataset( id=amsd.__name__, name="AMSD", publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='amsd.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) editors = OrderedDict([('Piers Kelly', None)]) # data_entry => Contributor for row in sorted(dicts('data_entry'), key=lambda x: [ x['name'].lower()] ): if row['name'] in editors: editors[row['name']] = row['pk'] data.add( common.Contributor, row['pk'], id=row['pk'], name=row['name'] ) for i, cid in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1) for row in dicts('source_citation'): data.add( common.Source, row['pk'], id=row['pk'], note=row['name'], name=row['name'], ) for row in dicts('ling_area'): data.add( models.ling_area, row['pk'], chirila_name = row['chirila_name'], austlang_code = row['austlang_code'], austlang_name = row['austlang_name'], glottolog_code = row['glottolog_code'], ) fd = {} for row in dicts('linked_filenames'): if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']: fd[row['pk']] = dict( name = row['name'], oid = row['oid'], path = row['path'], mimetype = mimetypes.guess_type(row['path'])[0], ) for m in 'item_type technique keywords material source_type sem_domain holder_file'.split(): for row in dicts(m): data.add( getattr(models, m), row['pk'], name = row['name'], ) DBSession.flush() # sticks => MessageStick no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type', 'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry', 'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file'] x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords', 'holder_file', 'item_type'] for i, row in enumerate(dicts('sticks')): fts_items = [] for col in row.keys(): if col: if col == 'amsd_id': fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),) elif col not in no_fts_cols and not col.endswith('_pk'): fts_items.append(row[col]) for t in x_cols: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(str(data[t][k])) fts_items.extend(str(data[t][k]).split('_')) for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']: if row[t]: for _, k in enumerate(row[t].split(';')): fts_items.append(data['ling_area'][k].chirila_name) fts_items.append(data['ling_area'][k].austlang_code) fts_items.append(data['ling_area'][k].austlang_name) fts_items.append(data['ling_area'][k].glottolog_code) if row['source_citation']: for k in row['source_citation'].split(';'): data.add( common.ContributionReference, k, contribution_pk = int(row['pk']), source_pk = int(k), ) fts_items.append(str(data['Source'][k])) if row['linked_filenames']: for j, k in enumerate(row['linked_filenames'].split(';')): if k in fd: oid = fd[k].get('oid') mt = fd[k].get('mimetype') refobjid = '' if mt == 'application/pdf': refobjid = oid # use for web, thumbnail a place holder image oid = 'EAEA0-52CC-0295-6B71-0' n = fd[k].get('name') data.add( common.Contribution_files, k, id='%s-%s-%i' % (k, row['pk'], j), object_pk = int(row['pk']), name = n, jsondata = dict( original = fd[k].get('path'), objid = oid, refobjid = refobjid, web = 'web.jpg', thumbnail = 'thumbnail.jpg', ), ord=j, mime_type = mt, ) fts_items.append(n) 
fts_items.extend(nfilter(re.split('[_\-\.]', n))) data.add( models.MessageStick, row['pk'], id = row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i), title = row['title'], description = row['description'], obj_creator = row['obj_creator'], date_created = row['date_created'], note_place_created = row['note_place_created'], place_created = row['place_created'], item_type_pk = row['item_type'] or None, ling_area_1_pk = row['ling_area_1'] or None, ling_area_2_pk = row['ling_area_2'] or None, ling_area_3_pk = row['ling_area_3'] or None, notes_ling_area = row['notes_ling_area'], stick_term = row['stick_term'], message = row['message'], motifs = row['motifs'], motif_transcription = row['motif_transcription'], dim_1 = row['dim_1'], dim_2 = row['dim_2'], dim_3 = row['dim_3'], date_collected = row['date_collected'], holder_file_pk = row['holder_file'] or None, holder_obj_id = row['holder_obj_id'], collector = row['collector'], place_collected = row['place_collected'], creator_copyright = row['creator_copyright'], file_copyright = row['file_copyright'], latitude = row['lat'] or None, longitude = row['long'] or None, notes_coords = row['notes_coords'], url_institution = row['url_institution'], url_source_1 = row['url_source_1'], url_source_2 = row['url_source_2'], irn = row['irn'], notes = row['notes'], data_entry = row['data_entry'], fts = fts.tsvector('\n'.join(re.sub('[_\-]','.',v) for v in fts_items)), ) DBSession.flush() for row in dicts('sticks'): for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']: if row[t]: for _, k in enumerate(row[t].split(';')): data.add( getattr(models, 'x_%s' % (t)), k, object_pk = int(row['pk']), item_pk = int(k), )
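# The fts column above is built by concatenating every searchable cell plus the
# names of linked vocabulary items and feeding the result to fts.tsvector. A
# stripped-down sketch of that collection step (the amsd_id special case and the
# underscore splitting are omitted), assuming `row`, `data`, `no_fts_cols` and
# `x_cols` as defined above:
def collect_fts_items(row, data, no_fts_cols, x_cols):
    items = []
    for col, value in row.items():
        if col and col not in no_fts_cols and not col.endswith('_pk') and value:
            items.append(value)
    for col in x_cols:
        if row.get(col):
            for key in row[col].split(';'):
                items.append(str(data[col][key]))
    return items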
def load( self, submission, did, lid, comparison_meanings, comparison_meanings_alt_labels, marker_map): data = Data() rel = [] vocab = models.Dictionary.get(did) lang = models.Variety.get(lid) for ex in Examples.from_file(self.dir.joinpath('examples.sfm')): data.add( common.Sentence, ex.id, id=ex.id, name=ex.text, language=lang, analyzed=ex.morphemes, gloss=ex.gloss, description=ex.translation) for i, entry in enumerate(self.sfm): words = list(entry.get_words()) headword = None for j, word in enumerate(words): if not word.meanings: print('no meanings for word %s' % word.form) continue if not headword: headword = word.id else: rel.append((word.id, 'sub', headword)) for tw in word.rel: rel.append((word.id, tw[0], tw[1])) w = data.add( models.Word, word.id, id='%s-%s-%s' % (submission.id, i + 1, j + 1), name=word.form, number=int(word.hm) if word.hm else 0, phonetic=word.ph, pos=word.ps, dictionary=vocab, language=lang) DBSession.flush() concepts = [] for k, meaning in enumerate(word.meanings): if not (meaning.ge or meaning.de): print('meaning without description for word %s' % w.name) continue if meaning.ge: meaning.ge = meaning.ge.replace('.', ' ') m = models.Meaning( id='%s-%s' % (w.id, k + 1), name=meaning.de or meaning.ge, description=meaning.de, gloss=meaning.ge, word=w, semantic_domain=', '.join(meaning.sd)) assert not meaning.x for xref in meaning.xref: s = data['Sentence'].get(xref) assert s models.MeaningSentence(meaning=m, sentence=s) key = (meaning.ge or meaning.de).replace('.', ' ').lower() concept = None if key in comparison_meanings: concept = comparison_meanings[key] elif key in comparison_meanings_alt_labels: concept = comparison_meanings_alt_labels[key] if concept and concept not in concepts: concepts.append(concept) vsid = '%s-%s' % (key, submission.id), if vsid in data['ValueSet']: vs = data['ValueSet'][vsid] else: vs = data.add( common.ValueSet, vsid, id='%s-%s' % (submission.id, m.id), language=lang, contribution=vocab, parameter_pk=concept) DBSession.add(models.Counterpart( id='%s-%s' % (w.id, k + 1), name=w.name, valueset=vs, word=w)) for _lang, meanings in word.non_english_meanings.items(): assert _lang in submission.md['metalanguages'] for meaning in meanings: k += 1 models.Meaning( id='%s-%s' % (w.id, k + 1), name=meaning, gloss=meaning, language=submission.md['metalanguages'][_lang], word=w) for index, (key, values) in enumerate(word.data.items()): if key in marker_map: label = marker_map[key] converter = default_value_converter if isinstance(label, (list, tuple)): label, converter = label for value in values: DBSession.add(common.Unit_data( object_pk=w.pk, key=label, value=converter(value, word.data), ord=index)) # FIXME: vgroup words by description and add synonym relationships! for s, d, t in rel: if s in data['Word'] and t in data['Word']: DBSession.add(models.SeeAlso( source_pk=data['Word'][s].pk, target_pk=data['Word'][t].pk, description=d)) else: print('---m---', s if s not in data['Word'] else t)
def main(args): data = Data() editors = OrderedDict() editors['Susanne Maria Michaelis'] = None editors['Philippe Maurer'] = None editors['Martin Haspelmath'] = None editors['Magnus Huber'] = None for row in read(args, 'People'): name = row['First name'] + ' ' if row['First name'] else '' name += row['Last name'] kw = dict( name=name, id=slug('%(Last name)s%(First name)s' % row), url=row['Contact Website'].split()[0] if row['Contact Website'] else None, address=row['Comments on database'], ) contrib = data.add(common.Contributor, row['Author ID'], **kw) if kw['name'] in editors: editors[kw['name']] = contrib DBSession.flush() dataset = common.Dataset( id='apics', name='APiCS Online', description='Atlas of Pidgin and Creole Language Structures Online', domain='apics-online.info', published=date(2013, 11, 4), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License' }) DBSession.add(dataset) for i, editor in enumerate(editors.values()): common.Editor(dataset=dataset, contributor=editor, ord=i + 1) colors = dict( (row['ID'], row['RGB_code']) for row in read(args, 'Colours')) abbrs = {} for id_, name in LGR_ABBRS.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for id_, name in { 'C**T': 'clitic', 'IMPF': 'imperfect', 'INTERM': 'intermediate', 'NCOMPL': 'noncompletive', 'NONFUT': 'nonfuture', 'NPROX': 'nonproximal', 'NSG': 'nonsingular', 'PP': 'past participle', 'PROP': 'proprietive', 'TMA': 'tense-mood-aspect', }.items(): DBSession.add(common.GlossAbbreviation(id=id_, name=name)) abbrs[id_] = 1 for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'), delimiter=',', namedtuples=True): for match in GLOSS_ABBR_PATTERN.finditer(row.standard): if match.group('abbr') not in abbrs: abbrs[match.group('abbr')] = 1 DBSession.add( common.GlossAbbreviation(id=match.group('abbr'), name=row.meaning)) non_bibs = {} for row in read(args, 'References', 'Reference_ID'): if row['Reference_type'] == 'Non-bib': non_bibs[row['Reference_ID']] = row['Reference_name'] continue if isinstance(row['Year'], int): year_int = row['Year'] year = str(row['Year']) elif row['Year']: year_int = None for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']): year_int = int(m.group('year')) break year = row['Year'] else: year, year_int = None, None title = row['Article_title'] or row['Book_title'] attrs = {} jsondata = {} for attr, field in { 'Additional_information': 'note', 'Article_title': 'title', 'Book_title': 'booktitle', 'City': 'address', 'Editors': 'editor', 'Full_reference': None, 'Issue': None, 'Journal': 'journal', 'Language_codes': None, 'LaTeX_cite_key': None, 'Pages': 'pages', 'Publisher': 'publisher', 'Reference_type': 'type', 'School': 'school', 'Series_title': 'series', 'URL': 'url', 'Volume': 'volume', }.items(): value = row.get(attr) if not isinstance(value, int): value = (value or '').strip() if attr == 'Issue' and value: try: value = str(int(value)) except ValueError: pass if value: if field: attrs[field] = value else: jsondata[attr] = value p = data.add(common.Source, row['Reference_ID'], id=str(row['Reference_ID']), name=row['Reference_name'], description=title, author=row['Authors'], year=year, year_int=year_int, bibtex_type=getattr(EntryType, row['BibTeX_type'] or 'misc'), jsondata=jsondata, **attrs) if p.bibtex_type.value == 'misc' and not p.description: p.description = p.note DBSession.flush() DBSession.flush() infobox = 
jsonload(args.data_file('infobox.json')) glottocodes = jsonload(args.data_file('glottocodes.json')) for row in read(args, 'Languages', 'Order_number'): lon, lat = [ float(c.strip()) for c in row['map_coordinates'].split(',') ] kw = dict( name=row['Language_name'], id=str(row['Order_number']), latitude=lat, longitude=lon, region=row['Category_region'], ) lect = data.add(models.Lect, row['Language_ID'], **kw) DBSession.flush() for i, item in enumerate(infobox[lect.id]): DBSession.add( common.Language_data(object_pk=lect.pk, ord=i, key=item[0], value=item[1])) if row["Languages_contribution_documentation::Lect_description_checked_status"] \ != "Checked": print 'unchecked! ---', row['Language_name'] desc = row.get( 'Languages_contribution_documentation::Lect description', '') markup_desc = normalize_markup(row[ 'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description'] ) c = data.add( models.ApicsContribution, row['Language_ID'], id=str(row['Order_number']), name=row['Language_name'], description=desc, markup_description=markup_desc, survey_reference=data['Source'][row['Survey_reference_ID']], language=lect) for ext, label, mtype in [ ('pdf', 'Glossed text', 'application/pdf'), ('mp3', 'Glossed text audio', 'audio/mpeg'), ]: fid = '%s-gt.%s' % (c.id, ext) if args.data_file('files', 'contribution', c.id, fid).exists(): common.Contribution_files(object=c, id=fid, name=label, mime_type=mtype) else: print label, 'missing for:', row['Language_name'] # # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE # iso = None if row['ISO_code'] and len(row['ISO_code']) == 3: iso = row['ISO_code'].lower() if 'iso:%s' % row['ISO_code'] not in data['Identifier']: data.add(common.Identifier, 'iso:%s' % row['ISO_code'], id=row['ISO_code'].lower(), name=row['ISO_code'].lower(), type=common.IdentifierType.iso.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier']['iso:%s' % row['ISO_code']])) if lect.id in glottocodes: identifier = data.add(common.Identifier, 'gc:%s' % glottocodes[lect.id], id=glottocodes[lect.id], name=glottocodes[lect.id], type=common.IdentifierType.glottolog.value) DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=identifier)) if row['Language_name_ethnologue']: if row['Language_name_ethnologue'] not in data['Identifier']: data.add(common.Identifier, row['Language_name_ethnologue'], id=iso or 'ethnologue:%s' % row['Language_name_ethnologue'], name=row['Language_name_ethnologue'], type='ethnologue') DBSession.add( common.LanguageIdentifier( language=data['Lect'][row['Language_ID']], identifier=data['Identifier'][ row['Language_name_ethnologue']])) example_count = {} for row in read(args, 'Examples', 'Order_number'): assert row['Language_ID'] lang = data['Lect'][row['Language_ID']] id_ = '%(Language_ID)s-%(Example_number)s' % row atext, gloss = igt(row) example_count[row['Language_ID']] = max( [example_count.get(row['Language_ID'], 1), row['Example_number']]) p = add_sentence( args, data, id_, id='%s-%s' % (lang.id, row['Example_number']), name=row['Text'] or row['Analyzed_text'], description=row['Translation'], type=row['Type'].strip().lower() if row['Type'] else None, comment=row['Comments'], gloss=gloss, analyzed=atext, markup_text=normalize_markup(row['z_calc_Text_CSS']), markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']), markup_comment=normalize_markup(row['z_calc_Comments_CSS']), 
markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']), original_script=row['Original_script'], jsondata={ 'sort': row['Order_number'], 'alt_translation': (row['Translation_other'] or '').strip() or None }, language=lang) if row['Reference_ID']: if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.SentenceReference( sentence=p, source=source, key=source.id, description=row['Reference_pages'])) else: p.source = non_bibs[row['Reference_ID']] DBSession.flush() for row in read(args, 'Language_references'): if row['Reference_ID'] not in data['Source']: assert row['Reference_ID'] in non_bibs continue assert row['Language_ID'] in data['ApicsContribution'] source = data['Source'][row['Reference_ID']] DBSession.add( common.ContributionReference( contribution=data['ApicsContribution'][row['Language_ID']], source=source, description=row['Pages'], key=source.id)) # # global counter for features - across feature types # feature_count = 0 for row in read(args, 'Features', 'Feature_number'): id_ = str(row['Feature_number']) if int(id_) > feature_count: feature_count = int(id_) wals_id = None desc = row['Feature_annotation_publication'] if row['WALS_match'] == 'Total': if isinstance(row['WALS_No.'], int): wals_id = row['WALS_No.'] else: wals_id = int(row['WALS_No.'].split('.')[0].strip()) p = data.add(models.Feature, row['Feature_code'], name=row['Feature_name'], id=id_, description=desc, markup_description=normalize_markup( row['z_calc_Feature_annotation_publication_CSS']), feature_type='primary', multivalued=row['Value_relation_type'] != 'Single', area=row['Feature_area'], wals_id=wals_id) names = {} for i in range(1, 10): if not row['Value%s_publication' % i] \ or not row['Value%s_publication' % i].strip(): continue name = row['Value%s_publication' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 de = data.add( common.DomainElement, '%s-%s' % (row['Feature_code'], i), id='%s-%s' % (id_, i), name=name, parameter=p, abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name, number=int(row['Value%s_value_number_for_publication' % i]), jsondata={'color': colors[row['Value_%s_colour_ID' % i]]}, ) assert de if row['Authors_FeatureArticles']: authors, _ = row['Authors_FeatureArticles'].split('and the APiCS') authors = authors.strip() if authors.endswith(','): authors = authors[:-1].strip() for i, name in enumerate(authors.split(',')): assert name.strip() in editors p._authors.append( models.FeatureAuthor(ord=i + 1, contributor=editors[name.strip()])) DBSession.flush() primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41} segment_to_primary = dict( zip(primary_to_segment.values(), primary_to_segment.keys())) number_map = {} names = {} for row in read(args, 'Segment_features', 'Order_number'): symbol = row['Segment_symbol'] if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate': symbol = 't\u0361s' truth = lambda s: s and s.strip().lower() == 'yes' name = '%s - %s' % (symbol, row['Segment_name']) if name in names: number_map[row['Segment_feature_number']] = names[name] continue number_map[ row['Segment_feature_number']] = row['Segment_feature_number'] names[name] = row['Segment_feature_number'] feature_count += 1 if row['Segment_feature_number'] in segment_to_primary: primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\ = str(feature_count) p = data.add(models.Feature, row['Segment_feature_number'], name=name, id=str(feature_count), feature_type='segment', area='Vowels' if 
truth(row['Vowel']) else ('Obstruent consonants' if truth(row['Obstruent']) else 'Sonorant consonants'), jsondata=dict( number=int(row['Segment_feature_number']), vowel=truth(row['Vowel']), consonant=truth(row['Consonant']), obstruent=truth(row['Obstruent']), core_list=truth(row['Core_list_segment']), symbol=symbol, )) for i, spec in SEGMENT_VALUES.items(): data.add(common.DomainElement, '%s-%s' % (row['Segment_feature_number'], spec[0]), id='%s-%s' % (p.id, i), name=spec[0], parameter=p, jsondata={'color': spec[1]}, number=i) print '--> remapped:', primary_to_segment DBSession.flush() for row in read(args, 'Sociolinguistic_features', 'Sociolinguistic_feature_number'): feature_count += 1 p = data.add(models.Feature, row['Sociolinguistic_feature_code'], name=row['Sociolinguistic_feature_name'], id='%s' % feature_count, description=row['Sociolinguistic_feature_annotation'], area='Sociolinguistic', feature_type='sociolinguistic') names = {} for i in range(1, 10): id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i) if row.get('Value%s' % i) and row['Value%s' % i].strip(): name = row['Value%s' % i].strip() if name in names: name += ' (%s)' % i names[name] = 1 else: continue kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i) data.add(common.DomainElement, id_, id='%s-%s' % (p.id, i), name=name, parameter=p, number=i, jsondata={ 'color': colors.get(row['Value%s_colour_ID' % i], colors.values()[i]) }) sd = {} for row in read(args, 'Segment_data'): if row['Segment_feature_number'] not in number_map: continue number = number_map[row['Segment_feature_number']] if not row['Presence_in_the_language']: continue lang = data['Lect'][row['Language_ID']] param = data['Feature'][number] id_ = '%s-%s' % (lang.id, param.id) if id_ in sd: assert row['c_Record_is_a_duplicate'] == 'Yes' continue sd[id_] = 1 valueset = data.add( common.ValueSet, id_, id=id_, parameter=param, language=lang, contribution=data['ApicsContribution'][row['Language_ID']], description=row['Comments'], markup_description=normalize_markup(row['z_calc_Comments_CSS']), ) v = data.add( common.Value, id_, id=id_, frequency=float(100), valueset=valueset, domainelement=data['DomainElement'][ '%s-%s' % (number, row['Presence_in_the_language'])], ) if row['Example_word'] and row['Example_word_gloss']: example_count[row['Language_ID']] += 1 p = add_sentence(args, data, '%s-p%s' % (lang.id, data['Feature'][number].id), id='%s-%s' % (lang.id, example_count[row['Language_ID']]), name=row['Example_word'], description=row['Example_word_gloss'], language=lang) DBSession.add(common.ValueSentence(value=v, sentence=p)) source = data['Source'].get(row['Refers_to_references_Reference_ID']) if source: DBSession.add( common.ValueSetReference(valueset=valueset, source=source, key=source.id)) elif row['Refers_to_references_Reference_ID'] in non_bibs: valueset.source = non_bibs[ row['Refers_to_references_Reference_ID']] lects = defaultdict(lambda: 1) lect_map = {} records = {} false_values = {} no_values = {} wals_value_number = {} for row in read(args, 'wals'): if row['z_calc_WALS_value_number']: wals_value_number[ row['Data_record_id']] = row['z_calc_WALS_value_number'] def prefix(attr, _prefix): if _prefix: return '%s_%s' % (_prefix, attr) return attr.capitalize() for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]: num_values = 10 for row in read(args, prefix('data', _prefix)): if not row[prefix('feature_code', _prefix)]: print('no associated feature for', prefix('data', _prefix), row[prefix('data_record_id', _prefix)]) continue lid 
= row['Language_ID'] lect_attr = row.get('Lect_attribute', 'my default lect').lower() if lect_attr != 'my default lect': if (row['Language_ID'], row['Lect_attribute']) in lect_map: lid = lect_map[(row['Language_ID'], row['Lect_attribute'])] else: lang = data['Lect'][row['Language_ID']] c = lects[row['Language_ID']] lid = '%s-%s' % (row['Language_ID'], c) kw = dict( name='%s (%s)' % (lang.name, row['Lect_attribute']), id='%s' % (1000 + 10 * int(lang.id) + c), latitude=lang.latitude, longitude=lang.longitude, description=row['Lect_attribute'], language=lang, ) data.add(models.Lect, lid, **kw) lects[row['Language_ID']] += 1 lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid id_ = abbr + str(row[prefix('data_record_id', _prefix)]) assert id_ not in records records[id_] = 1 assert row[prefix('feature_code', _prefix)] in data['Feature'] language = data['Lect'][lid] parameter = data['Feature'][row[prefix('feature_code', _prefix)]] valueset = common.ValueSet( id='%s-%s' % (language.id, parameter.id), description=row['Comments_on_value_assignment'], markup_description=normalize_markup( row.get('z_calc_Comments_on_value_assignment_CSS')), ) values_found = {} for i in range(1, num_values): if not row['Value%s_true_false' % i]: continue if row['Value%s_true_false' % i].strip().lower() != 'true': assert row['Value%s_true_false' % i].strip().lower() == 'false' false_values[row[prefix('data_record_id', _prefix)]] = 1 continue iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i) if iid not in data['DomainElement']: print(iid, row[prefix('data_record_id', _prefix)], '--> no domainelement!') continue values_found['%s-%s' % (id_, i)] = dict( id='%s-%s' % (valueset.id, i), domainelement=data['DomainElement']['%s-%s' % (row[prefix( 'feature_code', _prefix)], i)], confidence=row['Value%s_confidence' % i], frequency=float(row['c_V%s_frequency_normalised' % i]) if _prefix == '' else 100) if values_found: if row[prefix('data_record_id', _prefix)] in wals_value_number: valueset.jsondata = { 'wals_value_number': wals_value_number.pop(row[prefix( 'data_record_id', _prefix)]) } valueset.parameter = parameter valueset.language = language valueset.contribution = data['ApicsContribution'][ row['Language_ID']] valueset = data.add(common.ValueSet, id_, _obj=valueset) for i, item in enumerate(values_found.items()): if i > 0 and not parameter.multivalued: print 'multiple values for single-valued parameter: %s' % id_ break id_, kw = item kw['valueset'] = valueset value = data.add(common.Value, id_, **kw) # # store references to additional data for segments which should be reused # for corresponding primary features! 
# if int(parameter.id) in primary_to_segment: assert len(values_found) == 1 seg_id = '%s-%s' % (language.id, primary_to_segment[int( parameter.id)]) seg_valueset = data['ValueSet'][seg_id] seg_value = data['Value'][seg_id] if not valueset.description and seg_valueset.description: valueset.description = seg_valueset.description for s in seg_value.sentence_assocs: DBSession.add( common.ValueSentence(value=value, sentence=s.sentence)) for r in seg_valueset.references: DBSession.add( common.ValueSetReference(valueset=valueset, source=r.source, key=r.key)) if not valueset.source and seg_valueset.source: valueset.source = seg_valueset.source DBSession.flush() else: no_values[id_] = 1 DBSession.flush() for prefix, abbr, num_values in [ ('D', '', 10), ('Sociolinguistic_d', 'sl', 7), ]: for row in read(args, prefix + 'ata_references'): assert row['Reference_ID'] in data['Source'] \ or row['Reference_ID'] in non_bibs try: vs = data['ValueSet'][abbr + str(row[prefix + 'ata_record_id'])] if row['Reference_ID'] in data['Source']: source = data['Source'][row['Reference_ID']] DBSession.add( common.ValueSetReference( valueset=vs, source=source, key=source.id, description=row['Pages'], )) else: if vs.source: vs.source += '; ' + non_bibs[row['Reference_ID']] else: vs.source = non_bibs[row['Reference_ID']] except KeyError: continue DBSession.flush() missing = 0 for row in read(args, 'Value_examples'): try: DBSession.add( common.ValueSentence( value=data['Value']['%(Data_record_id)s-%(Value_number)s' % row], sentence=data['Sentence'][ '%(Language_ID)s-%(Example_number)s' % row], description=row['Notes'], )) except KeyError: missing += 1 print('%s Value_examples are missing data' % missing) print('%s data sets with false values' % len(false_values)) print('%s data sets without values' % len(no_values)) for k, v in wals_value_number.items(): print 'unclaimed wals value number:', k, v for i, row in enumerate(read(args, 'Contributors')): kw = dict(contribution=data['ApicsContribution'][row['Language ID']], contributor=data['Contributor'][row['Author ID']]) if row['Order_of_appearance']: kw['ord'] = int(float(row['Order_of_appearance'])) data.add(common.ContributionContributor, i, **kw) DBSession.flush()
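# Standalone re-statement (the helper name `split_year` is an assumption, not
# part of the APiCS loader above) of the year normalisation applied to the
# 'References' rows: integer years pass through, free-form strings such as
# "ca. 1930-1935" are reduced to the first plausible four-digit year for
# sorting, while the original string is kept for display.
import re


def split_year(raw):
    """Return (display_year, sortable_int) for a raw 'Year' field."""
    if isinstance(raw, int):
        return str(raw), raw
    if not raw:
        return None, None
    m = re.search(r'(?P<year>[12][0-9]{3})', raw)
    return raw, int(m.group('year')) if m else None


if __name__ == '__main__':
    assert split_year(2005) == ('2005', 2005)
    assert split_year('ca. 1930-1935') == ('ca. 1930-1935', 1930)
    assert split_year('') == (None, None)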
def main(args): data = Data() def maybe_int(c): try: return int(c.value) except Exception: return None contributors = {} xls = xlrd.open_workbook(args.data_file('eWAVE2-Contributors.xlsx')) sheet = xls.sheet_by_name('Tabelle1') fields = [sheet.cell(0, i).value for i in range(sheet.ncols)] for i in range(1, sheet.nrows): values = dict(zip(fields, [sheet.cell(i, j).value for j in range(sheet.ncols)])) contributors[slug(values['Voller Name'])] = values xls = xlrd.open_workbook(args.data_file('ewave.xls')) varieties = {} values = {} matrix = xls.sheet_by_name('matrixRAW-quer') features = [maybe_int(matrix.cell(0, i)) for i in range(matrix.ncols)] for i in range(3, matrix.nrows): values[maybe_int(matrix.cell(i, 1))] = dict( (features[j], matrix.cell(i, j).value.upper()) for j in range(6, matrix.ncols) if features[j]) features = {n: dict(name=matrix.cell(1, i).value) for i, n in enumerate(features)} sheet = xls.sheet_by_name('Example sources') for i in range(sheet.nrows): id = maybe_int(sheet.cell(i, 0)) if id in features: features[id]['example'] = sheet.cell(i, 2).value features[id]['example_source'] = sheet.cell(i, 2).value sheet = xls.sheet_by_name('var-infrmnts-type-regn-lat-lon') for i in range(sheet.nrows): if i == 0: cols = [sheet.cell(i, j).value.lower() for j in range(sheet.ncols)] else: varieties[int(sheet.cell(i, 0).value)] = dict( (cols[j], sheet.cell(i, j).value) for j in range(sheet.ncols)) dataset = common.Dataset( id=ewave.__name__, name='eWAVE', description='The Electronic World Atlas of Varieties of English', domain='ewave-atlas.org', published=date(2013, 11, 15), license='http://creativecommons.org/licenses/by/3.0/', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 3.0 Unported License'}) DBSession.add(dataset) common.Editor(dataset=dataset, contributor=common.Contributor(id='ed1', name='Bernd Kortmann'), ord=1) common.Editor(dataset=dataset, contributor=common.Contributor(id='ed2', name='Kerstin Lunkenheimer'), ord=2) for id, name, description in [ ('1', 'Pronouns', 'Pronouns, pronoun exchange, nominal gender'), ('2', 'Noun Phrase', 'Noun phrase'), ('3', 'Tense & Aspect', 'Verb phrase I: tense and aspect'), ('4', 'Modal Verbs', 'Verb phrase II: modal verbs'), ('5', 'Verb Morphology', 'Verb phrase III: verb morphology'), #('6', 'Voice', 'Verb phrase IV: voice'), ('6', 'Negation', 'Negation'), ('7', 'Agreement', 'Agreement'), ('8', 'Relativization', 'Relativization'), ('9', 'Complementation', 'Complementation'), ('10', 'Adverbial Subordination', 'Adverbial Subordination'), ('11', 'Adverbs & Prepositions', 'Adverbs and prepositions'), ('12', 'Discourse & Word Order', 'Discourse organization and word order'), ]: data.add( models.FeatureCategory, name, id=id, name=name, description=description) data['FeatureCategory']['Voice'] = data['FeatureCategory']['Verb Morphology'] icons = { 'L1t': {'shape': 's', 'color': 'f38847', 'broad': 'L1'}, 'L1c': {'shape': 'd', 'color': 'd22257', 'broad': 'L1'}, 'L2': {'shape': 'c', 'color': 'a0fb75', 'broad': 'L2'}, 'Cr': {'shape': 't', 'color': 'cb9a34', 'broad': 'P/C'}, 'P': {'shape': 'f', 'color': '4d6cee', 'broad': 'P/C'}, } for cat in read(args, 'language_cat'): cls = models.VarietyType if cat['name'] == 'cat1' else models.Region if cat['name'] == 'cat1' and cat['value'] not in icons: raise ValueError(cat['value']) data.add( cls, cat['value'], id=cat['value'], name=cat['name1'], description=cat['definition'], jsondata=icons.get(cat['value'])) for lang in read(args, 'language'): keys 
= ['id', 'name', 'latitude', 'longitude'] l = data.add( models.Variety, lang['id'], region=data['Region'][lang['cat2']], type=data['VarietyType'][lang['cat1']], **{k: v for k, v in lang.items() if k in keys}) data.add( models.WaveContribution, lang['id'], id=str(lang['id']), name=lang['name'], description=lang['spec1'], variety=l) for author in read(args, 'o1_author'): contributor = contributors[slug(author['first_name'] + author['last_name'])] data.add( common.Contributor, author['id'], id=str(author['id']), name=contributor['Voller Name'], address=contributor['Affiliation'], email=contributor['E-Mail'], url=contributor['Website']) abbr2lang = {} new_langs = [] desc = { 75: "Philippine English is one of the very few American-transplanted Englishes. " "The language was introduced in the country by American colonization that " "started in 1898. From only 300,000 users or 4% of the population at the " "beginning of the 20th century, it is estimated that there were around 42 " "million or 70% of the population who are able to use English, almost fifty " "years after the American colonization ended at the end of the century " "(Gonzalez, 1996). In the implementing 1987 Constitution, English is regarded as " "one of the two official languages of the Philippines, the other one being the " "national language Filipino. It also interacts with 180 other Austronesian-type " "languages used in the country, nine of them considered major languages. English " "plays a major role in the Philippine society, offering a rightfully unique " "rendering of the psycho-sociolinguistic phenomenon of the spread of English: " "A sizeable number of Filipinos even learn it as a first language (and sometimes " "only language). The language is widely used in government, education, business, " "science and technology, and the arts but it has also penetrated the personal " "and private lives of Filipinos, where code-switching can be prevalent. " "Proficiency in English may also be equated with socio-economic status; those " "with higher socio-economic status tend to be more proficient in the language. " "Philippine English is presently entering a stage of structural " "systematicization (cf. Borlongan & Lim, 2012) and is being codified through " "dictionaries and grammars. Consequently, some claims are made that Philippine " "English is already at the phase of endonormative stabilization (Borlongan, 2011)." 
} for vid, v in varieties.items(): if vid not in data['Variety']: new_langs.append(vid) l = data.add( models.Variety, vid, id=str(vid), name=v['variety'], latitude=v['latitude'], longitude=v['longitude'], region=[r for r in data['Region'].values() if r.name == v['world region']][0], type=data['VarietyType'][v['variety type (narrow)']]) contribution = data.add( models.WaveContribution, vid, id=str(vid), name=l.name, description=desc.get(vid, ''), variety=l) if v['contributor(s)'] == 'Rajend Mesthrie': v['contributor(s)'] = 'Rajend Mesthrie and Tracey Toefy and Sean Bowerman' for name in v['contributor(s)'].split(' and '): contributor = None name = name.strip() maxid = 0 for c in data['Contributor'].values(): if int(c.id) > maxid: maxid = int(c.id) if c.name == name: contributor = c print '--- already known:', name if not contributor: maxid += 1 contributor = data.add( common.Contributor, maxid, id=str(maxid), name=name) DBSession.add(common.ContributionContributor( contributor=contributor, contribution=contribution)) else: l = data['Variety'][vid] l.abbr = v['abbreviation'].strip() abbr2lang[l.abbr] = l for author in read(args, 'o1_author'): for lang in filter(None, [l.strip() for l in author['langIDs'].split(',')]): DBSession.add(common.ContributionContributor( contributor=data['Contributor'][author['id']], contribution=data['WaveContribution'][int(lang)])) domain = { 'A': ('feature is pervasive or obligatory', {'color': 'fe3856'}), 'B': ('feature is neither pervasive nor extremely rare', {'color': 'ed9c07'}), 'C': ('feature exists, but is extremely rare', {'color': 'efe305'}), 'D': ('attested absence of feature', {'color': 'f3ffb0'}), 'X': ('feature is not applicable (given the structural make-up of the variety/P/C)', {'color': 'e8e8e8'}), '?': ('no information on feature is available', {'color': 'ffffff'}), } for param in read(args, 'lparam'): data.add( models.Feature, param['id'], id=str(param['id']), category=data['FeatureCategory'][param['cat1']], name=param['name'], description=param['name1'], jsondata={'example_source': param['spec1']}) for de in read(args, 'lparamshaping'): desc, jsondata = domain[de['name']] data.add( common.DomainElement, de['id'], id=str(de['id']), parameter=data['Feature'][de['lparam_id']], name=de['name'], description=desc, jsondata=jsondata, number=de['number']) # values: changes = [] maxid = 0 for value in read(args, 'llps'): if not int(value['value']): continue if value['id'] > maxid: maxid = value['id'] de = data['DomainElement'][value['lparamshaping_id']] if de.name != values[value['language_id']][int(de.parameter.id)]: new_de = None for _de in de.parameter.domain: if _de.name == values[value['language_id']][int(de.parameter.id)]: new_de = _de break if not new_de or new_de == de: print values[value['language_id']][int(de.parameter.id)], ' =?= ', de.name changes.append((str(value['language_id']), de.parameter.id, de.name, values[value['language_id']][int(de.parameter.id)])) de = new_de vs = data.add( common.ValueSet, value['id'], id=str(value['id']), contribution=data['WaveContribution'][value['language_id']], parameter=de.parameter, jsondata=de.jsondata, language=data['Variety'][value['language_id']]) data.add( common.Value, value['id'], id=str(value['id']), domainelement=de, valueset=vs) dataset.jsondata['changes'] = {'2013': changes} print len(changes), 'values changed' for new_lang in new_langs: for param, value in values[new_lang].items(): if new_lang == 75 and param == 195 and not value: value = '?' 
maxid += 1 parameter = data['Feature'][param] de = None for _de in parameter.domain: if _de.name == value: de = _de assert de vs = data.add( common.ValueSet, maxid, id=str(maxid), contribution=data['WaveContribution'][new_lang], parameter=parameter, jsondata=de.jsondata, language=data['Variety'][new_lang]) data.add( common.Value, maxid, id=str(maxid), domainelement=de, valueset=vs) DBSession.flush() for rec in bibtex.Database.from_file(args.data_file('eWAVE2References.bib')): data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec)) for i, example in enumerate(excel.rows(xlrd.open_workbook(args.data_file('eWAVE2-Examples_tidy-1.xlsx')).sheets()[0], as_dict=True)): if example['primary_text'] == 'Cf. Table 1 in section 3.1': continue lang = abbr2lang[example['language']] if isinstance(example['feature number'], basestring): fid = re.match('([0-9]+)', example['feature number']).groups()[0] else: fid = example['feature number'] fid = str(int(fid)) s = data.add( common.Sentence, i+1, id=str(i+1), name=example['primary_text'], gloss=example['gloss'] or None, comment=example['comment'] or None, description=example['translation'] or None, language=lang) for ref in (example['Source'] or '').split(';'): if ref: ref = ref.strip() desc = None if ':' in ref: ref, desc = [_s.strip() for _s in ref.split(':', 1)] recid = slug(ref) recid = { 'allsopp996': 'allsopp1996', 'orton1962': 'orton19621971', 'bbcvoices': 'voices', 'cottmann1963': 'cottman1963', 'mooreetal1991': 'moore1991', }.get(recid, recid) if recid not in data['Source']: assert recid == '50' continue DBSession.add(common.SentenceReference( sentence=s, source=data['Source'][recid], description=desc, key=ref)) vs = DBSession.query(common.ValueSet)\ .join(common.Parameter).join(common.Language)\ .filter(common.Parameter.id == fid)\ .filter(common.Language.pk == lang.pk).one() DBSession.add(common.ValueSentence(sentence=s, value=vs.values[0]))
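# Minimal sketch mirroring the eWAVE attestation ratings used in main() above;
# the mapping repeats the `domain` dict from the loader, and the lookup helper
# `describe_rating` is illustrative only.
EWAVE_RATINGS = {
    'A': ('feature is pervasive or obligatory', 'fe3856'),
    'B': ('feature is neither pervasive nor extremely rare', 'ed9c07'),
    'C': ('feature exists, but is extremely rare', 'efe305'),
    'D': ('attested absence of feature', 'f3ffb0'),
    'X': ('feature is not applicable (given the structural make-up of the variety/P/C)', 'e8e8e8'),
    '?': ('no information on feature is available', 'ffffff'),
}


def describe_rating(letter):
    """Return (description, hex colour) for a rating letter, defaulting to '?'."""
    return EWAVE_RATINGS.get((letter or '').strip().upper() or '?', EWAVE_RATINGS['?'])


if __name__ == '__main__':
    assert describe_rating('a')[1] == 'fe3856'
    assert describe_rating('')[0].startswith('no information')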