def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(OrderedDict([
        ('dc:title', 'environment'),
        ('properties', OrderedDict([
            ('glottolog_version', git_describe(glottolog.repos)),
        ]))]))
    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot", "periodicGlottalSource",
        "epilaryngealSource", "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv',
        'ID', 'Name', 'Contributor_ID',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL')
    ds.add_table(
        'contributors.csv',
        'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
        ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID,
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids \
            else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
            )
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid_map[row.Parameter_ID],
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
def cmd_makecldf(self, args):
    glottolog = Glottolog(args.glottolog.dir)
    clts = CLTS(Config.from_file().get_clone('clts'))
    bipa = clts.bipa
    clts_eurasian = clts.transcriptiondata_dict['eurasian']

    args.writer.cldf.add_columns(
        "ValueTable",
        {"name": "Marginal", "datatype": "boolean"},
        {"name": "Value_in_Source", "datatype": "string"})
    args.writer.cldf.add_columns(
        'ParameterTable',
        {'name': 'CLTS_BIPA', 'datatype': 'string'},
        {'name': 'CLTS_Name', 'datatype': 'string'})
    args.writer.cldf.add_component("LanguageTable", "Family", "Glottolog_Name")

    # Load the language mapping and build the language table
    languages = []
    lang_map = {}
    all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
    unknowns = defaultdict(list)
    for row in progressbar(self.etc_dir.read_csv("languages.csv", dicts=True)):
        lang_map[row["name"]] = slug(row["name"])
        lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
        if row["glottocode"] in all_glottolog:
            lang = all_glottolog[row["glottocode"]]
            lang_dict.update({
                "Family": lang.family if lang.lineage else None,
                "Glottocode": lang.id,
                "ISO639P3code": lang.iso_code,
                "Latitude": lang.latitude,
                "Longitude": lang.longitude,
                "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                "Glottolog_Name": lang.name,
            })
        languages.append(lang_dict)

    # Read raw data
    with open(self.raw_dir.joinpath('phono_dbase.json').as_posix()) as handler:
        raw_data = json.load(handler)

    # Iterate over raw data
    values = []
    parameters = []
    inventories = []
    counter = 1
    segment_set = set()
    with open(self.raw_dir.joinpath('sources.txt').as_posix()) as f:
        sources = [source.strip() for source in f.readlines()][1:]
    sources_ = Sources.from_file(self.raw_dir / "sources.bib")
    args.writer.cldf.add_sources(*sources_)
    for idx, (language, langdata) in enumerate(raw_data.items()):
        cons = langdata["cons"]
        vows = langdata["vows"]
        tones = [tone for tone in langdata["tones"] if tone]
        source = sources[idx]

        # Prepare language key
        lang_key = language.split("#")[0].replace(",", "")

        # Add consonants and vowels to values, also collecting parameters
        for segment in cons + vows:
            marginal = bool(segment[0] == "(")

            # Obtain the corresponding BIPA grapheme, if possible
            normalized = normalize_grapheme(segment)
            par_id = compute_id(normalized)
            if normalized in clts_eurasian.grapheme_map:
                sound = bipa[clts_eurasian.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                unknowns[normalized] += [(segment, lang_key)]
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name
            parameters.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lang_map[lang_key],
                "Marginal": marginal,
                "Parameter_ID": par_id,
                "Value": normalized,
                "Value_in_Source": segment,
                "Source": [source],
            })
            counter += 1

    # Build segment data; keys are aligned with the ParameterTable columns declared above
    segments = [
        {"ID": id, "Name": normalized, "CLTS_BIPA": bipa_grapheme, "CLTS_Name": desc}
        for id, normalized, bipa_grapheme, desc in set(parameters)]

    # Write data and validate
    args.writer.write(**{
        "ValueTable": values,
        "LanguageTable": languages,
        "ParameterTable": segments,
    })
    for g, rest in unknowns.items():
        print('\t'.join([repr(g), str(len(rest)), g]))
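
# The cmd_makecldf implementation above follows the cldfbench API. Below is a minimal
# sketch of the Dataset class that would host it; the dataset id and the download step
# are assumptions for illustration, not taken from the original module.
import pathlib

from cldfbench import CLDFSpec
from cldfbench import Dataset as BaseDataset


class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "eurasianinventories"  # assumed id, chosen for illustration

    def cldf_specs(self):
        # cmd_makecldf writes phoneme inventories, i.e. a CLDF StructureDataset.
        return CLDFSpec(dir=self.cldf_dir, module="StructureDataset")

    def cmd_download(self, args):
        # raw/phono_dbase.json, raw/sources.txt and raw/sources.bib are assumed to be
        # in place already; see cmd_makecldf above for how they are read.
        pass

    # cmd_makecldf(self, args), as defined above, completes the class.

# With such a wrapper, the CLDF data would typically be rebuilt with something like:
#   cldfbench makecldf cldfbench_eurasianinventories.py --glottolog /path/to/glottolog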
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot", "periodicGlottalSource",
        "epilaryngealSource", "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')

    table = ds.add_table(
        'contributions.csv',
        'ID', 'Name', 'Contributor_ID',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None

    table = ds.add_table(
        'contributors.csv',
        'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID.upper(),
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids \
            else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
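
# A hypothetical driver for main() above, assuming it is run as a script with three local
# clones passed on the command line: the conversion-scripts repo (providing
# to_cldf/cldf/*.csv), the phoible dev repo (providing the contributor READMEs) and
# Glottolog. The argument handling here is illustrative, not part of the original script.
if __name__ == '__main__':
    import sys

    main(Path(sys.argv[1]), Path(sys.argv[2]), Path(sys.argv[3]))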
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    print('Parsing markdown intros...')
    for contrib in DBSession.query(models.Contribution):
        if contrib.description:
            contrib.markup_description = markdown(contrib.description)
        else:
            contrib.markup_description = None
    print('... done')

    print('Retrieving language data from glottolog...')
    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    glottolog = Glottolog(glottolog_path)
    lang_ids = [lang.id for lang in DBSession.query(common.Language)]
    languoids = {l.id: l for l in glottolog.languoids(lang_ids)}

    glottocodes = [
        (l.id, common.Identifier(id=l.id, name=l.id, type='glottolog'))
        for l in languoids.values()]
    glottocodes = OrderedDict(sorted(glottocodes, key=lambda t: t[0]))
    isocodes = [
        (l.iso, common.Identifier(id=l.iso, name=l.iso, type='iso639-3'))
        for l in languoids.values() if l.iso]
    isocodes = OrderedDict(sorted(isocodes, key=lambda t: t[0]))

    DBSession.add_all(glottocodes.values())
    DBSession.add_all(isocodes.values())
    DBSession.flush()

    for lang in DBSession.query(common.Language):
        if lang.id not in languoids:
            continue
        languoid = languoids[lang.id]
        lang.name = languoid.name
        lang.latitude = languoid.latitude
        lang.longitude = languoid.longitude
        lang.macroarea = languoid.macroareas[0].name if languoid.macroareas else ''
        DBSession.add(common.LanguageIdentifier(
            language=lang,
            identifier_pk=glottocodes[languoid.id].pk))
        if languoid.iso in isocodes:
            DBSession.add(common.LanguageIdentifier(
                language=lang,
                identifier_pk=isocodes[languoid.iso].pk))
    DBSession.flush()
    print('... done')

    print('Making pretty colourful dots for parameter values...')
    all_icons = [icon.name for icon in ORDERED_ICONS]
    code_query = DBSession.query(common.DomainElement)\
        .order_by(common.DomainElement.parameter_pk, common.DomainElement.id)
    for _, param_codes in groupby(code_query, lambda c: c.parameter_pk):
        icons = cycle(all_icons)
        for code in param_codes:
            code.update_jsondata(icon=next(icons))
    DBSession.flush()
    print('... done')
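
# prime_cache() above matches the clld convention for an app's initializedb script. A
# minimal sketch of the usual wiring follows; `create_db` is a hypothetical name for the
# function that populates the database (kept distinct from the CLDF main() functions
# above). In newer clld versions the same two functions may instead be picked up by the
# `clld initdb` console command rather than being called explicitly.
from clld.scripts.util import initializedb


def create_db(args):
    # Populate common.Language, models.Contribution, common.DomainElement, ... here.
    pass


if __name__ == '__main__':
    initializedb(create=create_db, prime_cache=prime_cache)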