def cmd_makecldf(self, args):
    glottolog = Glottolog(args.glottolog.dir)
    clts = CLTS(Config.from_file().get_clone('clts'))
    bipa = clts.bipa
    clts_eurasian = clts.transcriptiondata_dict['eurasian']

    args.writer.cldf.add_columns(
        "ValueTable",
        {"name": "Marginal", "datatype": "boolean"},
        {"name": "Value_in_Source", "datatype": "string"})
    args.writer.cldf.add_columns(
        'ParameterTable',
        {'name': 'CLTS_BIPA', 'datatype': 'string'},
        {'name': 'CLTS_Name', 'datatype': 'string'})
    args.writer.cldf.add_component("LanguageTable", "Family", "Glottolog_Name")

    # Load the language mapping and build the language table.
    languages = []
    lang_map = {}
    all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
    unknowns = defaultdict(list)
    for row in progressbar(self.etc_dir.read_csv("languages.csv", dicts=True)):
        lang_map[row["name"]] = slug(row["name"])
        lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
        if row["glottocode"] in all_glottolog:
            lang = all_glottolog[row["glottocode"]]
            lang_dict.update({
                "Family": lang.family if lang.lineage else None,
                "Glottocode": lang.id,
                "ISO639P3code": lang.iso_code,
                "Latitude": lang.latitude,
                "Longitude": lang.longitude,
                "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                "Glottolog_Name": lang.name,
            })
        languages.append(lang_dict)

    # Read the raw data.
    with open(self.raw_dir.joinpath('phono_dbase.json').as_posix()) as handler:
        raw_data = json.load(handler)

    # Iterate over the raw data, collecting values and parameters.
    values = []
    parameters = []
    inventories = []
    counter = 1
    segment_set = set()
    with open(self.raw_dir.joinpath('sources.txt').as_posix()) as f:
        sources = [source.strip() for source in f.readlines()][1:]
    sources_ = Sources.from_file(self.raw_dir / "sources.bib")
    args.writer.cldf.add_sources(*sources_)
    for idx, (language, langdata) in enumerate(raw_data.items()):
        cons = langdata["cons"]
        vows = langdata["vows"]
        tones = [tone for tone in langdata["tones"] if tone]
        source = sources[idx]

        # Prepare the language key.
        lang_key = language.split("#")[0].replace(",", "")

        # Add consonants and vowels to values, also collecting parameters.
        for segment in cons + vows:
            marginal = bool(segment[0] == "(")

            # Obtain the corresponding BIPA grapheme, if possible.
            normalized = normalize_grapheme(segment)
            par_id = compute_id(normalized)
            if normalized in clts_eurasian.grapheme_map:
                sound = bipa[clts_eurasian.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                unknowns[normalized] += [(segment, lang_key)]
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name
            parameters.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lang_map[lang_key],
                "Marginal": marginal,
                "Parameter_ID": par_id,
                "Value": normalized,
                "Value_in_Source": segment,
                "Source": [source],
            })
            counter += 1

    # Build the segment (parameter) data.
    segments = [
        {"ID": id, "Name": normalized, "BIPA": bipa_grapheme, "Description": desc}
        for id, normalized, bipa_grapheme, desc in set(parameters)]

    # Write data and validate.
    args.writer.write(**{
        "ValueTable": values,
        "LanguageTable": languages,
        "ParameterTable": segments,
    })

    # Report graphemes that could not be mapped to BIPA.
    for g, rest in unknowns.items():
        print('\t'.join([repr(g), str(len(rest)), g]))
def cmd_makecldf(self, args):
    args.writer.cldf.add_component('ParameterTable')
    args.writer.cldf.add_component(
        'LanguageTable',
        'Continent',
        'Genus',
        'WALSCode',  # we add more language metadata
    )
    args.writer.cldf.add_component('CodeTable')

    # The two parameters and their codes are fixed, so we add them explicitly.
    args.writer.objects['ParameterTable'] = [
        {
            'ID': 'sortalclassifier',
            'Name': 'sortal classifier',
            'Description': 'Does the language have sortal classifiers, regardless of whether they are optional or obligatory?',
        },
        {
            'ID': 'morphosyntacticplural',
            'Name': 'morphosyntactic plural',
            'Description': 'Does the language have morphosyntactic plural markers?',
        },
    ]
    args.writer.objects['CodeTable'] = [
        {'ID': 'sortalclassifier-1', 'Parameter_ID': 'sortalclassifier', 'Name': 'yes'},
        {'ID': 'sortalclassifier-0', 'Parameter_ID': 'sortalclassifier', 'Name': 'no'},
        {'ID': 'morphosyntacticplural-1', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'yes'},
        {'ID': 'morphosyntacticplural-0', 'Parameter_ID': 'morphosyntacticplural', 'Name': 'no'},
    ]

    # Map WALS codes to the sources coded for the corresponding language.
    l2s = collections.defaultdict(list)
    sources = []
    for src in sorted(
            Sources.from_file(self.raw_dir / 'sources.bib').items(),
            key=lambda i: i.id):
        if src.get('Wals_code'):
            for code in split_text(src['Wals_code'], ';', strip=True):
                l2s[code].append(src.id)
            sources += [src]
    args.writer.cldf.add_sources(*sources)

    # Add one language and two values per row of the raw language list.
    for row in self.raw_dir.read_csv('GSSG_ListOfLanguages.csv', delimiter=';', dicts=True):
        lidx = slug(row['language_name'], lowercase=False)
        args.writer.objects['LanguageTable'].append({
            'ID': lidx,
            'Name': row['language_name'],
            'Latitude': row['latitude'],
            'Longitude': row['longitude'],
            'Glottocode': row['glottocode'],
            'ISO639P3code': row['iso_code'],
            'Continent': row['continent'],
            'Genus': row['genus'],
            'WALSCode': row['wals_code'],
        })
        for param in ['sortal_classifier', 'morphosyntactic_plural']:
            pid = param.replace('_', '')
            args.writer.objects['ValueTable'].append({
                'ID': '{}-{}'.format(lidx, pid),
                'Value': row[param],
                'Language_ID': lidx,
                'Parameter_ID': pid,
                'Code_ID': '{}-{}'.format(pid, '1' if row[param] == 'yes' else '0'),
                'Source': l2s.get(row['wals_code'], []),
            })
def get_data(cldf, args):
    # Collect rows for each CLDF component from the datasets in the repository.
    relscount = 0
    cldf.sources = Sources.from_file(args.repos.path('sources.bib'))
    categorical_variables = set()

    data = collections.defaultdict(list)
    dsids = [ds.id for ds in args.repos.datasets]
    for ds in args.repos.datasets:
        data['datasets.csv'].append({
            'ID': ds.id,
            'Name': ds.name,
            'Description': ds.description,
            'Type': ds.type,
            'Year': ds.year,
            'Author': ds.author,
            'Reference': ds.reference,
            'URL': ds.url,
        })
        for soc in ds.societies:
            data['LanguageTable'].append({
                'ID': soc.id,
                'Dataset_ID': ds.id,
                'Name': soc.pref_name_for_society,
                'Glottocode': soc.glottocode,
                'Latitude': soc.Lat,
                'Longitude': soc.Long,
                'Comment': soc.Comment,
                'Glottocode_Comment': soc.glottocode_comment,
                'xd_id': soc.xd_id,
                'ORIG_name_and_ID_in_this_dataset': soc.ORIG_name_and_ID_in_this_dataset,
                'alt_names_by_society': soc.alt_names_by_society,
                'main_focal_year': soc.main_focal_year,
                'HRAF_ID': soc.HRAF_name_ID.id if soc.HRAF_name_ID else None,
                'HRAF_Name': soc.HRAF_name_ID.name if soc.HRAF_name_ID else None,
                'HRAF_Link': soc.HRAF_link,
                'origLat': soc.origLat,
                'origLong': soc.origLong,
            })
        for soc in ds.society_relations:
            for rel in soc.related:
                relscount += 1
                data['society_relations.csv'].append({
                    'ID': str(relscount),
                    'Society_ID': soc.id,
                    'Related_Society_ID': rel.id if rel.dataset in dsids else None,
                    'Related_Society_External_ID': rel.id if rel.dataset not in dsids else None,
                    'Related_Society_Name': rel.name,
                    'Related_Society_Dataset': rel.dataset,
                })
        for param in ds.variables:
            data['ParameterTable'].append({
                'ID': param.id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Name': param.title,
                'Description': param.definition,
                'Category': param.category,
                'Type': param.type,
                'Units': param.units,
                'Source': param.source,
                'Changes': param.changes,
                'Notes': param.notes,
            })
            for code in param.codes:
                if code.code == 'NA':
                    continue
                categorical_variables.add(code.var_id)
                data['CodeTable'].append({
                    'ID': '{}-{}'.format(code.var_id, code.code).replace('.', '_'),
                    'Parameter_ID': code.var_id.replace('.', '_'),
                    'Name': code.name,
                    'Description': code.description,
                })

        codes = set(c['ID'] for c in data['CodeTable'])
        for i, d in enumerate(ds.data, start=1):
            code_id = None \
                if (d.var_id not in categorical_variables) or d.code == 'NA' \
                else '{}-{}'.format(d.var_id, d.code).replace('.', '_')
            if code_id and (code_id not in codes) and args.fix_code_id:
                # This is a backwards compatibility fix. New releases should not have
                # references to undefined codes!
                code_id = None  # pragma: no cover
            data['ValueTable'].append({
                'ID': '{}-{}'.format(ds.id, i),
                'Language_ID': d.soc_id,
                'Parameter_ID': d.var_id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Code_ID': code_id,
                'Value': d.code,
                'Comment': d.comment,
                'Sub_Case': d.sub_case,
                'Year': d.year,
                'Source': [ref.format_cldf() for ref in d.references],
                'Source_Coded_Data': d.source_coded_data,
                'Admin_Comment': d.admin_comment,
            })
    return data