def cmd_download(self, args):
    # https://diacl.ht.lu.se/GeoJson/GeographicalPresence/24
    print("Download wordlists ...")
    wordlists = self._download_json("WordLists")
    for wlid in progressbar(list(wordlists.keys())):
        # We download the XML representations, because only these seem to contain
        # source info per lexeme.
        self.raw_dir.download(
            self._url("/Xml/WordListWithLanguageLexemes/{0}".format(wlid)),
            "wl{0}.xml".format(wlid),
            skip_if_exists=True,
        )
    print("... done")

    print("Download etymologies ...")
    etymologies_by_wordlistitem = OrderedDict()
    for wl in wordlists.values():
        print(wl["Name"])
        for wlc in wl["WordListCategories"].values():
            print("-- ", wlc["Name"])
            for wli in progressbar(wlc["WordListItems"]):
                data = self._download_json("WordListLexemesWithAncestors/{0}".format(wli))
                del data["lexemes"]
                del data["languages"]
                etymologies_by_wordlistitem[wli] = data
    with gzip.GzipFile(str(self.raw_dir.joinpath("etymology.json.gz")), "w") as fp:
        fp.write(dumps(etymologies_by_wordlistitem).encode("utf8"))
    for p in self.raw_dir.glob("WordListLexemesWithAncestors*"):
        p.unlink()
    print("... done")
    self._download_json("LanguageTree")
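# A minimal standalone sketch (made-up file name and payload, not part of the
# dataset code) of the gzip/JSON round-trip used for "etymology.json.gz" above:
# the dict of etymologies is serialized to JSON, encoded, and gzipped in one go.
import gzip
import json

def _gzip_json_roundtrip(path="etymology_demo.json.gz"):
    payload = {"24": {"Etymologies": []}}  # hypothetical wordlist-item entry
    with gzip.GzipFile(path, "w") as fp:
        fp.write(json.dumps(payload).encode("utf8"))
    with gzip.GzipFile(path, "r") as fp:
        assert json.loads(fp.read().decode("utf8")) == payload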
def cmd_makecldf(self, args):
    # add the bibliographic sources
    args.writer.add_sources()

    # add the languages from the language list (no need for mapping here)
    args.writer.add_languages()

    # add the concepts from the concept list
    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        cid = "%s_%s" % (concept.id.split("-")[-1], slug(concept.english))
        args.writer.add_concept(
            ID=cid,
            Name=concept.english,
            NorthEuralex_Gloss=concept.attributes["nelex_id"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concept_lookup[concept.attributes["nelex_id"]] = cid

    # add items
    lexeme_rows = self.raw_dir.read_csv("nelex.tsv", delimiter="\t", dicts=True)
    for row in progressbar(lexeme_rows):
        args.writer.add_form(
            Language_ID=row["Language_ID"],
            Parameter_ID=concept_lookup[row["Concept_ID"]],
            Value=row["Word_Form"],
            Form=row["rawIPA"],
            Source=["Dellert2020"],
        )
def cmd_makecldf(self, args): args.writer.add_sources() concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name" ) languages = args.writer.add_languages(id_factory=lambda l: l["Name"]) reader = self.raw_dir.read_csv(self.raw_dir / "Wordlist.tsv", dicts=True, delimiter="\t") for row in progressbar(reader): lexemes = {k: v for k, v in row.items() if k in languages} for language, lexeme in lexemes.items(): args.writer.add_forms_from_value( Language_ID=language, Parameter_ID=concepts[row["CONCEPT"]], Value=lexeme, Source="Mitterhofer2013", Loan=False, ) # We explicitly remove the ISO code column since the languages in # this datasets do not have an ISO code. args.writer.cldf["LanguageTable"].tableSchema.columns = [ col for col in args.writer.cldf["LanguageTable"].tableSchema.columns if col.name != "ISO639P3code" ]
def cmd_makecldf(self, args):
    # add sources
    args.writer.add_sources()

    # add languages
    languages = args.writer.add_languages(lookup_factory="Name")

    # add concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda cpt: "%s_%s" % (cpt.id.split("_")[0], slug(cpt.english)),
        lookup_factory="Name",
    )

    # Hard-coded fixes to segment errors in the raw source
    segments = {
        "áː": "áː/aː",
        "âː": "âː/aː",
        "aʰ": "a h",
        "ɐ̃ʰ": "ɐ̃ h",
        "í": "í/i",
        "íː": "íː/iː",
        "iʰ": "i h",
        "i̥": "i̥/i",
        "ka": "k a",
        "kw": "kʷ",  # the single instance is a labialized velar
        "nⁱ": "n i",
        "óː": "óː/oː",
        "teː": "t eː",
        "ú": "ú/u",
        "#": "+",
    }

    # read wordlist with lingpy
    wl_file = self.raw_dir / "Bruzzi_Granadillo.txt"
    wl = lingpy.Wordlist(wl_file.as_posix())

    # iterate over wordlist and write lexemes and cognates
    for idx in progressbar(wl, desc="makecldf"):
        lex = args.writer.add_form_with_segments(
            Language_ID=languages[wl[idx, "doculect"]],
            Parameter_ID=concepts[wl[idx, "concept"]],
            Value=wl[idx, "entrj_in_source"],
            Form=wl[idx, "ipa"],
            Segments=" ".join(segments.get(x, x) for x in wl[idx, "tokens"]).split(),
            Source=["granadillo_ethnographic_2006", "silva_discoteca_1961"],
        )
        args.writer.add_cognate(
            lexeme=lex,
            Cognateset_ID=wl[idx, "cogid"],
            Source=["Chacon2019"],
        )
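# Toy check (made-up tokens, standalone) of the join/split idiom above:
# fixes that expand one token into several ("aʰ" -> "a h") are flattened
# into separate segments by joining on spaces and splitting again.
fix = {"aʰ": "a h"}
tokens = ["k", "aʰ"]
assert " ".join(fix.get(t, t) for t in tokens).split() == ["k", "a", "h"]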
def cmd_makecldf(self, args):
    args.writer.add_sources(*self.etc_dir.read_bib())
    concepts = args.writer.add_concepts(
        id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
        lookup_factory=lambda c: c['ID'].split('_')[0])
    for wl in progressbar(self.iter_wordlists(args.log), desc="cldfify"):
        wl.to_cldf(args.writer, concepts)
        # Now normalize the typedby and checkedby values:
        args.writer.objects['LanguageTable'][-1] = normalize_contributors(
            args.writer.objects['LanguageTable'][-1])
def cmd_makecldf(self, args):
    # due to bad concept ids in STEDT, we need to load them from file
    converter = defaultdict(set)
    for row in self.raw_dir.read_csv("srcids.tsv", delimiter="\t", dicts=True):
        converter[row["CORRECTED"]].add(row["IDINSTEDT"])

    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            Number=concept.number,
        )
        concept_lookup[concept.number] = idx
        for id_in_source in converter[concept.number]:
            concept_lookup[id_in_source] = idx

    language_lookup = args.writer.add_languages(lookup_factory="Name")
    args.writer.add_sources()

    for row in progressbar(self.raw_dir.read_csv("AH-CSDPN.tsv", delimiter="\t")[1:]):
        args.writer.add_forms_from_value(
            Local_ID=row[0],
            Language_ID=language_lookup[row[6]],
            Parameter_ID=concept_lookup[row[7]],
            Value=row[1],
            Source=["Hale1973"],
        )
def cmd_makecldf(self, args):
    # sources are poorly annotated, so we need to correct manually
    src = {
        "H&R92": "huber_vocabulario_1992",
        "H&R 1992": "huber_vocabulario_1992",
        "Melendez 2011": "melendez_lozano_diccionario_2011",
        "Allin 1979": "allin_vocabulario_1979",
        "Aikhenvald 2012": "aikhenvald_dicionario_2012",
        "Aikenvald2001": "aihenvald_dicionario_2001",
        "Oliveira 93": "cunha_de_oliveira_uma_1993",
        "Ramirez2001": "ramirez_dicionario_2001",
        "Ramirez 2001": "ramirez_dicionario_2001",
        "Schauer 2005": "schauer_diccionario_2005",
        "Aikhenvald 2001": "aikhenvald_dicionario_2001",
    }

    # add sources
    args.writer.add_sources()

    # add languages
    languages = args.writer.add_languages(lookup_factory="Name")

    # add concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda c: "%s_%s" % (c.id.split("-")[-1], slug(c.english)),
        lookup_factory="Name",
    )

    # read raw wordlist and add lexemes
    wl_file = self.raw_dir / "arawakan_swadesh_100_edictor.tsv"
    wl = lingpy.Wordlist(wl_file.as_posix())
    for idx in progressbar(wl, desc="makecldf"):
        if wl[idx, "value"]:
            if wl[idx, 'segments'][0] == '_':
                wl[idx, 'segments'] = wl[idx, 'segments'][1:]
            lex = args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "segments"],
                Source=src.get(wl[idx, "source"], "Chacon2017"),
            )
            # add cognate
            args.writer.add_cognate(
                lexeme=lex, Cognateset_ID=wl[idx, "cogid"], Source=["Chacon2017"])
def cmd_makecldf(self, args): args.writer.add_sources() languages = args.writer.add_languages(id_factory=lambda l: l["Name"], lookup_factory=lambda l: (l["Name"], l["Source"])) sources = {k[0]: k[1] for k in languages} # language: source map concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") for row in progressbar( self.raw_dir.read_csv("madang.csv", dicts=True, delimiter="\t")): concept = CONCEPT_REMAPPING.get(row["CONCEPT"], row["CONCEPT"]) args.writer.add_forms_from_value( Local_ID=row["ID"], Language_ID=row["DOCULECT"], Parameter_ID=concepts[concept], Value=row["COUNTERPART"], Source=sources[row["DOCULECT"]], )
def cmd_makecldf(self, args): """ Convert the raw data to a CLDF dataset. """ data = self.raw_dir.read_csv('wordlist.tsv', dicts=True, delimiter='\t') args.writer.add_sources() languages = args.writer.add_languages(lookup_factory="ID") concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") for row in progressbar(data, desc="cldfify"): if row["DOCULECT"] in languages: args.writer.add_forms_from_value( Language_ID=row["DOCULECT"], Parameter_ID=concepts[row["CONCEPT"]], Value=row["TRANSCRIPTION"], Source=["chinds"], )
def cmd_makecldf(self, args): args.writer.add_sources() concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") languages = args.writer.add_languages(id_factory=lambda l: l["Name"]) reader = self.raw_dir.read_csv(self.raw_dir / "Wordlist.tsv", dicts=True, delimiter="\t") for row in progressbar(reader): lexemes = {k: v for k, v in row.items() if k in languages} for language, lexeme in lexemes.items(): args.writer.add_forms_from_value( Language_ID=language, Parameter_ID=concepts[row["CONCEPT"]], Value=lexeme, Source="Mitterhofer2013", Loan=False, )
def cmd_makecldf(self, args):
    data = self.raw_dir.read_csv('raw.tsv', delimiter="\t", dicts=True)

    # Quite a hack to allow things like "1995.pdfb" as Source IDs:
    bib = pybtex.database.parse_string(self.raw_dir.read('sources.bib'), bib_format='bibtex')
    sources = []
    for k, e in bib.entries.items():
        # Unfortunately, Source.from_entry does not allow any keyword arguments to be
        # passed to the constructor, see https://github.com/cldf/pycldf/issues/99
        e.fields['_check_id'] = False
        sources.append(Source.from_entry(k, e))
    args.writer.add_sources(*sources)

    language_lookup = args.writer.add_languages(lookup_factory='NameInSource')
    concept_lookup = args.writer.add_concepts(
        id_factory=lambda x: x.id.split('-')[-1] + '_' + slug(x.english),
        lookup_factory='Name'
    )
    lang_sources = {l['NameInSource']: l['Source'].split(",") for l in self.languages}

    # remap concepts for personal pronouns
    remap_concepts = {
        '1SG pronoun': '1sg pronoun',
        '2SG pronoun': '2sg pronoun',
        '3SG pronoun': '3sg pronoun',
    }

    for line_dict in progressbar(data, desc='cldfify'):
        concept = line_dict['Meaning']
        concept_id = concept_lookup.get(remap_concepts.get(concept, concept))
        for language, language_id in language_lookup.items():
            value = line_dict[language].strip()
            if value:
                args.writer.add_form(
                    Value=value,
                    Form=value,
                    Parameter_ID=concept_id,
                    Language_ID=language_id,
                    Source=lang_sources[language]
                )
def cmd_makecldf(self, args): wl = lingpy.Wordlist((self.raw_dir / "D_old-clics.tsv").as_posix()) src = {"logos": "Logos2008"} args.writer.add_sources(*self.raw_dir.read_bib()) concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") for k in progressbar(wl): if wl[k, "value"]: args.writer.add_language( ID=slug(wl[k, "doculect"], lowercase=False), Name=wl[k, "doculect"], Glottocode=wl[k, "glottolog"], ) args.writer.add_form( Language_ID=slug(wl[k, "doculect"], lowercase=False), Parameter_ID=concepts[wl[k, "concept"]], Value=wl[k, "value"], Form=wl[k, "value"], Source=src.get(wl[k, "source"], ""), )
def cmd_makecldf(self, args): args.writer.add_sources() data = self.raw_dir.read_csv("zhang2019-oc-rgyal.tsv", dicts=True, delimiter="\t") # add languages languages = args.writer.add_languages(lookup_factory='Name') languages_dict = {} for lan in self.languages: languages[lan['Name']] = {'Source' :lan['Source'], 'ID':lan['ID']} # add concepts concepts = {} for concept in self.concepts: idx = '{0}_{1}'.format( concept['NUMBER'], slug(concept['ENGLISH'])) args.writer.add_concept( ID=idx, Name=concept['ENGLISH'], Chinese_Gloss=concept['CHINESE'], Gloss_in_Source=concept['GLOSS_IN_SOURCE'] ) concepts[concept['CHINESE'].strip()] = idx for cogid, entry in progressbar( enumerate(data), desc="cldfify", total=len(data) ): for language, value in languages.items(): if entry[language].strip(): for row in args.writer.add_forms_from_value( Language_ID=value['ID'], Parameter_ID=concepts[entry["Chinese_character"]], Value=entry[language], Source=[value['Source']] ): args.writer.add_cognate( lexeme=row, Cognateset_ID=cogid+1)
def cmd_makecldf(self, args): data = self.raw_dir.read_csv("forms.csv", dicts=True) args.writer.add_sources() # TODO: add concepts with `add_concepts` concept_lookup = {} for concept in self.concepts: idx = concept['sort'] + "_" + slug(concept['eng']) args.writer.add_concept( ID=idx, Name=concept['eng'], Number=concept['sort'], Russian_Gloss=concept['rus'], Concepticon_ID=concept['СС_no'] if concept['СС_no'] != '0' else '', #Concepticon_Gloss=concept['eng'] if concept['eng'] else '', ) concept_lookup[concept['sort']] = idx language_lookup = {} for language in self.languages: args.writer.add_language(ID=language['lang.id'], Name=language['lang.name'], Glottocode=language['glottocode'], Latitude=language['latitude'], Longitude=language['longitude']) language_lookup[language['lang.name']] = language['lang.id'] for k in progressbar(data, desc="wl-to-cldf"): if (not k['subentry'] or k['subentry'] == 'sg') and k['lc.id'] in concept_lookup: args.writer.add_forms_from_value( Language_ID=k['lang'], Parameter_ID=concept_lookup[k["lc.id"]], Value=k["orthographic"], Source="lexcauc") elif not k['lc.id'] in concept_lookup: print(k['lc.id'])
def cmd_makecldf(self, args): args.writer.add_sources() concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split("-")[-1] + "_" + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Number=concept.number, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concepts[concept.number] = idx languages = args.writer.add_languages(lookup_factory="Name") # here we need to add the lexemes data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False) for i, row in progressbar(enumerate(data[4:])): number = row[0].strip().strip('.') for j in range(0, len(row)-2, 2): language = data[2][j+2] value = row[j+2] if value.strip() and value.strip() not in ['-----']: if not 'or' in row[3+j]: cogid = str(int(float(row[j+3]))) else: cogid = row[j+3].split()[0] for lexeme in args.writer.add_forms_from_value( Parameter_ID=concepts[number], Language_ID=languages[language], Value=value.strip(), Source='Deepadung2015'): args.writer.add_cognate( lexeme=lexeme, Cognateset_ID=cogid+'-'+number, Source='Deepadung2015')
def cmd_makecldf(self, args):
    languages = []
    number_files = sorted((self.raw_dir / "uninumrepo" / "numbers").glob("**/*.tsv"))
    codes = self.raw_dir / "uninumrepo" / "codes.tsv"
    args.writer.add_sources()
    concepts = args.writer.add_concepts(id_factory=lambda c: c.english, lookup_factory="Name")

    for code in self.raw_dir.read_csv(codes, delimiter="\t", dicts=True):
        # We add additional Glottocodes based on languages.tsv wherever applicable:
        substitute = [lg for lg in self.languages if lg["Code"] == code["Code"]]
        args.writer.add_language(
            ID=code["Code"],
            Name=code["Language name(s)"],
            Code=code["Code"],
            Glottocode=substitute[0]["Glottocode"] if substitute else code["Glottocode"],
            ISO639P3code=code["ISO 639-3"],
            Script=code["Script"],
            Locale=code["Locale"],
            Ethnologue=code["Ethnologue"],
            Variety=code["Variety"],
        )
        languages.append(code["Code"])

    for number_file in progressbar(number_files):
        lcode = number_file.name.split(".tsv")[0]
        for entry in self.raw_dir.read_csv(number_file, delimiter="\t"):
            # entry[0] is the concept, entry[1] the lexeme.
            args.writer.add_lexemes(
                Language_ID=lcode,
                Parameter_ID=concepts[entry[0]],
                Value=entry[1],
                Source="Ritchie2019",
            )
def cmd_makecldf(self, args):
    data = self.raw_dir.read_csv('DagLoans_Words.tsv', delimiter="\t", dicts=True)
    args.writer.add_sources()

    concepts = {}
    for concept in self.concepts:
        idx = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH']))
        args.writer.add_concept(
            ID=idx,
            Name=concept['ENGLISH'],
            Number=concept['NUMBER'],
            Concepticon_ID=concept['CONCEPTICON_ID'],
            Concepticon_Gloss=concept['CONCEPTICON_GLOSS'])
        concepts[concept['ENGLISH']] = idx

    sources, languages = {}, {}
    for language in self.languages:
        if language['District'] == 'Dictionary':
            sources[language['List_ID']] = language['Source'].strip()
            language['District'] = ''
        elif language['District'] == 'Expert':
            language['District'] = ''
        args.writer.add_language(**language)
        languages[language['List_ID']] = language['ID']

    for row in progressbar(data):
        lexeme = args.writer.add_form(
            Language_ID=languages[row['List_ID']],
            Parameter_ID=concepts[row['Concept']],
            Local_ID=row['Entry_ID'],
            Value=row['Standard_Transcription'],
            Form=row['Word'],
            Source=sources.get(row['List_ID'], ''),
            Borrowing_ID=row['Concept_ID'] + '-' + row['Stem'])
        args.writer.add_cognate(lexeme=lexeme, Cognateset_ID=lexeme['Borrowing_ID'])
def cmd_makecldf(self, args):
    sound_cat = self.raw_dir.read_json(self.catalog_file_name)

    # add sources
    args.writer.add_sources()

    # add concepts and languages from explicit files
    concepts = {}
    for concept in self.concepts:
        args.writer.add_concept(**concept)
        concepts[concept['IndexInSource']] = concept['ID']
    languages = {}
    for language in self.languages:
        args.writer.add_language(**language)
        languages[language['IndexInSource']] = language['ID']

    # Load JSON data
    json_data = self.raw_dir.read_json(self.data_file_name)

    # collect missing languages
    missing = set()

    media = []
    args.writer.cldf.add_table(
        'media.csv',
        'ID',
        'Description',
        'URL',
        'mimetype',
        {'name': 'size', 'datatype': 'integer'},
        'Form_ID',
        primaryKey=['ID'])
    args.writer.cldf.add_foreign_key('media.csv', 'Form_ID', 'FormTable', 'ID')

    # Add lexemes
    for idx in progressbar(
            sorted(
                json_data['transcriptions'],
                key=lambda k: (
                    int(json_data['transcriptions'][k]['LanguageIx']),
                    int(json_data['transcriptions'][k]['IxElicitation']),
                    int(json_data['transcriptions'][k]['IxMorphologicalInstance']))),
            desc='makecldf'):
        lexeme = json_data['transcriptions'][idx]

        # Skip over entries with no phonetic transcription, an empty phonetic
        # transcription, or from different studies (missing language).
        if 'Phonetic' not in lexeme:  # pragma: no cover
            continue
        if not lexeme['Phonetic']:
            continue
        if lexeme['LanguageIx'] not in languages:  # pragma: no cover
            missing.add(lexeme['LanguageIx'])
            continue

        # If there is only one elicitation for a meaning it comes as a plain
        # string (otherwise as a list). Turn this string into a list as well.
        if isinstance(lexeme['Phonetic'], str):
            lexeme['Phonetic'] = [lexeme['Phonetic']]
            lexeme['path'] = [lexeme['path']]
            lexeme['soundPaths'] = [lexeme['soundPaths']]

        ref_id = None
        last_altlex = None
        for i, value in enumerate(lexeme['Phonetic']):
            v = value.strip()
            # Skip if value is empty
            if not v or v in self.form_spec.missing_data:
                continue
            # Commas are not allowed!
            if ',' in v:  # pragma: no cover
                args.log.warn(
                    'Comma not allowed in /{0}/ for {1} - {2}'.format(
                        value, languages[lexeme['LanguageIx']], lexeme['IxElicitation']))
            param_id = concepts['{0}-{1}'.format(
                lexeme['IxElicitation'], lexeme['IxMorphologicalInstance'])]
            new = args.writer.add_form(
                Language_ID=languages[lexeme['LanguageIx']],
                Local_ID='{0}-{1}-{2}'.format(
                    lexeme['LanguageIx'],
                    lexeme['IxElicitation'],
                    lexeme['IxMorphologicalInstance']),
                Parameter_ID=param_id,
                Value=v,
                Form=v,
                Loan=(lexeme['RootIsLoanWordFromKnownDonor'] == '1'),
                Source=self.source_id_array,
                Variant_Of=ref_id
                if int(lexeme['AlternativePhoneticRealisationIx'][i]) > 0 else None,
            )

            # add media
            if isinstance(lexeme['soundPaths'], list) \
                    and len(lexeme['soundPaths'][0]) > 0 \
                    and len(lexeme['soundPaths'][i][0]) > 0:
                if lexeme['path'][i] in sound_cat:
                    for bs in sorted(
                            sound_cat[lexeme['path'][i]]['bitstreams'],
                            key=lambda x: x['content-type']):
                        media.append({
                            'ID': bs['checksum'],
                            'Description': lexeme['path'][i],
                            'URL': 'https://cdstar.shh.mpg.de/bitstreams/{0}/{1}'.format(
                                sound_cat[lexeme['path'][i]]['id'], bs['bitstreamid']),
                            'mimetype': bs['content-type'],
                            'size': bs['filesize'],
                            'Form_ID': new['ID'],
                        })
                else:  # pragma: no cover
                    args.log.warn(
                        'Missing sound file name in catalog {0}.'.format(lexeme['path'][i]))

            # Remember the last inserted ID for alternative pronunciations to set
            # 'Variant_Of'. This works because the downloaded JSON data are sorted
            # by altlex and altpron.
            if last_altlex != int(lexeme['AlternativeLexemIx'][i]):
                ref_id = new['ID']
                last_altlex = int(lexeme['AlternativeLexemIx'][i])

            # add cognate if desired
            if self.create_cognates:
                wcogid = '{0}-{1}'.format(
                    param_id,
                    lexeme['WCogID'][i]
                    if lexeme['WCogID'][i] and int(lexeme['WCogID'][i]) > 1 else '1')
                args.writer.add_cognate(
                    lexeme=new,
                    Cognateset_ID=wcogid,
                    Source=self.source_id_array,
                )

    args.writer.write(**{'media.csv': media})

    for m in sorted(missing):  # pragma: no cover
        args.log.warn('Missing language with ID {0}.'.format(m))
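# Pure-Python trace (made-up indices, standalone) of the Variant_Of bookkeeping
# above: the first realisation of each alternative lexeme becomes the reference,
# and later realisations (AlternativePhoneticRealisationIx > 0) point back to
# it, mirroring the loop's order of operations.
alt_lex = ['0', '0', '1']    # hypothetical AlternativeLexemIx per value
alt_pron = ['0', '1', '0']   # hypothetical AlternativePhoneticRealisationIx
form_ids = ['f-1', 'f-2', 'f-3']
ref_id, last_altlex, variant_of = None, None, []
for i, fid in enumerate(form_ids):
    variant_of.append(ref_id if int(alt_pron[i]) > 0 else None)
    if last_altlex != int(alt_lex[i]):
        ref_id, last_altlex = fid, int(alt_lex[i])
assert variant_of == [None, 'f-1', None]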
def cmd_makecldf(self, args): wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t") concept_lookup = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split("-")[-1] + "_" + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Chinese_Gloss=concept.attributes["chinese"], Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concept_lookup[concept.number.rjust(3, "0")] = [idx, concept] language_lookup = {k["ID_in_Source"]: k for k in self.languages} args.writer.add_languages() args.writer.add_sources() idx = 1 mapping = { 0: [ "doculect", "doculectid", "glottocode", "concept", "glossid", "value", "form", "phonetic", "concepticon_id", "concepticon_gloss", ] } for line in progressbar(wl, desc="load the data"): if not line[0].strip(): phonetic = True if line[0] == "'Ref#": numbers = line phonetic = False idxs = defaultdict(list) elif line[0] == "Gloss": glosses = line elif line[0] in language_lookup and not phonetic: taxon = line[0] for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]): if num.strip() and gloss.strip(): cname = concept_lookup[num[1:]][1] forms = val.split(",") if forms: for form in forms: mapping[idx] = [ language_lookup[taxon]["Name"], taxon, language_lookup[taxon]["Glottocode"], cname.english, num[1:], val, form.strip(), "", # check later for phonetic value cname.concepticon_id, cname.concepticon_gloss, ] idxs[taxon, gloss] += [idx] idx += 1 else: print("missing value", gloss, num, taxon) elif line[0] in language_lookup and phonetic: taxon = line[0] for gloss, val in zip(glosses[1:], line[1:]): if gloss.strip(): these_idx = idxs.get((taxon, gloss)) if not these_idx: pass else: forms = val.split(",") for this_idx, form in zip(these_idx, forms): mapping[this_idx][7] = form # export to lingpy wordlist in raw folder # Wordlist(mapping).output( # "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix() # ) # add data to cldf for idx in progressbar(range(1, len(mapping)), desc="cldfify", total=len(mapping)): vals = dict(zip(mapping[0], mapping[idx])) args.writer.add_lexemes( Language_ID=language_lookup[vals["doculectid"]]["ID"], Parameter_ID=concept_lookup[vals["glossid"]][0], Value=vals["value"], Source=["Castro2015"], )
def cmd_makecldf(self, args):
    # Add bibliographic sources
    args.writer.add_sources()

    # Read raw concept data and add to dataset; at the same time,
    # build a map between the concept index as used in data and the
    # concept id in the dataset
    concept_lookup = {}
    for cidx, concept in enumerate(self.conceptlists[0].concepts.values()):
        concept_cldf_id = concept.id.split("-")[-1] + "_" + slug(concept.english)
        concept_lookup[1 + (cidx * 2)] = concept_cldf_id

        # Add the concept
        args.writer.add_concept(
            ID=concept_cldf_id,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )

    # Add languages and make a map for individual sources
    language_lookup = args.writer.add_languages(lookup_factory="Source_Name")
    source_lookup = {entry["Source_Name"]: entry["Source"] for entry in self.languages}

    # Read raw data and remove headers and rows with reconstructions
    # (row[0] not in languages)
    data = self.raw_dir.read_csv("dataset_khobwa.csv")
    data = data[2:]
    data = [row for row in data if row[0] in language_lookup]

    # iterate over the source adding lexemes and collecting cognates
    for row in progressbar(data, desc="makecldf"):
        for cid in range(1, len(row), 2):
            # Skip over rows with empty fields for cogid
            if not row[cid + 1]:
                continue

            # Compute a cognate_id number; lingpy now requires
            # this to be an integer
            cognate_id = cid * 100 + int(row[cid + 1])

            # Extract the value from the raw data, skipping over
            # missing or non-existing forms. We need to strip here,
            # as there are entries with newlines and FormSpec, as the
            # name implies, does not apply to values.
            value = row[cid].strip()

            for lex in args.writer.add_lexemes(
                    Language_ID=language_lookup[row[0]],
                    Parameter_ID=concept_lookup[cid],
                    Value=value,
                    Cognacy=cognate_id,
                    Source=source_lookup[row[0]]):
                args.writer.add_cognate(
                    lexeme=lex,
                    Cognateset_ID=cognate_id,
                    Source="Lieberherr2017",
                )
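# Worked example (made-up row, standalone) for the cognate-id arithmetic above:
# column index 3 with raw cognate class "2" yields 3 * 100 + 2 = 302, so ids
# from different concept columns cannot collide while classes stay below 100.
assert 3 * 100 + int("2") == 302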
def cmd_makecldf(self, args):
    # parse the data from the word document
    table = [[""]]  # we expect 9 columns
    with open(self.raw_dir.joinpath("data.txt").as_posix()) as f:
        for i, line in enumerate(f):
            rows = [c.strip() for c in line.split("\t")]
            if rows[0].replace(".", "").isdigit():
                table += [rows]
            else:
                table[-1][-1] += "/" + rows[0]
                table[-1] += rows[1:]

    # load cognates
    cognates = self.raw_dir.read_csv("cognates.tsv", delimiter="\t")[1:]

    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = "{0}-{1}".format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Spanish_Gloss=concept.attributes["spanish"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        for gloss in concept.attributes["lexibank_gloss"]:
            concepts[gloss] = idx
    concepts["Frio/(hace frio)"] = concepts["Frio (hace frio)"]
    args.log.info("added concepts")
    args.writer.add_sources()

    cognacy, counter = {}, 1
    cogsets = {
        "A(B)": ["A"],
        "A/(B)": ["A"],
        "A/B": ["A", "B"],
        "A/B/C": ["A", "B", "C"],
        "A/B/D": ["A", "B", "D"],
        "A/B?": ["A"],
        "A/C": ["A", "C"],
        "B/(A)": ["A"],
        "B/(a)": ["B"],
        "B/C": ["B", "C"],
        "C D": ["C", "D"],
        "C/(B)": ["C"],
        "C/B": ["C", "B"],
        "C/E": ["C", "E"],
        "D/B": ["D", "B"],
        "a/(B)": ["A"],
        "a/A": ["A", "A"],
        "a/B": ["A", "B"],
        "ab": ["A", "B"],
    }
    languages = args.writer.add_languages(lookup_factory="Name")

    for i, line in progressbar(enumerate(table[1:])):
        for j, (language, cell) in enumerate(zip(table[0][2:], line[2:])):
            if cell.strip():
                cognatesets = cogsets.get(
                    cognates[i][j + 1].strip(),
                    [cognates[i][j + 1].strip().upper()],
                )
                for lexeme, cognate in zip(
                        args.writer.add_forms_from_value(
                            Value=cell,
                            Language_ID=languages[language],
                            Parameter_ID=concepts[line[1]],
                            Source=["Pharao2020"],
                        ),
                        cognatesets):
                    if cognate in ["?", "-"]:
                        cid = counter
                        counter += 1
                    else:
                        cid = "{0}-{1}".format(i, cognate)
                        if cid in cognacy:
                            cid = cognacy[cid]
                        else:
                            cognacy[cid] = counter
                            cid = cognacy[cid]
                            counter += 1
                    if languages[language] == "ProtoUtoAztecan" and "SUA" in cell.strip():
                        lexeme["Language_ID"] = languages["SUA"]
                    args.writer.add_cognate(lexeme, Cognateset_ID=cid, Source=["Pharao2020"])
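# Caveat on the zip above (toy values, standalone): if a cell yields more form
# variants than cognate-set letters, `zip` silently drops the surplus lexemes,
# so only the first len(cognatesets) forms receive cognate judgements.
assert list(zip(["form1", "form2"], ["A"])) == [("form1", "A")]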
def cmd_makecldf(self, args):
    args.writer.add_sources()

    languages, sources = {}, {}
    for language in self.languages:
        languages[language['Name']] = language['ID']
        sources[language['Name']] = language['Source']
        args.writer.add_language(**language)

    concepts = {}
    for concept in self.concepts:
        idx = '{0}_{1}'.format(concept['NUMBER'], slug(concept['GLOSS']))
        concepts[concept['MSA_NAME'].replace('"', '')] = idx
        args.writer.add_concept(
            ID=idx,
            Name=concept['GLOSS'],
            Concepticon_ID=concept['CONCEPTICON_ID'],
            Concepticon_Gloss=concept['CONCEPTICON_GLOSS'],
            Number=concept['NUMBER'],
            MSA=concept['MSA_NAME'])

    # normalize tokens which the MSA files render inconsistently
    converter = {
        '˗': '-',
        'ı': 'ɨ',
        '_': '+',
        'ɴ̣': 'ɴ̩',
        'ŋ̣̩': 'ŋ̍',
        'ɸ͡x': 'ɸ͡x/ɸ',
        "ouɚ": "ouɚ/oɚ",
        "ʌiə": "ʌiə/ʌə",
        "aːəiə": "aːəiə/aːə",
        "œːiə": "œːiə/œːə",
        "æiə": "æiə/æə",
        "ɛeə": "ɛeə/ɛə",
        "ɛiɪ": "ɛiɪ/ɛɪ",
        "ɛɪə": "ɛɪə/ɛə",
        "ʊuʌ": "ʊuʌ/ʊʌ",
        "euə": "euə/eə",
        "aʊə": "aʊə/aə",
        "æɪə": "æɪə/æə",
        "ɛiə": "ɛiə/ɛə",
        "ɒʊe": "ɒʊe/ɒe",
        "ɪiə": "ɪiə/ɪə",
        "iɪə": "iɪə/iə",
        "æɛo": "æɛo/æo",
        "æɪɛ": "æɪɛ/æɛ",
        "əɪɜ": "əɪɜ/əɜ",
        "ɐuɐ": "ɐuɐ/ɐɐ",
        "ɔuɐ": "ɔuɐ/ɔɐ",
        "aɪɐ": "aɪɐ/aɐ",
        "ɔʊə": "ɔʊə/ɔə",
        "iuə": "iuə/yə",
        "œʊɑ": "œʊɑ/œɑ",
        "ɑʊɔ": "ɑʊɔ/ɑɔ",
        "ɔɪɛ": "ɔɪɛ/ɔɛ",
        "oʊɤ": "oʊɤ/oɤ",
        "ouə": "ouə/oə",
        "oʊə": "oʊə/oə",
        "ʊɛʊ": "ʊɛʊ/ɛʊ",
        "uˡ": "uˡ/u",
        "ɾ̆": "ɾ̆/r",
        "ıiı": "ıiı/ɨi",
        "ɛɪʊ": "ɛɪʊ/ɛʊ",
        "ʌɪɤ": "ʌɪɤ/ʌɤ",
        "ɛɪɤ": "ɛɪɤ/ɛɤ",
        "eiə": "eiə/eə",
        "eɪə": "eɪə/eə",
        "øʊə": "øʊə/øə",
        "æeo": "æeo/æo",
        "ɛɪɐ": "ɛɪɐ/ɛɐ",
        "aɪə": "aɪə/aə",
        "uɛi": "uɛi/ɛi",
        "m̆": "m̆/m",
        "ɜıi": "ɜıi/ɜi",
        "ɒʊə": "ɒʊə/ɒə",
        "ʧ": "tʃ",
        "ʦ": "ts",
        "ʨ": "tɕ",
        "ʣ": "dz",
        "ʤ": "dʒ",
        "ʥ": "dʑ",
        "ʧʰ": "tʃʰ",
        "ʦʰ": "tsʰ",
        "ʨʰ": "tɕʰ",
        "k͡χ": "kx",
        "aei": "aei/ai",
    }

    for f in progressbar(self.raw_dir.joinpath('msa').glob('*.msa')):
        msa = lingpy.align.sca.MSA(f.as_posix())
        cogid = msa.infile.split('_')[-1][:-4]
        for language, alignment in zip(msa.taxa, msa.alignment):
            alm = [converter.get(x, x) for x in alignment]
            seq = [x for x in alm if x != '-']
            lexeme = args.writer.add_form_with_segments(
                Language_ID=languages[language],
                Parameter_ID=concepts[msa.seq_id.replace('"', '')],
                Value=''.join(seq),
                Form=''.join(seq),
                Segments=seq,
                Cognacy=cogid,
                Source=sources[language])
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=cogid,
                Alignment=alm,
                Source=['List2014e'])
def cmd_makecldf(self, args): wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t") concept_lookup = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split('-')[-1] + '_' + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Chinese_Gloss=concept.attributes["chinese"], Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concept_lookup[concept.number.rjust(3, "0")] = [idx, concept] language_lookup = {k["ID_in_Source"]: k for k in self.languages} args.writer.add_languages() args.writer.add_sources() idx = 1 mapping = { 0: [ "doculect", "doculectid", "glottocode", "concept", "glossid", "value", "phonetic", "concepticon_id", "concepticon_gloss", ] } for line in progressbar(wl, desc="load the data"): if not line[0].strip(): phonetic = True if line[0] == "'Ref#": numbers = line phonetic = False idxs = defaultdict(list) elif line[0] == "Gloss": glosses = line elif line[0] in language_lookup and not phonetic: taxon = line[0] for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]): if num.strip() and gloss.strip(): cname = concept_lookup[num[1:]][1] if val: mapping[idx] = [ language_lookup[taxon]["Name"], taxon, language_lookup[taxon]["Glottocode"], cname.english, num[1:], val, "", # check later for phonetic value cname.concepticon_id, cname.concepticon_gloss, ] idxs[taxon, gloss] += [idx] idx += 1 elif line[0] in language_lookup and phonetic: taxon = line[0] for gloss, val in zip(glosses[1:], line[1:]): if gloss.strip(): these_idx = idxs.get((taxon, gloss)) if not these_idx: pass # export to lingpy wordlist in raw folder # Wordlist(mapping).output( # "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix() # ) # add data to cldf for idx in progressbar(range(1, len(mapping)), desc="cldfify", total=len(mapping)): vals = dict(zip(mapping[0], mapping[idx])) args.writer.add_forms_from_value( Language_ID=language_lookup[vals["doculectid"]]["ID"], Parameter_ID=concept_lookup[vals["glossid"]][0], Value=vals["value"], Source=["Castro2015"], ) # We explicitly remove the ISO code column since the languages in # this datasets do not have an ISO code. args.writer.cldf["LanguageTable"].tableSchema.columns = [ col for col in args.writer.cldf["LanguageTable"].tableSchema.columns if col.name != "ISO639P3code" ]
def cmd_makecldf(self, args):
    self._schema(args)
    args.writer.add_sources()

    # add the languages from the language file
    # NOTE: the source lists all languages, including proto-languages,
    # but the `forms` only include the first 41 in the list
    language_lookup = args.writer.add_languages(lookup_factory="WOLD_ID")

    desc_dir = self.cldf_dir / 'descriptions'
    if not desc_dir.exists():
        desc_dir.mkdir()
    numentries = {
        r["pk"]: int(r["count_words"])
        for r in self.raw_dir.joinpath("db").read_csv("vocabulary.csv", dicts=True)
    }
    db_contribs = {
        r['id']: r
        for r in self.raw_dir.joinpath('db').read_csv('contribution.csv', dicts=True)}
    for contrib in self.raw_dir.read_csv("contributions.csv", dicts=True):
        db_contrib = db_contribs[contrib['ID']]
        args.writer.objects["ContributionTable"].append(
            dict(
                ID=contrib["ID"],
                Name="{} vocabulary".format(contrib["Name"]),
                Citation=format_citation(contrib, numentries[contrib["ID"]]),
                Contributor=contrib["Contributors"],
                Number_of_words=numentries[contrib["ID"]],
                Language_ID=language_lookup[contrib["ID"]],
            )
        )
        desc = vocabulary_description(
            contrib['Name'], contrib["Contributors"], json.loads(db_contrib['jsondata']))
        p = desc_dir.joinpath('vocabulary_{}.md'.format(contrib['ID']))
        p.write_text(desc, encoding='utf8')

    concepticon = {
        concept.attributes['wold_id']: concept
        for concept in self.conceptlists[0].concepts.values()}
    for parameter in self.raw_dir.read_csv("parameters.csv", dicts=True):
        concept = concepticon.get(parameter['ID'])
        args.writer.add_concept(
            ID=parameter['ID'],
            Name=concept.english if concept else parameter['Name'],
            Concepticon_ID=concept.concepticon_id if concept else None,
            Concepticon_Gloss=concept.concepticon_gloss if concept else None,
            Core_list=parameter['CoreList'] == 'true',
            Semantic_field=parameter['SemanticField'],
            Semantic_category=parameter['SemanticCategory'],
            Borrowed_score=float(parameter['BorrowedScore']),
            Age_score=float(parameter['AgeScore']) if parameter['AgeScore'] else None,
            Simplicity_score=float(parameter['SimplicityScore']),
        )

    form2lexeme = {}
    wid2fid = collections.defaultdict(set)
    lexemes_rows = self.raw_dir.read_csv("forms.csv", dicts=True)
    for row in progressbar(lexemes_rows):
        # Add information not in row, so we can pass to `add_form()`
        # with a single comprehension
        row["Language_ID"] = language_lookup[row["Language_ID"]]
        row["Parameter_ID"] = row["Parameter_ID"]
        row["Value"] = row.pop("Form")
        row["Loan"] = float(row["BorrowedScore"]) > 0.6
        row["Borrowed_score"] = row["BorrowedScore"]
        row["Simplicity_score"] = row["SimplicityScore"]
        row["original_script"] = normalize_text(row["original_script"])
        row["comment_on_borrowed"] = normalize_text(row["comment_on_borrowed"])
        row.pop("Segments")
        row['Age_score'] = decimal.Decimal(row.pop('AgeScore')) if row['AgeScore'] else None
        row['Age'] = row.pop('age_label')
        row['Local_ID'] = row['ID']
        row['contact_situation'] = row['ContactSituation']
        row['Comment'] = row.pop('other_comments')
        lexemes = args.writer.add_forms_from_value(
            **{k: v for k, v in row.items() if k in self.lexeme_class.fieldnames()})
        assert len(lexemes) == 1
        form2lexeme[row['ID']] = lexemes[0]['ID']
        wid2fid[row['Word_ID']].add(lexemes[0]['ID'])

    words = {r['pk']: r for r in self.raw_dir.joinpath('db').read_csv('unit.csv', dicts=True)}
    languages = {
        r['pk']: r['name']
        for r in self.raw_dir.joinpath('db').read_csv('language.csv', dicts=True)}
    codes = {
        r['pk']: r['name']
        for r in self.raw_dir.joinpath('db').read_csv('identifier.csv', dicts=True)
        if r['type'] == 'glottolog'}
    glottocodes = {
        r['language_pk']: codes[r['identifier_pk']]
        for r in self.raw_dir.joinpath('db').read_csv('languageidentifier.csv', dicts=True)
        if r['identifier_pk'] in codes}

    wids = [w['id'] for w in words.values()]
    for wid in wid2fid:
        assert wid in wids

    count = 0
    for row in self.raw_dir.joinpath('db').read_csv('loan.csv', dicts=True):
        assert row['target_word_pk'] in words
        source_word = None
        if row['source_word_pk']:
            assert row['source_word_pk'] in words
            source_word = words[row['source_word_pk']]
        twid = words[row['target_word_pk']]['id']
        for fid in wid2fid[twid]:
            # The meaning-differentiated borrowing events.
            count += 1
            args.writer.objects['BorrowingTable'].append(dict(
                ID=str(count),
                Target_Form_ID=fid,
                Comment='Source word unidentifiable'
                if source_word['name'].lower() == 'unidentifiable' else None,
                Source_word=None
                if source_word['name'].lower() == 'unidentifiable' else source_word['name'],
                Source_meaning=source_word['description'] or None,
                Source_languoid=languages[source_word['language_pk']],
                Source_languoid_glottocode=glottocodes.get(source_word['language_pk']),
                Source_relation=row['relation'],
                Source_certain=row['certain'] == 't',
            ))
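# Note on `decimal.Decimal` for the age scores above (toy value, standalone):
# decimals round-trip the source notation exactly, while floats normalize it.
import decimal
assert str(decimal.Decimal("0.90")) == "0.90"
assert str(float("0.90")) == "0.9"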
def cmd_makecldf(self, args):
    # Write sources to CLDF
    args.writer.add_sources()

    # Collect languages and add to CLDF, also building look-up
    languages = {}
    for language in self.languages:
        args.writer.add_language(
            ID=language["ID"],
            Name=language["Name"],
            Glottocode=language["Glottocode"],
        )
        languages[language["Name"]] = {
            "ID": language["ID"],
            "Source": language["Source"].split(","),
        }

    # Collect concepts and add to CLDF, also building look-up
    concepts = collections.OrderedDict()
    for concept in self.conceptlists[0].concepts.values():
        idx = "{0}_{1}".format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Number=concept.number,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss)
        concepts[concept.english] = idx

    # Define a list of string replacements -- as the raw data is already
    # segmented, and these few cases are actually inconsistencies, this is
    # better than just using a profile
    replacements = {
        "wu": ["w", "u"],
        "wã": ["w", "ã"],
        "ja": ["j", "a"],
        "oj": ["oi"],
        "kãʔã": ["k", "ã", "ʔ", "ã"],
        "ej": ["ei̯"],
        "ij": ["ii̯"],
        "ɨp": ["ɨ", "p"],
        "ɪw": ["ɪu̯"],
        "e͂": ["ẽ"],
    }

    missing, missing2 = set(), set()
    for row, cog in progressbar(
            zip(
                self.raw_dir.read_csv("Aligned_matrix_lexical.csv", delimiter=",", dicts=True),
                self.raw_dir.read_csv("Cognate matrix.csv", delimiter=",", dicts=True),
            )):
        language = row[""].strip()
        for concept, concept_id in concepts.items():
            if concept in row:
                word = row[concept]
                if word.strip() and language.strip():
                    segments = []
                    for segment in word.split():
                        segments += replacements.get(segment, [segment])
                    lexeme = args.writer.add_form_with_segments(
                        Language_ID=languages[language]["ID"],
                        Parameter_ID=concept_id,
                        Value=row[concept],
                        Form=row[concept],
                        Segments=segments,
                        Source=languages[language]["Source"],
                    )
                    if concept in cog:
                        args.writer.add_cognate(
                            lexeme=lexeme,
                            Cognateset_ID="{0}-{1}".format(slug(concept), cog[concept]),
                            Source="gerarditupi",
                        )
                    else:
                        missing.add(concept)
            else:
                missing2.add(concept)

    # Log missing concepts
    for concept in missing:
        args.log.warn("Concept {0} could not be found".format(concept))
    for concept in missing2:
        args.log.warn("Concept {0} missing".format(concept))
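# Toy run (made-up input, standalone) of the replacement loop above: each
# whitespace-separated token is either rewritten via `replacements` or kept
# as a one-element list, so the result stays a flat list of segments.
demo_replacements = {"wu": ["w", "u"], "oj": ["oi"]}
demo_segments = []
for segment in "wu t oj".split():
    demo_segments += demo_replacements.get(segment, [segment])
assert demo_segments == ["w", "u", "t", "oi"]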
def cmd_makecldf(self, args):
    unknown_gc_cnt = 0
    html_files = get_file_paths(self.raw_dir)
    tables = find_tables(html_files)
    glottolog_codes = self.glottolog.languoids_by_code()
    glottolog_iso = self.glottolog.iso.languages

    concept_map = {
        cs.english: (cs.concepticon_id, cs.concepticon_gloss)
        for cs in self.conceptlists[0].concepts.values()
    }

    entries = []
    for table_set in tables:
        entry = NumeralsEntry(
            base_name=table_set[0],
            tables=table_set[1],
            file_name=table_set[2],
            codes=glottolog_codes,
            iso=glottolog_iso,
            title_name=table_set[3],
            source=table_set[4],
            base=table_set[5],
            comment=table_set[6],
        )
        entries.append(entry)

    seen_lg_names = {}
    lg_variant_counter = {}
    meaning_map = {}

    args.writer.add_sources(*self.raw_dir.read_bib())
    args.writer.cldf['FormTable', 'Problematic'].datatype.base = 'boolean'
    # remove newly added columns in order to get a good diff
    args.writer.cldf['FormTable'].tableSchema.columns = [
        c for c in args.writer.cldf['FormTable'].tableSchema.columns
        if c.name != 'Graphemes' and c.name != 'Profile'
    ]

    # map old lang_ids (without 'MsoNormalTable' table class)
    # against new ones to minimize diffs
    lang_id_map = {
        "hupd1244-4": ["hupd1244-2", 2 - 1],
        "hupd1244-2": ["hupd1244-3", 3 - 1],
        "hupd1244-3": ["hupd1244-4", 4 - 1],
        "nucl1440-2": ["nucl1440-1", 1 - 1],
        "nucl1440-3": ["nucl1440-2", 2 - 1],
        "nucl1440-1": ["nucl1440-3", 3 - 1],
        "poum1235-2": ["poum1235-1", 1 - 1],
        "poum1235-1": ["poum1235-2", 2 - 1],
        "wayu1241-1": ["wayu1241-2", 2 - 1],
        "wayu1241-2": ["wayu1241-1", 1 - 1],
        "port1283-1": ["port1283-2", 2 - 1],
        "port1283-2": ["port1283-1", 1 - 1],
    }

    for entry in progressbar(entries, desc="makecldf"):
        number_lexemes = entry.get_numeral_lexemes()
        for variety in number_lexemes:
            for var_id, var in variety.items():
                # build language name
                if var_id < len(entry.title_name):
                    lg_name = entry.title_name[var_id]
                elif len(entry.title_name):
                    lg_name = entry.title_name[0]
                else:
                    lg_name = entry.base_name

                if not entry.ethnologue_codes:
                    entry.ethnologue_codes = ['']

                # map 'old' glottocodes against new ones to minimize diffs
                if lg_name == 'Enlhet (Lengua), Paraguay':
                    entry.glottocodes = ['leng1262']
                if lg_name == 'Gerai, Indonesia':
                    entry.glottocodes = ['sema1269']
                if lg_name == 'Southern Ndebele, South Africa':
                    entry.glottocodes = ['sout2808']

                if not entry.glottocodes:
                    unknown_gc_cnt += 1
                    gc = ''
                    lang_id_prefix = 'xxxx%04d' % unknown_gc_cnt
                else:
                    lang_id_prefix = entry.glottocodes[0]
                    gc = lang_id_prefix

                if lg_name not in seen_lg_names:
                    seen_lg_names[lg_name] = []
                seen_lg_names[lg_name].append(entry.file_name)

                # build Contributor name
                if var_id < len(entry.source):
                    contrib = entry.source[var_id]
                else:
                    contrib = None

                # build Base
                if var_id < len(entry.base):
                    base = entry.base[var_id]
                else:
                    base = None

                # build Comment
                if var_id < len(entry.comment):
                    com = entry.comment[var_id]
                else:
                    com = ''
                if len(set(seen_lg_names[lg_name])) > 1:
                    com = "CHECK with %s: %s" % (entry.file_name, com)

                if lang_id_prefix not in lg_variant_counter:
                    lg_variant_counter[lang_id_prefix] = 0
                lg_variant_counter[lang_id_prefix] += 1
                c_lang_id = "%s-%i" % (lang_id_prefix, lg_variant_counter[lang_id_prefix])

                # map according to old table parser without 'MsoNormalTable'
                if c_lang_id in lang_id_map:
                    c_lang_id, var_id = lang_id_map[c_lang_id]

                args.writer.add_language(
                    ID=c_lang_id,
                    Name=lg_name,
                    Glottocode=gc,
                    ISO639P3code=entry.ethnologue_codes[0],
                    SourceFile=entry.file_name,
                    Contributor=contrib,
                    Base=base,
                    Comment=com,
                )

                for k, vs in var.items():
                    meaning_n = str(k)
                    for v in vs:
                        if meaning_n not in meaning_map:
                            meaning_map[meaning_n] = str(k)
                            args.writer.add_concept(
                                ID=meaning_map[meaning_n],
                                Name=meaning_n,
                                Concepticon_ID=concept_map.get(meaning_n, ('', ''))[0],
                                Concepticon_Gloss=concept_map.get(meaning_n, ('', ''))[1],
                            )
                        if v:
                            value = v.replace("\n", "").replace("\t", "")
                            # after 2 or more non-break spaces follows a comment
                            if '(' not in value:
                                value = re.sub(r'^(.*?) {2,}(.*)$', '\\1 (\\2)', value)
                            # after an em dash follows a comment
                            if '(' not in value:
                                value = re.sub(r'^(.*?)\s*–\s*(.*)$', '\\1 (\\2)', value)
                            # replace non-break spaces by spaces
                            value = value.replace(" ", " ")
                            # put single string 'foo = IPA' into brackets
                            if '=' in value and '(' not in value:
                                value = re.sub(
                                    r'^(.*?)\s(\S+\s*=\s*IPA.*)$', '\\1 (\\2)', value)
                            value, comment, other_form, loan = value_parser(value)
                            if value:
                                args.writer.add_forms_from_value(
                                    Value=value,
                                    Parameter_ID=meaning_n,
                                    Variant_ID=(var_id + 1),
                                    Language_ID=c_lang_id,
                                    Comment=comment,
                                    Source="chan2019",
                                    Other_Form=other_form,
                                    Loan=loan,
                                )

    def _x(s):
        try:
            return int(s)
        except ValueError:
            return s

    args.writer.objects['FormTable'] = sorted(
        args.writer.objects['FormTable'],
        key=lambda item: [_x(i) for i in item['ID'].split('-')])
    args.writer.objects['LanguageTable'] = sorted(
        args.writer.objects['LanguageTable'],
        key=lambda item: [_x(i) for i in item['ID'].split('-')])
    args.writer.objects['ParameterTable'] = sorted(
        args.writer.objects['ParameterTable'],
        key=lambda item: _x(item['ID']))
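# Toy run (made-up value, standalone) of the comment normalisation above:
# two or more plain spaces after the numeral are read as introducing an
# inline comment, which is wrapped in brackets for `value_parser`.
import re
assert re.sub(r'^(.*?) {2,}(.*)$', r'\1 (\2)', 'hakida   one hand') == 'hakida (one hand)'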
def cmd_makecldf(self, args):
    languages = {
        o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "languages.json")
    }
    words = {
        o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "words.json")
    }
    sources = {
        o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "sources.json")
    }

    # handle sources
    # we want to make sure that the bibtex key matches our source id.
    for source in sorted(sources):
        # this is ugly, I wish pybtex made this easier!
        bib = parse_string(sources[source]["bibtex"], "bibtex")
        old_key = list(bib.entries.keys())[0]
        bib.entries[old_key].key = source
        bib.entries = OrderedCaseInsensitiveDict([(source, bib.entries[old_key])])
        args.writer.add_sources(bib)

    # handle languages
    for lang in sorted(languages):
        args.writer.add_language(
            ID=lang,
            Name=languages[lang]["fullname"],
            ISO639P3code=languages[lang]["isocode"],
            Glottocode=languages[lang]["glottocode"],
        )

    # handle concepts
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss)
        concepts[concept.english] = idx
        concepts[concept.english.replace(" ", "-")] = idx
        concepts[concept.english.replace(" ", "-").lower()] = idx
        concepts[slug(concept.english)] = idx
        concepts["-".join([slug(x) for x in concept.english.split()])] = idx
        if '(' in concept.english:
            new_string = concept.english[:concept.english.index('(') - 1]
            concepts["-".join([slug(x) for x in new_string.split()])] = idx
            concepts[concept.english[:concept.english.index('(') - 1]] = idx
            concepts[
                concept.english[:concept.english.index('(') - 1].replace(' ', '-').lower()
            ] = idx
        if concept.english.startswith("to "):
            new_string = concept.english[3:]
            concepts['-'.join([slug(x) for x in new_string.split()])] = idx
            concepts[concept.english.replace("to ", "")] = idx

    # manual mappings from slugs used in the raw data to concept glosses
    concepts["mans-mother-law"] = concepts["man's mother in law"]
    concepts["brother-law"] = concepts["brother in law"]
    concepts["to-make-hole"] = concepts["make hole (in ground)"]
    concepts["front"] = concepts["in front"]
    concepts["husk-nut"] = concepts["husk (of nut)"]
    concepts["his"] = concepts["his, hers, its (pronoun p:3s)"]
    concepts["we-two-incl"] = concepts["we incl. dual (pronoun d:1p, incl, dual)"]
    concepts["intrnasitivizer"] = concepts["intransitivizer"]
    concepts["short-piece-wood"] = concepts["short-piece-of-wood"]
    concepts["top-foot"] = concepts["top (of foot)"]
    concepts["sit-feet-and-legs-together"] = concepts["sit (with feet and legs together)"]
    concepts["earth"] = concepts["earth/soil"]
    concepts["warm"] = concepts["warm/hot"]
    concepts["your-sg"] = concepts["your (pronoun: p:2s)"]
    concepts["-law"] = concepts["in-law"]
    concepts["to-roast"] = concepts["roast"]
    concepts["arrow-barred"] = concepts["arrow (barred) (Arrow with cross bar)"]
    concepts["them-dual"] = concepts["them (pronoun o:3p, dual)"]
    concepts["you-dual"] = concepts["you (pronoun d:2s)"]
    concepts["right-correct"] = concepts["right (correct, true)"]
    concepts["betelpepper"] = concepts["betelpepper vine"]
    concepts["to-chop"] = concepts["to chop, cut down"]
    concepts["road"] = concepts["road/path"]
    concepts["for-benefactive-clitic"] = concepts[
        "for (benefactive) ((cliticised or suffixed to noun))"]
    concepts["mans-father-law"] = concepts["mans' father in law"]
    concepts["sister-law"] = concepts["sister in law"]
    concepts["you-o2s"] = concepts["you (pronoun o:2s)"]
    concepts["you-pl-o2p"] = concepts["you pl. (pronoun o:2p)"]
    concepts["we-pl-incl"] = concepts["we incl. (pronoun d:1p, incl)"]
    concepts["in"] = concepts["in, inside"]
    concepts["not_know"] = concepts["not know"]
    concepts["their-dual"] = concepts["their (pronoun p:3p, dual)"]
    concepts["blow-fire"] = concepts["blow (on fire)"]
    concepts["blunt-eg-knife"] = concepts["blunt (of e.g. knife)"]
    concepts["our-dual"] = concepts["our (two) (pronoun p:1p, dual)"]
    concepts["your-pl-dual"] = concepts["your (two) pl (pronoun p:2p, dual)"]
    concepts["suck-breast"] = concepts["to suck at breast"]
    concepts["draw-water-carry"] = concepts["draw water / carry"]
    concepts["tree-sp-Gnetum-gnemon"] = concepts["tree sp. (Gnetum gnemon)"]
    concepts["he-she"] = concepts["he, she, it, that, those"]
    concepts["fed"] = concepts["fed up (with)"]
    concepts["you-pl-dual-o2p"] = concepts["you plural two (pronoun d:2p, dual)"]
    concepts["you-pl-dual"] = concepts["you two (pronoun d:2s, dual)"]
    concepts["to-put"] = concepts["to put, give"]
    concepts["he-she-it-those"] = concepts["he, she, it, that, those"]
    concepts["we-two-excl"] = concepts["we excl. dual (pronoun d:1p, excl, dual)"]
    concepts["we-pl-excl"] = concepts["we excl. plural (pronoun d:1p, excl, plural)"]
    # concepts["affix-body-part"] = concepts[""]

    itemfiles = [
        f for f in self.raw_dir.iterdir() if f.name.startswith("language-")
    ]
    errors = set()
    for filename in progressbar(sorted(itemfiles), desc="adding lexemes"):
        for o in sorted(self.raw_dir.read_json(filename), key=lambda d: d["id"]):
            wordid = self.get_slug_from_uri(o['word'])
            if wordid in concepts:
                args.writer.add_forms_from_value(
                    Local_ID=o["id"],
                    Language_ID=self.get_slug_from_uri(o["language"]),
                    Parameter_ID=concepts[wordid],
                    Value=o["entry"],
                    Source=self.get_slug_from_uri(o["source"]),
                    Comment=o["annotation"],
                )
            else:
                errors.add(("concept", wordid))
    for error in errors:
        args.log.info("error with {0[0]}: {0[1]}".format(error))
    args.log.info("found {0} errors in concepts".format(len(errors)))
def cmd_makecldf(self, args):
    if not hasattr(self, 'form_placeholder'):
        self.form_placeholder = None
    if not hasattr(self, 'only_proto_forms'):
        # special case for MixeZoque
        self.only_proto_forms = False

    sound_cat = self.raw_dir.read_json(self.catalog_file_name)

    # add sources
    args.writer.add_sources()

    # add concepts and languages from explicit files
    concepts = {}
    for concept in self.concepts:
        args.writer.add_concept(**concept)
        concepts[concept['IndexInSource']] = concept['ID']
    languages = {}
    proto_lgs = []
    for language in self.languages:
        args.writer.add_language(**language)
        languages[language['IndexInSource']] = language['ID']
        if language.get('IsProto', '') == 'True':
            proto_lgs.append(language['IndexInSource'])

    # Load JSON data
    json_data = self.raw_dir.read_json(self.data_file_name)

    # collect missing languages
    missing = set()
    # collect lexemes with no transcription but with audio
    only_snd = []

    media = []
    args.writer.cldf.add_table(
        'media.csv',
        {
            'name': 'ID',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#id',
            'valueUrl': 'https://cdstar.shh.mpg.de/bitstreams/{objid}/{fname}',
        },
        'objid',
        'fname',
        'mimetype',
        {'name': 'size', 'datatype': 'integer'},
        {
            "name": "Form_ID",
            "required": True,
            "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#formReference",
            "datatype": "string",
        },
        primaryKey=['ID'],
    )
    args.writer.cldf.add_foreign_key('media.csv', 'Form_ID', 'FormTable', 'ID')

    # Add lexemes
    for idx in progressbar(
            sorted(
                json_data['transcriptions'],
                key=lambda k: (
                    int(json_data['transcriptions'][k]['LanguageIx']),
                    int(json_data['transcriptions'][k]['IxElicitation']),
                    int(json_data['transcriptions'][k]['IxMorphologicalInstance']))),
            desc='makecldf'):
        lexeme = json_data['transcriptions'][idx]

        if lexeme['LanguageIx'] not in languages:  # pragma: no cover
            missing.add(lexeme['LanguageIx'])
            continue

        # If an entry is marked as 'isDummy' => it only has audio.
        if 'isDummy' in lexeme:
            if self.form_placeholder:
                if isinstance(lexeme['soundPaths'], list) and isinstance(
                        lexeme['soundPaths'][0], list):
                    lexeme['Phonetic'] = [self.form_placeholder] * len(lexeme['soundPaths'])
                    lexeme['AlternativePhoneticRealisationIx'] = \
                        ['0'] * len(lexeme['soundPaths'])
                    lexeme['WCogID'] = [''] * len(lexeme['soundPaths'])
                else:
                    lexeme['Phonetic'] = self.form_placeholder
                    lexeme['AlternativePhoneticRealisationIx'] = '0'
                    lexeme['WCogID'] = ''
                try:
                    lexeme['path'] = lexeme['soundPaths'][0].split('/')[-1].split('.')[0]
                except AttributeError:
                    lexeme['path'] = lexeme['soundPaths'][0][0].split('/')[-1].split('.')[0]
                lexeme['AlternativeLexemIx'] = '0'
                lexeme['RootIsLoanWordFromKnownDonor'] = '0'

        # Replace all forms by 'form_placeholder' if the language is not a proto
        # language - a special case for MixeZoque only.
        if self.only_proto_forms \
                and self.form_placeholder \
                and lexeme['LanguageIx'] not in proto_lgs:
            if isinstance(lexeme['Phonetic'], str):
                lexeme['Phonetic'] = [lexeme['Phonetic']]
                lexeme['path'] = [lexeme['path']]
                lexeme['soundPaths'] = [lexeme['soundPaths']]
                lexeme['WCogID'] = [lexeme['WCogID']]
            for i, v in enumerate(lexeme['Phonetic']):
                if len(lexeme['soundPaths'][0]) > 0 and len(lexeme['soundPaths'][i][0]) > 0:
                    if lexeme['path'][i] in sound_cat:
                        lexeme['Phonetic'][i] = self.form_placeholder
                    else:
                        lexeme['Phonetic'][i] = ''
                else:
                    lexeme['Phonetic'][i] = ''

        if 'Phonetic' not in lexeme:
            if 'isDummy' in lexeme:
                only_snd.append(lexeme)
            continue

        # If there is only one elicitation for a meaning it comes as a plain
        # string (otherwise as a list). Turn the relevant items into lists as well.
        if isinstance(lexeme['Phonetic'], str):
            lexeme['Phonetic'] = [lexeme['Phonetic']]
            lexeme['path'] = [lexeme['path']]
            lexeme['soundPaths'] = [lexeme['soundPaths']]
            lexeme['WCogID'] = [lexeme['WCogID']]

        ref_id = None
        last_altlex = None
        for i, value in enumerate(lexeme['Phonetic']):
            v = value.strip()
            # Skip if value is empty
            if not v or v in self.form_spec.missing_data:
                continue
            # Commas are not allowed!
            if ',' in v:  # pragma: no cover
                args.log.warning('Comma not allowed in /{0}/ for {1} - {2}'.format(
                    value, languages[lexeme['LanguageIx']], lexeme['IxElicitation']))
            param_id = concepts['{0}-{1}'.format(
                lexeme['IxElicitation'], lexeme['IxMorphologicalInstance'])]
            new = args.writer.add_form(
                Language_ID=languages[lexeme['LanguageIx']],
                Local_ID='{0}-{1}-{2}'.format(
                    lexeme['LanguageIx'],
                    lexeme['IxElicitation'],
                    lexeme['IxMorphologicalInstance']),
                Parameter_ID=param_id,
                Value=v,
                Form=v,
                Loan=(lexeme['RootIsLoanWordFromKnownDonor'] == '1'),
                Source=self.get_source_id_array(lexeme),
                Variant_Of=ref_id
                if int(lexeme['AlternativePhoneticRealisationIx'][i]) > 0 else None,
            )

            # add media
            if len(lexeme['soundPaths'][0]) > 0 and len(lexeme['soundPaths'][i][0]) > 0:
                if lexeme['path'][i] in sound_cat:
                    for bs in sorted(
                            sound_cat[lexeme['path'][i]]['bitstreams'],
                            key=lambda x: x['content-type']):
                        media.append({
                            'ID': bs['checksum'],
                            'fname': bs['bitstreamid'],
                            'objid': sound_cat[lexeme['path'][i]]['id'],
                            'mimetype': bs['content-type'],
                            'size': bs['filesize'],
                            'Form_ID': new['ID'],
                        })
                else:  # pragma: no cover
                    args.log.warning('Missing sound file name in catalog {0}.'.format(
                        lexeme['path'][i]))

            # Remember the last inserted ID for alternative pronunciations to set
            # 'Variant_Of'. This works because the downloaded JSON data are sorted
            # by altlex and altpron.
            if last_altlex != int(lexeme['AlternativeLexemIx'][i]):
                ref_id = new['ID']
                last_altlex = int(lexeme['AlternativeLexemIx'][i])

            # add cognate if desired
            if self.create_cognates:
                wcogid = None
                if lexeme['WCogID'][i].strip():
                    try:
                        wid = int(lexeme['WCogID'][i])
                        if wid > 0:
                            wcogid = '{0}-{1}'.format(param_id, wid)
                    except ValueError:
                        wcogid = '{0}-{1}'.format(param_id, lexeme['WCogID'][i])
                if wcogid:
                    args.writer.add_cognate(
                        lexeme=new,
                        Cognateset_ID=wcogid,
                        Source=self.source_id_array,
                    )

    args.writer.write(**{'media.csv': media})

    if self.form_placeholder:
        args.writer.cldf['FormTable', 'Value'].common_props['dc:description'] = \
            '► := no value, but audio'
        args.writer.cldf['FormTable', 'Form'].common_props['dc:description'] = \
            '► := no form, but audio'

    for m in sorted(missing):  # pragma: no cover
        args.log.warning('Missing language with ID {0}.'.format(m))

    if len(only_snd):
        args.log.info('Consider using "form_placeholder" to import lexemes without '
                      'transcription but with audio:')
        for m in only_snd:  # pragma: no cover
            args.log.warning('Missing transcription for {0}-{1}-{2}.'.format(
                m['LanguageIx'], m['IxElicitation'], m['IxMorphologicalInstance']))
def cmd_makecldf(self, args):
    from pybtex import errors, database
    errors.strict = False
    bibdata = database.parse_file(
        str(self.raw_dir.joinpath('bibliography', 'sources.bib')))
    args.writer.add_sources(bibdata)

    args.writer["FormTable", "Segments"].datatype = Datatype.fromvalue({
        "base": "string",
        "format": "([\\S]+)( [\\S]+)*",
    })
    args.writer["FormTable", "Morphemes"].separator = " "
    args.writer["FormTable", "PartialCognates"].separator = " "

    concepts = {}
    errors, blacklist = set(), set()
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Portuguese_Gloss=concept.attributes["portuguese"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            EOL_ID=concept.attributes["eol"],
            Semantic_Field=concept.attributes["semantic_field"])
        concepts[concept.english] = idx

    languages = {}
    sources = {}
    for row in self.languages:
        if not -90 < float(row['Latitude']) < 90:
            errors.add('LATITUDE {0}'.format(row['Name']))
        elif not -180 < float(row['Longitude']) < 180:
            errors.add('LONGITUDE {0}'.format(row['Name']))
        else:
            try:
                args.writer.add_language(
                    ID=row['ID'],
                    Name=row['Name'],
                    SubGroup=row['SubGroup'],
                    Latitude=row['Latitude'],
                    Longitude=row['Longitude'],
                    Glottocode=row['Glottocode'] if row['Glottocode'] != '???' else None,
                )
                languages[row['Name']] = row['ID']
                sources[row['Name']] = []
                for source in row['Sources'].split(','):
                    if source in bibdata.entries:
                        sources[row['Name']] += [source]
                    else:
                        errors.add('BIBTEX MISSING {0}'.format(source))
            except ValueError:
                errors.add('LANGUAGE ID {0}'.format(row['ID']))
                args.log.warn('Invalid Language ID {0}'.format(row['ID']))

    wl = lingpy.Wordlist(self.raw_dir.joinpath('tuled.tsv').as_posix())
    etd = wl.get_etymdict(ref='cogids')
    alignments, problems = {}, set()
    for cogid, vals in progressbar(etd.items(), desc='aligning data'):
        idxs = []
        for idx in vals:
            if idx:
                idxs += idx
        positions = [wl[idx, 'cogids'].index(cogid) for idx in idxs]
        alms, new_idxs = [], []
        for idx, pos in zip(idxs, positions):
            try:
                tks = lingpy.basictypes.lists(wl[idx, 'tokens']).n[pos]
                if not ' '.join(tks).strip():
                    raise IndexError
                alms += [tks]
                new_idxs += [(idx, pos)]
            except IndexError:
                problems.add((idx, pos))
        if alms:
            msa = lingpy.Multiple(alms)
            msa.prog_align()
            for i, alm in enumerate(msa.alm_matrix):
                alignments[new_idxs[i][0], new_idxs[i][1], cogid] = ' '.join(alm)
        else:
            errors.add('ALIGNMENT empty {0}'.format(cogid))

    bipa = CLTS(args.clts.dir).bipa
    for idx, tokens, glosses, cogids, alignment in wl.iter_rows(
            'tokens', 'morphemes', 'cogids', 'alignment'):
        tl, gl, cl, al = (
            len(lingpy.basictypes.lists(tokens).n),
            len(glosses),
            len(cogids),
            len(lingpy.basictypes.lists(alignment).n))
        if tl != gl or tl != cl or gl != cl or al != gl or al != cl:
            errors.add('LENGTH: {0} {1} {2}'.format(
                idx, wl[idx, 'language'], wl[idx, 'concept']))
            blacklist.add(idx)
        for token in tokens:
            if bipa[token].type == 'unknownsound':
                errors.add('SOUND: {0}'.format(token))
                blacklist.add(idx)

    visited = set()
    for idx in wl:
        if wl[idx, 'concept'] not in concepts:
            if wl[idx, 'concept'] not in visited:
                args.log.warn('Missing concept {0}'.format(wl[idx, 'concept']))
                visited.add(wl[idx, 'concept'])
                errors.add('CONCEPT {0}'.format(wl[idx, 'concept']))
        elif wl[idx, 'doculect'] not in languages:
            if wl[idx, 'doculect'] not in visited:
                args.log.warn("Missing language {0}".format(wl[idx, 'doculect']))
                visited.add(wl[idx, 'doculect'])
                errors.add('LANGUAGE {0}'.format(wl[idx, 'doculect']))
        else:
            if ''.join(wl[idx, 'tokens']).strip() and idx not in blacklist:
                lex = args.writer.add_form_with_segments(
                    Language_ID=languages[wl[idx, 'doculect']],
                    Parameter_ID=concepts[wl[idx, 'concept']],
                    Value=wl[idx, 'value'] or ''.join(wl[idx, 'tokens']),
                    Form=wl[idx, 'form'] or ''.join(wl[idx, 'tokens']),
                    Segments=wl[idx, 'tokens'],
                    Morphemes=wl[idx, 'morphemes'],
                    SimpleCognate=wl[idx, 'cogid'],
                    PartialCognates=wl[idx, 'cogids'],
                    Source=sources[wl[idx, 'doculect']],
                )
                for gloss_index, cogid in enumerate(wl[idx, 'cogids']):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cogid,
                        Segment_Slice=gloss_index + 1,
                        Alignment=alignments.get((idx, gloss_index, cogid), ''),
                        Alignment_Method='SCA')
            else:
                args.log.warn(
                    'Entry ID={0}, concept={1}, language={2} is empty'.format(
                        idx, wl[idx, 'concept'], wl[idx, 'doculect']))

    with open(self.dir.joinpath('errors.md'), 'w', encoding="utf-8") as f:
        f.write('# Error Analysis for TULED\n')
        for error in sorted(errors):
            f.write('* ' + error + '\n')
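# A minimal standalone sketch (toy segment lists, not TULED data) of the lingpy
# progressive alignment used above; the loop only relies on `alm_matrix`
# holding one equal-length aligned row per input sequence.
import lingpy
msa = lingpy.Multiple([["t", "a", "n"], ["t", "n"]])
msa.prog_align()
assert len(msa.alm_matrix) == 2
assert len({len(row) for row in msa.alm_matrix}) == 1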
def cmd_makecldf(self, args): args.writer.add_sources() concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split("-")[-1] + "_" + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Number=concept.number, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concepts[concept.number] = idx languages = args.writer.add_languages(lookup_factory="Name") # we combine with the manually edited wordlist to retrieve the lexeme # values wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix()) mapper = { (concept, language, normalize("NFD", form)): segments for (idx, concept, language, form, segments ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens') } data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False) for i, row in progressbar(enumerate(data[4:])): number = row[0].strip().strip('.') concept = row[1].strip() for j in range(0, len(row) - 2, 2): language = data[2][j + 2] value = row[j + 2] if value.strip() and value.strip() not in ['-----']: if ',' in row[j + 2]: forms = [v.strip() for v in value.split(',')] cogids = [ str(int(float(x))) for x in row[j + 3].split(' or ') ] else: forms = [value.strip()] cogids = [str(int(float(row[j + 3].split(' or ')[0])))] for form, cogid in zip(forms, cogids): try: segments = mapper[concept, languages[language], form] lexeme = args.writer.add_form_with_segments( Parameter_ID=concepts[number], Language_ID=languages[language], Value=value.strip(), Form=form, Segments=segments, Source="Deepadung2015") except: args.log.warn( 'lexeme missing {0} / {1} / {2}'.format( concept, language, form)) lexeme = args.writer.add_form( Parameter_ID=concepts[number], Language_ID=languages[language], Value=value.strip(), Form=form, Source="Deepadung2015") args.writer.add_cognate(lexeme=lexeme, Cognateset_ID=cogid + '-' + number, Source="Deepadung2015")