class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "mcelhanonhuon"
    form_spec = FormSpec(missing_data=("-", ""))

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.
        """
        args.writer.add_sources()
        languages = args.writer.add_languages(
            # IDs are title case in the language list but lower case in the raw data
            lookup_factory=lambda l: l["ID"].lower())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        cog = CognateRenumber()
        for row in self.raw_dir.read_csv("mcelhanon-1967.tsv", dicts=True, delimiter="\t"):
            lex = args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=languages[row["Language"]],
                Parameter_ID=concepts[row["Word"]],
                Value=row["Gloss"],
                Comment=row["Annotation"],
                Source="McElhanon1967",
            )
            # drop empty strings so a blank Cognacy cell yields a singleton
            cognates = [c for c in row["Cognacy"].split(",") if c]
            if len(cognates) == 0:  # singleton
                cog_id = cog.get_cogid()
            elif len(cognates) == 1:
                cog_id = cog.get_cogid(cognates[0])
            else:
                raise ValueError("Multiple cognates per lexeme are not handled")
            assert len(lex) == 1, "Should only have one lexeme"
            args.writer.add_cognate(
                lexeme=lex[0], Cognateset_ID=cog_id, Source="McElhanon1967")

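# `CognateRenumber` is defined elsewhere in the mcelhanonhuon module and is not
# part of this listing. Below is a minimal sketch of the behaviour the code
# above relies on; it is a hypothetical stand-in, not the dataset's actual
# helper: `get_cogid()` without an argument mints a fresh singleton
# cognate-set ID, while `get_cogid(label)` returns a stable ID for each
# distinct raw cognacy label.
class CognateRenumber:
    def __init__(self):
        self._next = 0       # last ID handed out
        self._by_label = {}  # raw cognacy label -> renumbered ID

    def get_cogid(self, label=None):
        if label is None:
            # singleton: always a brand-new cognate set
            self._next += 1
            return self._next
        if label not in self._by_label:
            self._next += 1
            self._by_label[label] = self._next
        return self._by_label[label]
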
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "deepadungpalaung"
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(separators=',')

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        concepts = {}
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
            args.writer.add_concept(
                ID=idx,
                Name=concept.english,
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            concepts[concept.number] = idx
        languages = args.writer.add_languages(lookup_factory="Name")

        # here we need to add the lexemes
        data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False)
        for i, row in progressbar(enumerate(data[4:])):
            number = row[0].strip().strip('.')
            for j in range(0, len(row) - 2, 2):
                language = data[2][j + 2]
                value = row[j + 2]
                if value.strip() and value.strip() not in ['-----']:
                    if 'or' not in row[j + 3]:
                        cogid = str(int(float(row[j + 3])))
                    else:
                        cogid = row[j + 3].split()[0]
                    for lexeme in args.writer.add_forms_from_value(
                            Parameter_ID=concepts[number],
                            Language_ID=languages[language],
                            Value=value.strip(),
                            Source='Deepadung2015'):
                        args.writer.add_cognate(
                            lexeme=lexeme,
                            Cognateset_ID=cogid + '-' + number,
                            Source='Deepadung2015')

class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "zhaobai"
    concept_class = CustomConcept
    language_class = CustomLanguage
    form_spec = FormSpec(separators=";/,")

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        # TODO: add concepts with `add_concepts`
        args.writer.add_language(
            ID="ZhaozhuangBai",
            Glottocode="dali1242",
            ChineseName="趙莊白語",
            Name="Zhaozhuang Bai",
            Latitude=25.5844078,
            Longitude=100.3117,
            Family="Sino-Tibetan",
            DialectGroup="Southern Bai",
        )
        for concept in self.conceptlists[0].concepts.values():
            idx = concept.number + "_" + slug(concept.gloss)
            args.writer.add_concept(
                ID=idx,
                Name=concept.gloss,
                Chinese_Gloss=concept.attributes["chinese"],
                Number=concept.number,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )
            args.writer.add_forms_from_value(
                Language_ID="ZhaozhuangBai",
                Parameter_ID=idx,
                Value=concept.attributes["form"],
                Source="Zhao2006",
            )
        # We explicitly remove the ISO column since none of the languages in
        # this dataset have an ISO code.
        args.writer.cldf["LanguageTable"].tableSchema.columns = [
            col for col in args.writer.cldf["LanguageTable"].tableSchema.columns
            if col.name != "ISO639P3code"
        ]

class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "zgraggenmadang"
    language_class = CustomLanguage
    form_spec = FormSpec(
        missing_data=[
            "-0̸-", "(ya)-", "xx kater", "Vb -0̸-", "-", "0̸", "0-", "?",
            "- ", "0̸-", "_", "-0̸",
        ],
        replacements=[(" ", "_"), ("_+_give", "")],
    )

    def cmd_download(self, args):
        pass

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        languages = args.writer.add_languages(
            id_factory=lambda l: l["Name"],
            lookup_factory=lambda l: (l["Name"], l["Source"]))
        sources = {k[0]: k[1] for k in languages}  # language: source map
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")
        for row in progressbar(
                self.raw_dir.read_csv("madang.csv", dicts=True, delimiter="\t")):
            concept = CONCEPT_REMAPPING.get(row["CONCEPT"], row["CONCEPT"])
            args.writer.add_forms_from_value(
                Local_ID=row["ID"],
                Language_ID=row["DOCULECT"],
                Parameter_ID=concepts[concept],
                Value=row["COUNTERPART"],
                Source=sources[row["DOCULECT"]],
            )

class Dataset(BaseDataset): dir = pathlib.Path(__file__).parent id = "aaleykusunda" language_class = CustomLanguage form_spec = FormSpec(separators="~;,/", missing_data=["∅"], first_form_only=True) def cmd_makecldf(self, args): # add bib args.writer.add_sources() args.log.info("added sources") # add concept concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name", ) # fix concept lookup concepts["the barley (Tibetan or highland)"] = concepts[ "the barley (tibetan or highland)"] concepts["to plant (vegetables, rice)"] = concepts[ "to plant (vegetals, rice)"] args.log.info("added concepts") # add language languages = args.writer.add_languages(lookup_factory="Name") args.log.info("added languages") # read in data data = self.raw_dir.read_csv("Kusunda_2019_250_lexical_items.tsv", delimiter="\t", dicts=True) # add data for entry in pb(data, desc="cldfify", total=len(data)): if entry["ENGLISH"] in concepts.keys(): for key, val in languages.items(): args.writer.add_forms_from_value( Language_ID=val, Parameter_ID=concepts[entry["ENGLISH"]], Value=entry[key], Source=["Bodt2019b"], )
class Dataset(BaseDataset): dir = pathlib.Path(__file__).parent id = "walworthpolynesian" form_spec = FormSpec(first_form_only=True) def cmd_makecldf(self, args): args.writer.add_sources(*self.raw_dir.read_bib()) languages = args.writer.add_languages( lookup_factory=lambda l: l['Name'] ) concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name", ) concepts['ash'] = '146_ashes' wl = Wordlist(str(self.raw_dir / 'polynesian-aligned_22112018_corrected.tsv')) for idx in sorted(wl): wl[idx, 'segments'] = fix_segments(wl[idx, 'segments']) lex = args.writer.add_form_with_segments( Language_ID=languages.get(wl[idx, 'doculect']), Parameter_ID=concepts.get(wl[idx, 'concept']), Value=wl[idx, 'value'], Form=wl[idx, 'form'], Segments=[{'_': '+', "mh": "mʰ"}.get(x, x) for x in wl[idx, 'segments']], Source=[wl[idx, 'source']], Cognacy=wl[idx, 'cogid'], Loan=to_boolean(wl[idx, 'loan']), Comment=wl[idx, 'comment'] ) args.writer.add_cognate( lexeme=lex, Source=['walworth_mary_2018_1689909'], Cognateset_ID=wl[idx, 'cogid'] )
class Dataset(BaseDataset): dir = Path(__file__).parent id = "kinbank" language_class = CustomLanguage concept_class = CustomConcept form_spec = FormSpec(brackets={ "[": "]", "{": "}", "(": ")", "‘": "’" }, separators=";/,", missing_data=('?', '-', '', ''), strip_inside_brackets=True) def cmd_makecldf(self, args): languages = args.writer.add_languages(lookup_factory='Label') concepts = args.writer.add_concepts(id_factory=lambda c: c.id, lookup_factory="Parameter") for filename in sorted(self.raw_dir.glob("*/*.csv")): lang_id = languages[filename.stem] for row in self.raw_dir.read_csv(filename, dicts=True): concept_id = concepts.get(row['parameter'], row['parameter']) # default to IPA column if present otherwise use word column value = row['ipa'] if len(row['ipa']) else row['word'] if value: lex = args.writer.add_forms_from_value( Language_ID=lang_id, Parameter_ID=concept_id, Value=value, Comment=row['comment'], Source=row['source_bibtex'], ) args.writer.add_sources()
class Dataset(BaseDataset): dir = Path(__file__).parent id = "chindialectsurvey" # add your personalized data types here concept_class = CustomConcept language_class = CustomLanguage # define the way in which forms should be handled form_spec = FormSpec( brackets={ "(": ")", "[": "]" }, separators=";/,", missing_data=("?", "-"), strip_inside_brackets=True, ) def cmd_makecldf(self, args): """ Convert the raw data to a CLDF dataset. """ data = self.raw_dir.read_csv('wordlist.tsv', dicts=True, delimiter='\t') args.writer.add_sources() languages = args.writer.add_languages(lookup_factory="ID") concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") for row in progressbar(data, desc="cldfify"): if row["DOCULECT"] in languages: args.writer.add_forms_from_value( Language_ID=row["DOCULECT"], Parameter_ID=concepts[row["CONCEPT"]], Value=row["TRANSCRIPTION"], Source=["chinds"], )
class Dataset(BaseDataset): dir = Path(__file__).parent id = "chingelong" language_class = CustomLanguage form_spec = FormSpec(missing_data=("---", ), separators="/", replacements=[(" ", "_")]) def cmd_makecldf(self, args): """ Convert the raw data to a CLDF dataset. """ concepts = {} for concept in self.conceptlists[0].concepts.values(): cid = '{0}_{1}'.format(concept.number, slug(concept.english)) args.writer.add_concept( ID=cid, Name=concept.english, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concepts[concept.english] = cid languages = args.writer.add_languages(lookup_factory="Name_in_Source") args.writer.add_sources() for row in self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True): for language, lid in languages.items(): form = row[language].strip() if form and form != '---': args.writer.add_forms_from_value( Language_ID=lid, Parameter_ID=concepts[row['English']], Value=form, Source='Chin2015')
class Dataset(BaseDataset): dir = Path(__file__).parent id = "deepadungpalaung" concept_class = CustomConcept language_class = CustomLanguage form_spec = FormSpec(separators=',', ) def cmd_download(self, args): print('updating ...') with open(self.raw_dir.joinpath("deepadungpalaung.tsv"), "w", encoding="utf-8") as f: f.write(fetch("deepadungpalaung")) def cmd_makecldf(self, args): args.writer.add_sources() concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split("-")[-1] + "_" + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Number=concept.number, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concepts[concept.number] = idx languages = args.writer.add_languages(lookup_factory="Name") # we combine with the manually edited wordlist to retrieve the lexeme # values wl = Wordlist(self.raw_dir.joinpath('deepadungpalaung.tsv').as_posix()) mapper = { (concept, language, normalize("NFD", form)): segments for (idx, concept, language, form, segments ) in wl.iter_rows('concept', 'doculect', 'form', 'tokens') } data = self.raw_dir.read_csv('100item-phylo.Sheet1.csv', dicts=False) for i, row in progressbar(enumerate(data[4:])): number = row[0].strip().strip('.') concept = row[1].strip() for j in range(0, len(row) - 2, 2): language = data[2][j + 2] value = row[j + 2] if value.strip() and value.strip() not in ['-----']: if ',' in row[j + 2]: forms = [v.strip() for v in value.split(',')] cogids = [ str(int(float(x))) for x in row[j + 3].split(' or ') ] else: forms = [value.strip()] cogids = [str(int(float(row[j + 3].split(' or ')[0])))] for form, cogid in zip(forms, cogids): try: segments = mapper[concept, languages[language], form] lexeme = args.writer.add_form_with_segments( Parameter_ID=concepts[number], Language_ID=languages[language], Value=value.strip(), Form=form, Segments=segments, Source="Deepadung2015") except: args.log.warn( 'lexeme missing {0} / {1} / {2}'.format( concept, language, form)) lexeme = args.writer.add_form( Parameter_ID=concepts[number], Language_ID=languages[language], Value=value.strip(), Form=form, Source="Deepadung2015") args.writer.add_cognate(lexeme=lexeme, Cognateset_ID=cogid + '-' + number, Source="Deepadung2015")
class Dataset(BaseDataset): dir = Path(__file__).parent id = "wichmannmixezoquean" language_class = CustomLanguage form_spec = FormSpec(brackets={ "(": ")", "[": "]" }, separators=",~", missing_data=("?", "-")) def cmd_makecldf(self, args): args.writer.add_sources() languages = args.writer.add_languages( lookup_factory=lambda l: l["Abbreviation"]) concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") # add multiple forms concepts.update({ # note the mishmash of different dashes etc handled here. "hair - 1": "36_hair", "hair - 2": "36_hair", "see - 1": "72_see", "see - 2": "72_see", "stand - 1": "79_stand", "stand - 2": "79_stand", "stand -2": "79_stand", "walk/go - 1": "92_walkgo", "walk/go - 2": "92_walkgo", "worm - 1": "109_worm", "worm – 2": "109_worm", "worm - 2": "109_worm", }) sources = {l["Abbreviation"]: l["Source"] for l in self.languages} data = zip( self.raw_dir.read_csv("Wordlist.txt", delimiter="\t"), self.raw_dir.read_csv("Cognates.txt", delimiter="\t"), ) cogidx = 1 header = None for i, (row1, row2) in enumerate(data): if i == 0: header = row1[1:] else: concept_id = concepts[row1[0].strip()] for lang_abbrev, word, cog in zip(header, row1[1:], row2[1:]): if word.strip(): if cog.strip().lower() != "na": cogid = concept_id + "-" + cog else: cogid = str(cogidx) cogidx += 1 for row in args.writer.add_forms_from_value( Language_ID=languages[lang_abbrev], Parameter_ID=concept_id, Value=word, Source=sources[lang_abbrev], Cognacy=cogid, ): args.writer.add_cognate(lexeme=row, Cognateset_ID=cogid, Source="Cysouw2006a")
class Dataset(BaseDataset): id = "castrosui" dir = Path(__file__).parent concept_class = CustomConcept language_class = CustomLanguage form_spec = FormSpec(separators=",") def cmd_makecldf(self, args): wl = self.raw_dir.read_csv("wordlist.tsv", delimiter="\t") concept_lookup = {} for concept in self.conceptlists[0].concepts.values(): idx = concept.id.split('-')[-1] + '_' + slug(concept.english) args.writer.add_concept( ID=idx, Name=concept.english, Chinese_Gloss=concept.attributes["chinese"], Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concept_lookup[concept.number.rjust(3, "0")] = [idx, concept] language_lookup = {k["ID_in_Source"]: k for k in self.languages} args.writer.add_languages() args.writer.add_sources() idx = 1 mapping = { 0: [ "doculect", "doculectid", "glottocode", "concept", "glossid", "value", "phonetic", "concepticon_id", "concepticon_gloss", ] } for line in progressbar(wl, desc="load the data"): if not line[0].strip(): phonetic = True if line[0] == "'Ref#": numbers = line phonetic = False idxs = defaultdict(list) elif line[0] == "Gloss": glosses = line elif line[0] in language_lookup and not phonetic: taxon = line[0] for num, gloss, val in zip(numbers[1:], glosses[1:], line[1:]): if num.strip() and gloss.strip(): cname = concept_lookup[num[1:]][1] if val: mapping[idx] = [ language_lookup[taxon]["Name"], taxon, language_lookup[taxon]["Glottocode"], cname.english, num[1:], val, "", # check later for phonetic value cname.concepticon_id, cname.concepticon_gloss, ] idxs[taxon, gloss] += [idx] idx += 1 elif line[0] in language_lookup and phonetic: taxon = line[0] for gloss, val in zip(glosses[1:], line[1:]): if gloss.strip(): these_idx = idxs.get((taxon, gloss)) if not these_idx: pass # export to lingpy wordlist in raw folder # Wordlist(mapping).output( # "tsv", filename=self.dir.joinpath("raw", "lingpy-wordlist").as_posix() # ) # add data to cldf for idx in progressbar(range(1, len(mapping)), desc="cldfify", total=len(mapping)): vals = dict(zip(mapping[0], mapping[idx])) args.writer.add_forms_from_value( Language_ID=language_lookup[vals["doculectid"]]["ID"], Parameter_ID=concept_lookup[vals["glossid"]][0], Value=vals["value"], Source=["Castro2015"], ) # We explicitly remove the ISO code column since the languages in # this datasets do not have an ISO code. args.writer.cldf["LanguageTable"].tableSchema.columns = [ col for col in args.writer.cldf["LanguageTable"].tableSchema.columns if col.name != "ISO639P3code" ]
class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = 'dunnaslian'
    form_spec = FormSpec(
        brackets={"[": "]", "{": "}", "(": ")"},
        separators=";/,",
        missing_data=('––', '--', '-'),
        strip_inside_brackets=True,
        replacements=[("…", "")])

    def cmd_makecldf(self, args):
        args.writer.add_sources()
        args.writer.add_languages(
            id_factory=lambda l: l['ID'].replace("'", ""))
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory="Name")

        # empty lines do not play well with dicts=True, unfortunately, so we
        # do it the hard way
        header = None
        for row in self.raw_dir.read_csv(DATAFILE, delimiter="\t"):
            if row[0] == '':
                continue  # empty lines
            elif row[0] == 'WORD':
                header = row[1:]  # remove column 1 so it synchronises below
            else:
                assert header is not None, "header should not be empty here!"
                # headers look like this:
                #   WORD  animal  back  bad  belly  big
                # i.e. have an empty column for cognates.
                # data rows look like this:
                #   Ten'en_Palian  ʔay 0  kaʁɔʔ 0  gbaʔ 3  ʔɛc 0  caw 6
                #   Ten'en_Paborn  bsiŋ 6  kaʁɔʔ 0  baʔ 3  ʔec 0  ʔahaw 7
                #
                # so we grab the language in the first cell:
                lang = row.pop(0)
                # and then loop over each pair of columns (gloss, cognate) and join.
                for i in range(0, 10, 2):
                    concept = concepts.get(header[i])
                    value = row[i].strip()
                    # get cognacy
                    cogs = COGNATE_OVERRIDES.get(
                        value, [_.strip() for _ in row[i + 1].split(",")])
                    # skip empty forms
                    if len(value) == 0:
                        continue
                    lex = args.writer.add_forms_from_value(
                        Language_ID=lang.replace("'", ""),
                        Parameter_ID=concept,
                        Value=value,
                        Source="DunnKruspeBurenhult2013",
                        Loan=any([is_loan(c) for c in cogs if c]))
                    # handle cognates
                    if len(lex) == 0:
                        continue  # no lexeme, no cognate
                    for k, cog in enumerate(cogs):
                        if cog is None or len(cog) == 0 or is_loan(cog):
                            # ignore empty cognates and loan words
                            continue
                        # if we have one lexeme but multiple cognates, then add
                        # all the cognates to lex[0]. These are:
                        #   wɔŋ ʔəhɔʔ = 9, 2
                        #   kuʔ cɔʔ = 8, 4
                        #   ɲɛ̤h ko̤h rao = 5, 8
                        #   kəbɘʔ ploʔ = 0,1
                        # ... otherwise we should have one cognate for each
                        # lexeme
                        o = lex[k] if len(lex) == len(cogs) else lex[0]
                        args.writer.add_cognate(
                            lexeme=o,
                            Cognateset_ID=concept + "_" + cog,
                            Source="DunnKruspeBurenhult2013")

class Dataset(Base):
    dir = pathlib.Path(__file__).parent
    id = "wold"
    lexeme_class = WoldLexeme
    language_class = WoldLanguage
    concept_class = WoldConcept
    form_spec = FormSpec(
        separators="~,",
        first_form_only=True,
        brackets={},  # each language is different, need to do manually
        replacements=[
            (" (1)", ""),
            (" (2)", ""),
            (" (3)", ""),
            (" (4)", ""),
            (" (5)", ""),
            (" (6)", ""),
            ("(f.)", ""),
            ("(1)", ""),
            ("(2)", ""),
            ("(3)", ""),
            ("(4)", ""),
            ("(5)", ""),
            ("(6)", ""),
            ("(2", ""),
            (" ", "_"),
        ],
    )

    def cmd_makecldf(self, args):
        self._schema(args)
        args.writer.add_sources()

        # add the languages from the language file
        # NOTE: the source lists all languages, including proto-languages,
        # but the `forms` only include the first 41 in the list
        language_lookup = args.writer.add_languages(lookup_factory="WOLD_ID")

        desc_dir = self.cldf_dir / 'descriptions'
        if not desc_dir.exists():
            desc_dir.mkdir()

        numentries = {
            r["pk"]: int(r["count_words"])
            for r in self.raw_dir.joinpath("db").read_csv("vocabulary.csv", dicts=True)
        }
        db_contribs = {
            r['id']: r
            for r in self.raw_dir.joinpath('db').read_csv('contribution.csv', dicts=True)}
        for contrib in self.raw_dir.read_csv("contributions.csv", dicts=True):
            db_contrib = db_contribs[contrib['ID']]
            args.writer.objects["ContributionTable"].append(
                dict(
                    ID=contrib["ID"],
                    Name="{} vocabulary".format(contrib["Name"]),
                    Citation=format_citation(contrib, numentries[contrib["ID"]]),
                    Contributor=contrib["Contributors"],
                    Number_of_words=numentries[contrib["ID"]],
                    Language_ID=language_lookup[contrib["ID"]],
                )
            )
            desc = vocabulary_description(
                contrib['Name'],
                contrib["Contributors"],
                json.loads(db_contrib['jsondata']))
            p = desc_dir.joinpath('vocabulary_{}.md'.format(contrib['ID']))
            p.write_text(desc, encoding='utf8')

        concepticon = {
            concept.attributes['wold_id']: concept
            for concept in self.conceptlists[0].concepts.values()}
        for parameter in self.raw_dir.read_csv("parameters.csv", dicts=True):
            concept = concepticon.get(parameter['ID'])
            args.writer.add_concept(
                ID=parameter['ID'],
                Name=concept.english if concept else parameter['Name'],
                Concepticon_ID=concept.concepticon_id if concept else None,
                Concepticon_Gloss=concept.concepticon_gloss if concept else None,
                Core_list=parameter['CoreList'] == 'true',
                Semantic_field=parameter['SemanticField'],
                Semantic_category=parameter['SemanticCategory'],
                Borrowed_score=float(parameter['BorrowedScore']),
                Age_score=float(parameter['AgeScore']) if parameter['AgeScore'] else None,
                Simplicity_score=float(parameter['SimplicityScore']),
            )

        form2lexeme = {}
        wid2fid = collections.defaultdict(set)
        lexemes_rows = self.raw_dir.read_csv("forms.csv", dicts=True)
        for row in progressbar(lexemes_rows):
            # Add information not in row, so we can pass to `add_form()`
            # with a single comprehension
            row["Language_ID"] = language_lookup[row["Language_ID"]]
            row["Parameter_ID"] = row["Parameter_ID"]
            row["Value"] = row.pop("Form")
            row["Loan"] = float(row["BorrowedScore"]) > 0.6
            row["Borrowed_score"] = row["BorrowedScore"]
            row["Simplicity_score"] = row["SimplicityScore"]
            row["original_script"] = normalize_text(row["original_script"])
            row["comment_on_borrowed"] = normalize_text(row["comment_on_borrowed"])
            row.pop("Segments")
            row['Age_score'] = decimal.Decimal(row.pop('AgeScore')) if row['AgeScore'] else None
            row['Age'] = row.pop('age_label')
            row['Local_ID'] = row['ID']
            row['contact_situation'] = row['ContactSituation']
            row['Comment'] = row.pop('other_comments')
            lexemes = args.writer.add_forms_from_value(
                **{k: v for k, v in row.items() if k in self.lexeme_class.fieldnames()})
            assert len(lexemes) == 1
            form2lexeme[row['ID']] = lexemes[0]['ID']
            wid2fid[row['Word_ID']].add(lexemes[0]['ID'])

        words = {
            r['pk']: r
            for r in self.raw_dir.joinpath('db').read_csv('unit.csv', dicts=True)}
        languages = {
            r['pk']: r['name']
            for r in self.raw_dir.joinpath('db').read_csv('language.csv', dicts=True)}
        codes = {
            r['pk']: r['name']
            for r in self.raw_dir.joinpath('db').read_csv('identifier.csv', dicts=True)
            if r['type'] == 'glottolog'}
        glottocodes = {
            r['language_pk']: codes[r['identifier_pk']]
            for r in self.raw_dir.joinpath('db').read_csv('languageidentifier.csv', dicts=True)
            if r['identifier_pk'] in codes}
        wids = [w['id'] for w in words.values()]
        for wid in wid2fid:
            assert wid in wids

        count = 0
        for row in self.raw_dir.joinpath('db').read_csv('loan.csv', dicts=True):
            assert row['target_word_pk'] in words
            source_word = None
            if row['source_word_pk']:
                assert row['source_word_pk'] in words
                source_word = words[row['source_word_pk']]
            twid = words[row['target_word_pk']]['id']
            for fid in wid2fid[twid]:
                # The meaning-differentiated borrowing events.
                count += 1
                args.writer.objects['BorrowingTable'].append(dict(
                    ID=str(count),
                    Target_Form_ID=fid,
                    Comment='Source word unidentifiable'
                    if source_word['name'].lower() == 'unidentifiable' else None,
                    Source_word=None
                    if source_word['name'].lower() == 'unidentifiable'
                    else source_word['name'],
                    Source_meaning=source_word['description'] or None,
                    Source_languoid=languages[source_word['language_pk']],
                    Source_languoid_glottocode=glottocodes.get(source_word['language_pk']),
                    Source_relation=row['relation'],
                    Source_certain=row['certain'] == 't',
                ))

    def _schema(self, args):
        args.writer.cldf['FormTable'].common_props['dc:description'] = \
            "Word forms are listed as 'counterparts', i.e. as words with a specific meaning. " \
            "Thus, words with multiple meanings may appear more than once in this table."
        args.writer.cldf['FormTable', 'Comment'].common_props['dc:description'] = \
            "For more specific comments see 'comment_on_borrowed' and 'comment_on_word_form'"
        args.writer.cldf['FormTable', 'Word_ID'].valueUrl = URITemplate(
            'https://wold.clld.org/word/{Word_ID}')
        args.writer.cldf.remove_columns('FormTable', 'Cognacy')
        t = args.writer.cldf.add_component(
            "ContributionTable",
            {
                "name": "Number_of_words",
                "datatype": "integer",
                "dc:description":
                    "There would be 1814 words in each vocabulary, "
                    "corresponding to the 1814 Loanword Typology meanings, if each meaning "
                    "had exactly one counterpart, and if all the counterparts were "
                    'different words. But many ("polysemous") words are counterparts of '
                    "several meanings, many meanings have several word counterparts "
                    '("synonyms", or "subcounterparts"), and many meanings have no '
                    "counterparts at all, so the number of words in each database varies "
                    "considerably.",
            },
            {
                "name": "Language_ID",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#languageReference",
                "dc:description":
                    "References the language for which this contribution provides "
                    "a vocabulary.",
            },
        )
        t.common_props['dc:description'] = \
            "WOLD contributions are vocabularies (mini-dictionaries of about 1000-2000 entries) " \
            "with comprehensive information about the loanword status of each word. " \
            "Descriptions of how these vocabularies coded the data can be found in the " \
            "[descriptions](descriptions/) directory."
        args.writer.cldf['ContributionTable', 'description'].valueUrl = URITemplate(
            './descriptions/vocabulary_{ID}.md')
        args.writer.cldf['ContributionTable', 'description'].common_props['dc:format'] = \
            'text/markdown'
        args.writer.cldf['ContributionTable', 'id'].common_props["dc:description"] = \
            "The vocabulary ID number corresponds to the ordering of the chapters in the book " \
            "Loanwords in the World's Languages. Languages are listed in rough geographical " \
            "order from west to east, from Africa via Europe to Asia and the Americas, so that " \
            "geographically adjacent languages are next to each other."
        args.writer.cldf['ContributionTable', 'citation'].common_props["dc:description"] = \
            "Each vocabulary of WOLD is a separate electronic publication with a separate " \
            "author or team of authors and should be cited as specified here."
        args.writer.cldf['ContributionTable', 'contributor'].common_props["dc:description"] = \
            "The authors are experts of the language and its history. They also contributed a " \
            "prose chapter on the borrowing situation in their language that was published in " \
            "the book Loanwords in the World's Languages."
        t.add_foreign_key("Language_ID", "languages.csv", "ID")

        t = args.writer.cldf.add_component(
            'BorrowingTable',
            {
                'name': 'Source_relation',
                'datatype': {'base': 'string', 'format': "immediate|earlier"},
                'dc:description':
                    "Whether a word was contributed directly (immediate) or indirectly "
                    "(earlier), i.e. via another, intermediate donor languoid, to the "
                    "recipient language.",
            },
            'Source_word',
            'Source_meaning',
            {
                'name': 'Source_certain',
                'datatype': {'base': 'boolean', 'format': "yes|no"},
                'dc:description': "Certainty of the source identification",
            },
            {
                'name': 'Source_languoid',
                'dc:description':
                    'Donor languoid, specified as name of a language or language '
                    'subgroup or family',
            },
            {
                'name': 'Source_languoid_glottocode',
                'dc:description': 'Glottocode of the source languoid',
                'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#glottocode',
            },
        )
        t.common_props['dc:description'] = \
            'While a lot of information about the borrowing status is attached to the borrowed ' \
            'forms, the BorrowingTable lists information about (potential) source words. Note ' \
            'that we list loan events per meaning; i.e. one loanword may result in multiple ' \
            'borrowings if the word has multiple meanings.'

class Dataset(BaseDataset): id = "transnewguineaorg" dir = Path(__file__).parent @staticmethod def get_slug_from_uri(uri): return [_ for _ in uri.split("/") if _][-1] form_spec = FormSpec( brackets={ "(": ")", "[": "]" }, separators=";/,|<", missing_data=("?", "-", "*", "---", "-BB:SRP", '*-', '*'), strip_inside_brackets=True, replacements=[ (" ", "_"), ('_+_modif.', ''), ('_+_verb', ''), ('_+_PL', ''), ('_+_mdf', ''), ('_+_mod', ''), ("_+_'make", ''), ("ɬ ̥", "ɬ̥"), ("l ̥", "l̥"), ('"', "'"), (" ?", ""), ("91)", ""), ("') :", ""), ("a ͥ", "aj"), ("<<̋>>"[2:-2], ""), (" ̟", ""), ], ) def cmd_makecldf(self, args): languages = { o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "languages.json") } words = { o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "words.json") } sources = { o["slug"]: o for o in self.raw_dir.read_json(self.raw_dir / "sources.json") } # handle sources # want to make sure that the bibtex key matches our source id. for source in sorted(sources): # this is ugly, I wish pybtex made this easier! bib = parse_string(sources[source]["bibtex"], "bibtex") old_key = list(bib.entries.keys())[0] bib.entries[old_key].key = source bib.entries = OrderedCaseInsensitiveDict([(source, bib.entries[old_key])]) args.writer.add_sources(bib) # handle languages for lang in sorted(languages): args.writer.add_language( ID=lang, Name=languages[lang]["fullname"], ISO639P3code=languages[lang]["isocode"], Glottocode=languages[lang]["glottocode"], ) # handle concepts concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = '{0}_{1}'.format(concept.number, slug(concept.english)) args.writer.add_concept( ID=idx, Name=concept.english, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss) concepts[concept.english] = idx concepts[concept.english.replace(" ", "-")] = idx concepts[concept.english.replace(" ", "-").lower()] = idx concepts[slug(concept.english)] = idx concepts["-".join([slug(x) for x in concept.english.split()])] = idx if '(' in concept.english: new_string = concept.english[:concept.english.index('(') - 1] concepts["-".join([slug(x) for x in new_string.split()])] = idx concepts[concept.english[:concept.english.index('(') - 1]] = idx concepts[concept.english[:concept.english.index('(') - 1].replace(' ', '-').lower()] = idx if concept.english.startswith("to "): new_string = concept.english[3:] concepts['-'.join([slug(x) for x in new_string.split()])] = idx concepts[concept.english.replace("to ", "")] = idx concepts["mans-mother-law"] = concepts["man's mother in law"] concepts["brother-law"] = concepts["brother in law"] concepts["to-make-hole"] = concepts["make hole (in ground)"] concepts["front"] = concepts["in front"] concepts["husk-nut"] = concepts["husk (of nut)"] concepts["his"] = concepts["his, hers, its (pronoun p:3s)"] concepts["we-two-incl"] = concepts[ "we incl. 
dual (pronoun d:1p, incl, dual)"] concepts["intrnasitivizer"] = concepts["intransitivizer"] concepts["short-piece-wood"] = concepts["short-piece-of-wood"] concepts["top-foot"] = concepts["top (of foot)"] concepts["sit-feet-and-legs-together"] = concepts[ "sit (with feet and legs together)"] concepts["earth"] = concepts["earth/soil"] concepts["warm"] = concepts["warm/hot"] concepts["your-sg"] = concepts["your (pronoun: p:2s)"] concepts["-law"] = concepts["in-law"] concepts["to-roast"] = concepts["roast"] concepts["arrow-barred"] = concepts[ "arrow (barred) (Arrow with cross bar)"] concepts["them-dual"] = concepts["them (pronoun o:3p, dual)"] concepts["you-dual"] = concepts["you (pronoun d:2s)"] concepts["right-correct"] = concepts["right (correct, true)"] concepts["betelpepper"] = concepts["betelpepper vine"] concepts["to-chop"] = concepts["to chop, cut down"] concepts["road"] = concepts["road/path"] concepts["for-benefactive-clitic"] = concepts[ "for (benefactive) ((cliticised or suffixed to noun))"] concepts["mans-father-law"] = concepts["mans' father in law"] concepts["sister-law"] = concepts["sister in law"] concepts["you-o2s"] = concepts["you (pronoun o:2s)"] concepts["you-pl-o2p"] = concepts["you pl. (pronoun o:2p)"] concepts["we-pl-incl"] = concepts["we incl. (pronoun d:1p, incl)"] concepts["in"] = concepts["in, inside"] concepts["not_know"] = concepts["not know"] concepts["their-dual"] = concepts["their (pronoun p:3p, dual)"] concepts["blow-fire"] = concepts["blow (on fire)"] concepts["blunt-eg-knife"] = concepts["blunt (of e.g. knife)"] concepts["our-dual"] = concepts["our (two) (pronoun p:1p, dual)"] concepts["your-pl-dual"] = concepts[ "your (two) pl (pronoun p:2p, dual)"] concepts["suck-breast"] = concepts["to suck at breast"] concepts["draw-water-carry"] = concepts["draw water / carry"] concepts["tree-sp-Gnetum-gnemon"] = concepts[ "tree sp. (Gnetum gnemon)"] concepts["he-she"] = concepts["he, she, it, that, those"] concepts["fed"] = concepts["fed up (with)"] concepts["you-pl-dual-o2p"] = concepts[ "you plural two (pronoun d:2p, dual)"] concepts["you-pl-dual"] = concepts["you two (pronoun d:2s, dual)"] concepts["to-put"] = concepts["to put, give"] concepts["he-she-it-those"] = concepts["he, she, it, that, those"] concepts["we-two-excl"] = concepts[ "we excl. dual (pronoun d:1p, excl, dual)"] concepts["we-pl-excl"] = concepts[ "we excl. 
plural (pronoun d:1p, excl, plural)"] #concepts["affix-body-part"] = concepts[""] itemfiles = [ f for f in self.raw_dir.iterdir() if f.name.startswith("language-") ] errors = set() for filename in progressbar(sorted(itemfiles), desc="adding lexemes"): for o in sorted(self.raw_dir.read_json(filename), key=lambda d: d["id"]): wordid = self.get_slug_from_uri(o['word']) if wordid in concepts: args.writer.add_forms_from_value( Local_ID=o["id"], Language_ID=self.get_slug_from_uri(o["language"]), Parameter_ID=concepts[wordid], Value=o["entry"], Source=self.get_slug_from_uri(o["source"]), Comment=o["annotation"], ) else: errors.add(("concept", wordid)) for error in errors: args.log.info("error with {0[0]}: {0[1]}".format(error)) args.log.info("found {0} errors in concepts".format(len(errors))) def get_all(self, url): """Helper function to iterate across the API's _next_ commands for a given URL""" while True: j = get_url(url).json() yield j["objects"] if not j["meta"]["next"]: break url = BASE_URL + j["meta"]["next"] def cmd_download(self, args): if not self.raw_dir.exists(): self.raw_dir.mkdir() for fname in self.raw_dir.iterdir(): remove(fname) # sources sources = [] for j in self.get_all(SOURCES_URL % {"limit": LIMIT}): sources.extend(j) jsondump(sources, self.raw_dir / "sources.json", args.log) # languages languages = [] for j in self.get_all(LANGUAGES_URL % {"limit": LIMIT}): languages.extend(j) jsondump(languages, self.raw_dir / "languages.json", args.log) # words words = [] for j in self.get_all(WORDS_URL % {"limit": LIMIT}): words.extend(j) jsondump(words, self.raw_dir / "words.json", args.log) # items for language in languages: items = [] for j in self.get_all(RECORDS_URL % { "limit": LIMIT, "language": language["id"] }): items.extend(j) jsondump( items, self.raw_dir / ("language-%d.json" % language["id"]), args.log, ) # version information with open(self.raw_dir / "version.txt", "w") as handle: handle.write(str(datetime.now()))
class Dataset(BaseDataset): dir = pathlib.Path(__file__).parent id = "huntergatherer" lexeme_class = HGLexeme concept_class = HGConcept form_spec = FormSpec(missing_data=("?", "[missing]", "missing", "#NAME?", "X", "[absent]", "-", "--", "...")) def _get(self, path, log): with self.raw_dir.temp_download(self.metadata.url + path, ".html", log) as fname: return BeautifulSoup(fname.read_text(encoding="utf8"), "html.parser") def cmd_download(self, args): for a in self._get("/languages", args.log).find_all("a", href=True): if a["href"].startswith("/languages/language/"): parse(self._get(a["href"], args.log), a["href"].split("/")[-1], self.raw_dir) @staticmethod def get_tokenizer(): return lambda x, y: ipa2tokens(y, merge_vowels=False) def cmd_makecldf(self, args): concepts = args.writer.add_concepts( id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english), lookup_factory="Database_ID", ) language_map = { lang["ID"]: lang["Glottocode"] or None for lang in self.languages } sources = {} for path in sorted(self.raw_dir.glob("*.json"), key=lambda _p: int(_p.stem)): data = jsonlib.load(path) iso = data.get("ISO 639-3") if iso: iso = iso.strip() args.writer.add_language( ID=data["id"], Name=data["name"], ISO639P3code=iso if iso not in {"no", "XXX"} else None, Glottocode=language_map[data["id"]], ) for table in ["basic", "flora", "cult"]: if table not in data["tables"]: continue for item in data["tables"][table]["rows"]: item = dict(zip(data["tables"][table]["header"], item)) form = item["Orthographic Form"].strip() if form: refs = [ ref for ref in itersources(item, data, sources) if ref ] args.writer.add_sources(*[ref.source for ref in refs]) href, _ = item["English"] concept_database_id = href.split("/")[-1] if not concepts.get(concept_database_id): # https://huntergatherer.la.utexas.edu/lexical/feature/729 # is missing from the concept list(s) continue args.writer.add_lexemes( Language_ID=data["id"], Parameter_ID=concepts[concept_database_id], Value=form, Loan=bool(item["Loan Source"] or item["Wanderwort Status"]), Phonemic=item["Phonemicized Form"] or None, Source=["%s" % ref for ref in refs], Creator=item.get("Created By"), Comment=item.get("General Notes"), )
class Dataset(BaseDataset): dir = Path(__file__).parent id = "gaotb" language_class = CustomLanguage concept_class = CustomConcept form_spec = FormSpec( missing_data=("---",), separators="/;", replacements=[ (" ", "_"), ('\u0306', ''), ('\u0329', ''), ('\u0303', ''), ('\u0325', ''), ('\u0335', ''), ('\u0331', '')], first_form_only=True, ) def cmd_makecldf(self, args): """ Convert the raw data to a CLDF dataset. """ concepts = {} for concept in self.conceptlists[0].concepts.values(): cid = '{0}_{1}'.format(concept.number, slug(concept.english)) args.writer.add_concept( ID=cid, Name=concept.english, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, Number=concept.number ) concepts[concept.number] = cid args.log.info('[i] added concepts') languages = args.writer.add_languages(lookup_factory="Number") args.log.info('[i] added languages') args.writer.add_sources() missingL, missingC = set(), set() missingCog = set() cogids = {} for row in progressbar( self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True)): lid = languages.get(row['LANGUAGE']) cid = concepts.get(row['SID']) # take only the first cognate ID if there are several cog = row['COGNATE'].split('|')[0] if lid and cid and row["FORM"] and row["FORM"].strip(): lexemes = args.writer.add_forms_from_value( Language_ID=lid, Parameter_ID=cid, Value=row["FORM"], Source='Sun1991' ) if cog.strip(): cogid = cid+'-'+cog args.writer.add_cognate( lexeme=lexemes[0], Cognateset_ID=cogid, Cognate_Detection_Method='expert', Source='Gao2020' ) else: missingCog.add(cogid) if not lid: missingL.add(lid) if not cid: missingC.add(cid) for entry in missingL: print('missing L {0}'.format(entry)) for entry in missingC: print('missing C {0}'.format(entry)) for entry in missingCog: print('missing Cognate {0}'.format(entry))
class Dataset(BaseDataset): dir = Path(__file__).parent id = "pharaocoracholaztecan" concept_class = CustomConcept form_spec = FormSpec( separators="/", first_form_only=False, brackets={"’": "’", "(": ")"}, replacements=[("*", ""), (" ", "_")], ) def cmd_makecldf(self, args): # parse the data from the word document table = [[""]] # we except 9 columns with open(self.raw_dir.joinpath("data.txt").as_posix()) as f: previous = [] for i, line in enumerate(f): rows = [c.strip() for c in line.split("\t")] if rows[0].replace(".", "").isdigit(): table += [rows] else: table[-1][-1] += "/" + rows[0] table[-1] += rows[1:] # load cognates cognates = self.raw_dir.read_csv("cognates.tsv", delimiter="\t")[1:] concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = "{0}-{1}".format(concept.number, slug(concept.english)) args.writer.add_concept( ID=idx, Name=concept.english, Spanish_Gloss=concept.attributes["spanish"], Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) for gloss in concept.attributes["lexibank_gloss"]: concepts[gloss] = idx concepts["Frio/(hace frio)"] = concepts["Frio (hace frio)"] args.log.info("added concepts") args.writer.add_sources() cognacy, counter = {}, 1 cogsets = { "A(B)": ["A"], "A/(B)": ["A"], "A/B": ["A", "B"], "A/B/C": ["A", "B", "C"], "A/B/D": ["A", "B", "D"], "A/B?": ["A"], "A/C": ["A", "C"], "B/(A)": ["A"], "B/(a)": ["B"], "B/C": ["B", "C"], "C D": ["C", "D"], "C/(B)": ["C"], "C/B": ["C", "B"], "C/E": ["C", "E"], "D/B": ["D", "B"], "a/(B)": ["A"], "a/A": ["A", "A"], "a/B": ["A", "B"], "ab": ["A", "B"], } languages = args.writer.add_languages(lookup_factory="Name") for i, line in progressbar(enumerate(table[1:])): for j, (language, cell) in enumerate(zip(table[0][2:], line[2:])): if cell.strip(): cognatesets = cogsets.get( cognates[i][j + 1].strip(), [cognates[i][j + 1].strip().upper()] ) for lexeme, cognate in zip( args.writer.add_forms_from_value( Value=cell, Language_ID=languages[language], Parameter_ID=concepts[line[1]], Source=["Pharao2020"], ), cognatesets, ): if cognate in ["?", "-"]: cid = counter counter += 1 else: cid = "{0}-{1}".format(i, cognate) if cid in cognacy: cid = cognacy[cid] else: cognacy[cid] = counter cid = cognacy[cid] counter += 1 if languages[language] == "ProtoUtoAztecan" and "SUA" in cell.strip(): lexeme["Language_ID"] = languages["SUA"] args.writer.add_cognate(lexeme, Cognateset_ID=cid, Source=["Pharao2020"])
class Dataset(BaseDataset): id = "tppsr" dir = Path(__file__).parent concept_class = CustomConcept language_class = CustomLanguage lexeme_class = CustomLexeme form_spec = FormSpec(first_form_only=True, missing_data=("#NAME?", ), replacements=[("- - ", "-"), (" - ", "-"), ("- ", "-"), (" -", "")]) def cmd_makecldf(self, args): args.writer.add_sources() # We can link forms to scans of the page in the source where they appear: args.writer.cldf["FormTable", "Scan"].valueUrl = URITemplate( 'https://cdstar.shh.mpg.de/bitstreams/{Objid}/gauchat_et_al_1925_tppsr_{Scan}.png' ) for c in ['Population', 'SpeakerAge']: args.writer.cldf['LanguageTable', c].datatype.base = 'integer' args.writer.cldf['LanguageTable', c].datatype.minimum = 0 values = self.raw_dir.read_csv('tppsr-db-v20.txt', delimiter='\t') forms = self.raw_dir.read_csv('tppsr-db-v20-ipa-narrow.txt', delimiter='\t') concepts = {} for concept in self.conceptlists[0].concepts.values(): idx = '{0}_{1}'.format(concept.id, slug(concept.attributes['french'])) args.writer.add_concept( ID=idx, Number=concept.number, Name=concept.english, French_Gloss=concept.attributes['french'], Latin_Gloss=concept.attributes['latin'], Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss) concepts[concept.number] = (idx, concept.attributes['page'], concept.attributes['french']) languages = args.writer.add_languages(lookup_factory='Number') def scan_number(bitstreams): p = re.compile(r'tppsr_(?P<number>[0-9]{4})\.png') for bs in bitstreams: m = p.search(bs['bitstreamid']) if m: return m.group('number') scans = { scan_number(o['bitstreams']): objid for objid, o in self.raw_dir.read_json('tppsr_scans.json').items() } phrase_data = collections.defaultdict(dict) for row1, row2 in progressbar(zip(values, forms), desc='cldfify'): entry = row1[2] for s, t in [('\u0320', '')]: entry = entry.replace(s, t) tokens = self.tokenizer({}, entry.strip().replace(' ', '_'), column='IPA') # Compute scan number from concept number and language number. 
            page = int(concepts[row1[0]][1]) + int(int(row1[1]) > 31)
            scan = str(page + 18).rjust(4, '0')
            if row1[2].replace('_', '').replace('-', '').strip():
                phrase_data[row1[1]][row1[0]] = (row2[2], row1[2])
                args.writer.add_form_with_segments(
                    Value=row1[2],
                    Form=''.join(tokens),
                    Segments=tokens,
                    Profile=' '.join(
                        self.tokenizer({}, entry.strip(), column='Grapheme')),
                    Source=['Gauchat1925[{0}]'.format(page)],
                    Language_ID=languages[row1[1]],
                    Parameter_ID=concepts[row1[0]][0],
                    Scan=scan,
                    Objid=scans[scan],
                    ProsodicStructure=prosodic_string(tokens, _output='CcV'),
                    SegmentedValue=' '.join(
                        self.tokenizer({}, entry, column='Graphemes')))

        args.writer.cldf.add_component(
            'ExampleTable',
            'Alt_Transcription',
            {
                "name": "Concept_ID",
                "separator": " ",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#parameterReference",
            },
            {
                "name": "Form_ID",
                "separator": " ",
                "propertyUrl": "http://cldf.clld.org/v1.0/terms.rdf#formReference",
            },
        )
        args.writer.cldf.add_foreign_key('ExampleTable', 'Concept_ID', 'ParameterTable', 'ID')
        args.writer.cldf.add_foreign_key('ExampleTable', 'Form_ID', 'FormTable', 'ID')

        for phrase in self.etc_dir.read_csv('phrases.csv', dicts=True):
            for lid, data in sorted(phrase_data.items(), key=lambda i: i[0]):
                lid = languages[lid]
                cids = phrase['Concepts'].split()
                try:
                    args.writer.objects['ExampleTable'].append(
                        dict(
                            ID='{}-{}'.format(phrase['ID'], lid),
                            Language_ID=lid,
                            Primary_Text=' '.join([data[cid][0] for cid in cids]),
                            Translated_Text=' '.join([concepts[cid][2] for cid in cids]),
                            Alt_Transcription=' '.join([data[cid][1] for cid in cids]),
                            Concept_ID=[concepts[cid][0] for cid in cids],
                            Form_ID=[
                                '{}-{}-1'.format(lid, concepts[cid][0])
                                for cid in cids],
                        ))
                except KeyError:
                    pass

class Dataset(BaseDataset): dir = Path(__file__).parent id = "peirosaustroasiatic" language_class = CustomLanguage lexeme_class = CustomLexeme cross_concept_cognates = True form_spec = FormSpec(separators=("/", ","), strip_inside_brackets=True, brackets={ "[": "]", "(": ")", "<": ">" }) def cmd_makecldf(self, args): # add sources args.writer.add_sources() # add concepts concepts = args.writer.add_concepts( id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english), lookup_factory="Name") # fix concept concepts["fat (n.)"] = concepts["fat n."] concepts["burn (tr.)"] = concepts["burn tr."] concepts["to fly"] = concepts["fly v."] concepts["lie (down)"] = concepts["lie"] concepts["walk (go)"] = concepts["walk(go)"] args.log.info("added concepts") # add languages languages = {} for language in self.languages: args.writer.add_language(**language) languages[language["Name"]] = language["ID"] args.log.info("added languages") # add data for row_ in progressbar( self.raw_dir.read_csv("Peiros2004-data by etymology.txt", delimiter="\t")): if "".join(row_).strip(): row = dict( zip(["CONCEPT", "SUBGROUP", "LANGUAGE", "FORM", "COGNACY"], row_)) bsource = "" if row["COGNACY"].isdigit(): cogid = int(row["COGNACY"]) elif row["COGNACY"].startswith("<"): bsource = row["COGNACY"].split(" ")[1] cogid = 0 else: cogid = 0 for lexeme in args.writer.add_forms_from_value( Parameter_ID=concepts[re.sub("'", "", row["CONCEPT"])], Language_ID=languages[row["LANGUAGE"].strip()], Value=row["FORM"].strip(), Source=["Peiros2004a"], LoanSource=bsource, Loan=True if bsource else False, ): args.writer.add_cognate(lexeme, Cognateset_ID=cogid, Source=["Peiros2004a"])
class Dataset(BaseDataset): id = "davletshinaztecan" dir = Path(__file__).parent concept_class = CustomConcept language_class = CustomLanguage form_spec = FormSpec( missing_data=["*", "---", "-"], separators=";/,~", strip_inside_brackets=True, replacements=[(" ", "_")], brackets={"(": ")"}, first_form_only=True, ) def cmd_makecldf(self, args): # Add bibliographic sources and collect them args.writer.add_sources() sources, languages = {}, {} for language in self.languages: sources[language["NameInData"]] = language["Source"] languages[language["NameInData"]] = language["ID"] args.writer.add_language(**language) # Add concepts and collecte them concepts, proto = {}, {} for concept in self.conceptlists[0].concepts.values(): idx = "{0}_{1}".format(concept.number, slug(concept.english)) args.writer.add_concept( ID=idx, Name=concept.english, ProtoAztecan=concept.attributes["proto_aztecan"], Number=concept.number, Concepticon_ID=concept.concepticon_id, Concepticon_Gloss=concept.concepticon_gloss, ) concepts[concept.number] = idx proto[concept.number] = concept.attributes['proto_aztecan'] cogidx = 0 with open(self.raw_dir.joinpath("data.txt").as_posix()) as f: for line in progressbar(f, desc="cldfify"): number, concept = line.split(" :: ")[0].split(". ") entries = re.split(r"(\(-*[0-9]\))[,\.]*", line.split(" :: ")[1]) cogids, count, borrowing = [], 0, False for i in range(0, len(entries) - 1, 2): entry = entries[i].strip() cogid = int(entries[i + 1][1:-1]) if cogid < 0: borrowing = True cogid = len(entries) + count count += 1 language = entry.split(" ")[0] value = " ".join(entry.split(" ")[1:]) for lex in args.writer.add_forms_from_value( Language_ID=languages[language], Parameter_ID=concepts[number], Value=value, Source=[sources[language]], Loan=borrowing, ): args.writer.add_cognate( lexeme=lex, Cognateset_ID=cogid + cogidx, Source="Davletshin2012", ) cogids += [cogid] # add proto-aztecan form if proto[number].strip() != "?": for lex in args.writer.add_forms_from_value( Language_ID=languages["PA"], Parameter_ID=concepts[number], Value=proto[number], Source=sources["PA"], ): args.writer.add_cognate( lexeme=lex, Cognateset_ID=sorted( cogids, key=lambda x: cogids.count(x), reverse=True, )[0] + cogidx, ) cogids += [cogid] cogidx += max(cogids)
class Dataset(BaseDataset): dir = Path(__file__).parent id = "holmie" language_class = CustomLanguage concept_class = CustomConcept form_spec = FormSpec( missing_data=("-", ), separators="/,;", replacements=[(" ", "_")], strip_inside_brackets=False, first_form_only=True, brackets={}, ) def cmd_makecldf(self, args): """ Convert the raw data to a CLDF dataset. """ concepts, wl_concepts = {}, {} visited = set() for concept in self.concepts: cid = '{0}_{1}'.format(concept['NUMBER'], slug(concept['ENGLISH'])) if cid in visited: pass else: visited.add(cid) args.writer.add_concept( ID=cid, Name=concept['ENGLISH'], Glosses_in_Source=concept['GLOSSES_IN_SOURCE'], Concepticon_ID=concept['CONCEPTICON_ID'], Concepticon_Gloss=concept['CONCEPTICON_GLOSS']) for gloss in concept['GLOSSES_IN_SOURCE'].split(' // '): concepts[gloss] = cid wl_concepts[gloss] = concept['ENGLISH'] languages = args.writer.add_languages(lookup_factory="Name_in_Source") args.writer.add_sources() # make a wordlist for edictor to inspect the data D = {0: ['doculect', 'concept', 'ipa', 'cogid']} idx = 1 for i, row in progressbar( enumerate( self.raw_dir.read_csv('data.tsv', delimiter='\t', dicts=True))): for language, lid in languages.items(): form = row[language].strip() if form: lexemes = args.writer.add_forms_from_value( Language_ID=lid, Parameter_ID=concepts[row['Meaning']], Value=form, Source='Holm2017') if lexemes: args.writer.add_cognate( lexeme=lexemes[0], Cognateset_ID=str(i + 1), Cognate_Detection_Method='expert', Source='Holm2017') D[idx] = [ language, wl_concepts[row['Meaning']], form, i + 1 ] idx += 1 Wordlist(D).output( 'tsv', filename=self.raw_dir.joinpath('wordlist').as_posix())
class Dataset(BaseDataset):
    """
    Defines the dataset for Lieberherr and Bodt (2017).
    """
    id = "lieberherrkhobwa"
    dir = Path(__file__).parent
    language_class = KBLanguage
    form_spec = FormSpec(separators="~/,;ткд", missing_data=("NA",))

    def cmd_download(self, **kw):
        """
        Download the raw zipped data and extract it.
        """
        zip_url = (
            "https://zenodo.org/api/files/5469d550-938a-4dae-b6d9-50e427f193b3/"
            "metroxylon/subgrouping-kho-bwa-v1.0.0.zip")
        self.raw_dir.download(zip_url, "kho-bwa-v1.0.0.zip")

    def cmd_makecldf(self, args):
        # Add bibliographic sources
        args.writer.add_sources()

        # Read raw concept data and add to dataset; at the same time,
        # build a map between the concept index as used in the data and the
        # concept id in the dataset
        concept_lookup = {}
        for cidx, concept in enumerate(self.conceptlists[0].concepts.values()):
            concept_cldf_id = (
                concept.id.split("-")[-1] + "_" + slug(concept.english))
            concept_lookup[1 + (cidx * 2)] = concept_cldf_id

            # Add the concept
            args.writer.add_concept(
                ID=concept_cldf_id,
                Name=concept.english,
                Concepticon_ID=concept.concepticon_id,
                Concepticon_Gloss=concept.concepticon_gloss,
            )

        # Add languages and make a map for individual sources
        language_lookup = args.writer.add_languages(lookup_factory="Source_Name")
        source_lookup = {
            entry["Source_Name"]: entry["Source"] for entry in self.languages
        }

        # Read raw data and remove headers and rows with reconstructions
        # (row[0] not in languages)
        data = self.raw_dir.read_csv("dataset_khobwa.csv")
        data = data[2:]
        data = [row for row in data if row[0] in language_lookup]

        # iterate over the source, adding lexemes and collecting cognates
        for row in progressbar(data, desc="makecldf"):
            for cid in range(1, len(row), 2):
                # Skip over rows with empty fields for cogid
                if not row[cid + 1]:
                    continue

                # Compute a cognate_id number; lingpy now requires
                # this to be an integer
                cognate_id = cid * 100 + int(row[cid + 1])

                # Extract the value from the raw data, skipping over
                # missing or non-existing forms. We need to strip here,
                # as there are entries with newlines and FormSpec, as the
                # name implies, does not apply to values.
                value = row[cid].strip()
                for lex in args.writer.add_lexemes(
                        Language_ID=language_lookup[row[0]],
                        Parameter_ID=concept_lookup[cid],
                        Value=value,
                        Cognacy=cognate_id,
                        Source=source_lookup[row[0]],
                ):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cognate_id,
                        Source="Lieberherr2017",
                    )