def cmd_makecldf(self, args):
    """Convert the raw wordlist into a CLDF dataset.

    Reads the LingPy wordlist from ``raw/``, registers sources, languages
    and concepts, then writes one form per row that has segmented tokens.
    """
    wl = lingpy.Wordlist(str(self.raw_dir / 'D_subset-300-22.tsv'))
    args.writer.add_sources()
    args.writer.add_languages(id_factory='Name')
    # Per-language bibliographic source keys from the language metadata.
    source_lookup = {
        language['ID']: language['Source'] for language in self.languages
    }
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        # Concept IDs follow the "<number>_<slugged-gloss>" convention.
        idx = concept.number + '_' + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss)
        concepts[concept.english] = idx
    for k in pb(wl, desc='wl-to-cldf', total=len(wl)):
        # Rows without segmented tokens are skipped entirely.
        if wl[k, 'tokens']:
            args.writer.add_form(
                Language_ID=wl[k, 'doculect'],
                Parameter_ID=concepts[wl[k, 'concept']],
                # Fall back to the joined tokens when the IPA field is empty.
                Value=wl[k, 'ipa'].strip() or ''.join(wl[k, 'tokens']),
                Form=wl[k, 'ipa'].strip().replace(' ', '_') or ''.join(wl[k, 'tokens']),
                Source=[source_lookup[wl[k, 'doculect']]],
                Comment=wl[k, 'note'])
def cmd_makecldf(self, args):
    """Convert the raw Bahnaric wordlist into a CLDF dataset.

    Registers concepts from the first concept list, adds languages and
    sources, then writes one form plus an expert cognate judgement per
    wordlist row whose concept is known. Unknown concepts are logged once.
    """
    concepts = {}
    wl = lp.Wordlist(
        self.raw_dir.joinpath('D_test_Bahnaric-200-24.tsv').as_posix())
    for concept in self.conceptlists[0].concepts.values():
        # Concept IDs follow the "<number>_<slugged-gloss>" convention.
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.english] = idx
    languages = args.writer.add_languages(
        lookup_factory="Name", id_factory=lambda x: slug(x['Name']))
    args.writer.add_sources()
    visited = set()
    for idx, concept in wl.iter_rows('concept'):
        if wl[idx, 'concept'] in concepts:
            lexeme = args.writer.add_form(
                Language_ID=languages[wl[idx, 'language']],
                Parameter_ID=concepts[wl[idx, 'concept']],
                Value=wl[idx, 'ipa'],
                Form='.'.join(wl[idx, 'tokens']),
                Source='Sidwell2015',
                # Negative cognate IDs flag borrowings.
                Loan=wl[idx, 'cogid'] < 0)
            args.writer.add_cognate(
                lexeme=lexeme,
                Cognateset_ID=wl[idx, 'cogid'],
                Cognate_Detection_Method='expert',
                Source=['Sidwell2015'])
        elif concept not in visited:
            # Report each unmatched concept once via the logger (was a
            # bare print(); sibling datasets use args.log.warning).
            visited.add(concept)
            args.log.warning(concept)
def cmd_makecldf(self, args):
    """
    Convert the raw data to a CLDF dataset.

    Concepts are looked up by English gloss, with every alternative
    "lexibank_gloss" mapped onto the same concept ID; languages are
    looked up by their STEDT name.
    """
    wl = lingpy.Wordlist(self.raw_dir.joinpath("GEM-CNL.csv").as_posix())
    concepts = args.writer.add_concepts(
        id_factory=lambda x: x.id.split("-")[-1] + "_" + slug(x.english),
        lookup_factory="Name"
    )
    # Register alternative glosses as aliases for the same concept ID.
    for concept in self.conceptlists[0].concepts.values():
        for cis in concept.attributes["lexibank_gloss"]:
            if cis not in concepts:
                concepts[cis] = concepts[concept.english]
    languages = args.writer.add_languages(lookup_factory="STEDT_Name")
    args.writer.add_sources()
    for idx, language, concept, value, pos in wl.iter_rows(
            "doculect", "concept", "reflex", "gfn"
    ):
        # Fix for 251479
        if concept == "top (i.e. highest point":
            concept = "top (i.e. highest point)"
        if concept not in concepts:
            # Unmatched concepts are logged, not written.
            args.log.warning(concept)
        else:
            args.writer.add_forms_from_value(
                Language_ID=languages[language],
                Parameter_ID=concepts[concept],
                Value=value,
                Source=["Marrison1967"],
            )
def cmd_makecldf(self, args):
    """Convert the raw IDS wordlist into a CLDF dataset.

    Concepts are keyed by a normalized IDS identifier; every form also
    gets an expert cognate judgement.
    """
    concepts = {}
    wl = lp.Wordlist(self.raw_dir.joinpath('IDS.csv').as_posix())
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        # Normalize the IDS id so it matches the wordlist's "ids_id"
        # column: dashes become dots, '0' stripped from both ends.
        concepts[concept.attributes['ids_id'].replace(
            '-', '.').strip('0')] = idx
    languages = args.writer.add_languages(
        lookup_factory="Name", id_factory=lambda x: slug(x['Name']))
    args.writer.add_sources()
    for idx in wl:
        lexeme = args.writer.add_form(
            Language_ID=languages[wl[idx, 'language']],
            Parameter_ID=concepts[wl[idx, 'ids_id']],
            Value=wl[idx, 'ortho'],
            # '#' marks boundaries in the raw IPA; render them as '-'.
            Form=wl[idx, 'ipa'].replace('#', '-'),
            Source='List2014c',
            # Negative cognate IDs flag borrowings.
            Loan=True if wl[idx, 'cogid'] < 0 else False)
        args.writer.add_cognate(lexeme=lexeme,
                                Cognateset_ID=wl[idx, 'cogid'],
                                Cognate_Detection_Method='expert',
                                Source=['List2014c'])
def cldf(dataset, concepticon, **kw):
    """Build one CLDF (sub)dataset per raw wordlist (legacy API).

    Each file in DSETS is paired position-wise with its EvoBib source key
    in SOURCES; cognate rows are collected per subset and aligned with
    LingPy afterwards.
    """
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)
        with CldfDataset(
                ('ID', 'Language_ID', 'Language_name', 'Language_iso',
                 'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                 'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                dataset, subset=dset.split('-')[0]) as ds:
            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']])
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                # Cognate-set IDs are concept-local: "<slug>-<cogid>".
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'],
                    idf, '', 'expert', srckey, '', '', ''
                ]]
            dataset.cognates.extend(
                iter_alignments(wl, cognates, method='progressive',
                                prefix=srckey + '-'))
def cmd_makecldf(self, args):
    """Convert the raw Yi wordlist into a CLDF dataset."""
    wordlist = lingpy.Wordlist(self.raw_dir.joinpath("yi-wl.tsv").as_posix())
    args.writer.add_sources()
    language_map = args.writer.add_languages(lookup_factory="Name")

    # Register concepts and remember their IDs, keyed by English gloss.
    concept_map = {}
    for cpt in self.conceptlists[0].concepts.values():
        cid = "{0}_{1}".format(cpt.id.split("-")[-1], slug(cpt.english))
        args.writer.add_concept(
            ID=cid,
            Name=cpt.english,
            Concepticon_ID=cpt.concepticon_id,
            Concepticon_Gloss=cpt.concepticon_gloss,
            Chinese_Gloss=cpt.attributes["chinese"],
        )
        concept_map[cpt.english] = cid
    # The raw data capitalizes this gloss differently; alias it.
    concept_map["Daughter-in-law"] = concept_map["daughter-in-law"]

    for row in pylexibank.progressbar(
            wordlist, desc="cldfify", total=len(wordlist)):
        args.writer.add_form_with_segments(
            Language_ID=language_map[wordlist[row, "doculect"]],
            Parameter_ID=concept_map[wordlist[row, "concept"]],
            Value=wordlist[row, "value"],
            Form=wordlist[row, "form"],
            Segments=wordlist[row, "tokens"],
            Source=["Castro2010"],
        )
def add_doculect(self, doculect, values):
    """
    Add a new column (like a new doculect or the like) to the data.

    NOTES
    -----
    For the moment, we assume that we are dealing with doculects and
    concepts, which may be changed later on...
    """
    # Index every requested value column by concept gloss.
    per_value = {}
    for value in values:
        per_value[value] = {self[k, 'concept']: self[k, value] for k in self}
    # Assemble a wordlist template: header row at key 0, then one row
    # per concept carrying the new doculect's values.
    template = {0: ['doculect', 'concept'] + values}
    row_id = 1
    for gloss in self.concepts:
        template[row_id] = [doculect, gloss] + [
            per_value[value][gloss] for value in values]
        row_id += 1
    self.add_data(lingpy.Wordlist(template))
    print('Successfully added new doculect template for {0}'.format(
        doculect))
def cmd_makecldf(self, args):
    """Convert the raw wordlist into a CLDF dataset.

    Languages are matched by their Chinese names; concepts are keyed by
    their normalized Chinese glosses.
    """
    args.writer.add_sources()
    wl = lingpy.Wordlist(self.dir.joinpath("raw", "wordlist.tsv").as_posix())
    concepts = {}
    # Normalize Chinese glosses: drop spaces and '*' markers.
    strip_concept = lambda x: x.replace(" ", "").replace("*", "")
    for concept in self.conceptlists[0].concepts.values():
        args.writer.add_concept(
            ID=concept.id,
            Name=concept.english,
            Chinese_Gloss=strip_concept(concept.attributes["chinese"]),
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[strip_concept(concept.attributes["chinese"])] = concept.id
    # Doculect names in the raw data are Chinese language names.
    langs = {k["ChineseName"]: k["ID"] for k in self.languages}
    args.writer.add_languages()
    for idx in pylexibank.progressbar(wl, desc="cldfify"):
        args.writer.add_form_with_segments(
            Language_ID=langs[wl[idx, "doculect"]],
            Parameter_ID=concepts[strip_concept(wl[idx, "concept"])],
            Value=wl[idx, "value"],
            Form=wl[idx, "form"],
            Segments=wl[idx, "tokens"],
            Source=["Castro2010a"],
        )
def cmd_install(self, **kw):
    """Install the dataset as CLDF (legacy pylexibank API).

    Languages and concepts are created on the fly while iterating over
    the wordlist rows; every lexeme also receives a cognate judgement
    with its alignment.
    """
    wl = lingpy.Wordlist(self.raw.posix('Bruzzi_Granadillo.txt'))
    with self.cldf as ds:
        ds.add_sources(*self.raw.read_bib())
        for k in pb(wl, desc='wl-to-cldf'):
            # Re-adding the same language/concept per row relies on the
            # writer de-duplicating by ID.
            ds.add_language(ID=slug(wl[k, 'doculect']),
                            Name=wl[k, 'doculect'],
                            Glottocode='bani1255')
            ds.add_concept(ID=slug(wl[k, 'concept']),
                           Name=wl[k, 'concept'],
                           Concepticon_ID=wl[k, 'concepticon_id'] or '',
                           Portuguese_Gloss=wl[k, 'concept_portuguese'])
            for row in ds.add_lexemes(
                    Language_ID=slug(wl[k, 'doculect']),
                    Parameter_ID=slug(wl[k, 'concept']),
                    Value=wl[k, 'entrj_in_source'],
                    Form=wl[k, 'ipa'],
                    Segments=wl[k, 'tokens'],
                    Source=[
                        'granadillo_ethnographic_2006',
                        'silva_discoteca_1961'
                    ]):
                # Cognate sets are concept-local: "<concept>-<cogid>".
                cid = slug(wl[k, 'concept'] + '-' + '{0}'.format(wl[k, 'cogid']))
                ds.add_cognate(lexeme=row,
                               Cognateset_ID=cid,
                               Source=['Chacon2018'],
                               Alignment=wl[k, 'alignment'],
                               Alignment_Source='Chacon2018')
def fetch(
        dataset,
        remote_dbase=None,
        concepts=None,
        languages=None,
        columns=None,
        to_lingpy=None,
        transform=None,
        base_url="http://lingulist.de/edictor",
):
    """Fetch a dataset from a remote EDICTOR installation.

    :param dataset: name of the dataset (file) on the server.
    :param remote_dbase: name of the remote sqlite database; defaults to
        ``"<dataset>.sqlite3"`` when not given.
    :param concepts: optional iterable of concepts to restrict the query.
    :param languages: optional iterable of doculects to restrict the query.
    :param columns: optional iterable of column names to retrieve.
    :param to_lingpy: if truthy, parse the response into a wordlist.
    :param transform: optional callable applied to the downloaded file
        name instead of ``lingpy.Wordlist`` when ``to_lingpy`` is set.
    :param base_url: base URL of the EDICTOR service.
    :returns: the parsed wordlist (or ``transform``'s result) when
        ``to_lingpy`` is set, otherwise the decoded response text.
    """
    url = base_url + "/triples/get_data.py?file=" + dataset
    # Bug fix: a caller-supplied remote_dbase used to be silently ignored
    # (the parameter was only appended when it was *not* given). Send it
    # always, falling back to the conventional "<dataset>.sqlite3".
    url += "&remote_dbase=" + (remote_dbase or dataset + ".sqlite3")
    if concepts:
        url += "&concepts=" + "|".join(
            [urllib.parse.quote(c) for c in concepts])
    if languages:
        url += "&doculects=" + "|".join(
            [urllib.parse.quote(c) for c in languages])
    if columns:
        url += "&columns=" + "|".join(columns)
    data = urlopen(url).read()
    if to_lingpy:
        # lingpy reads from file names only, so spool through a temp file.
        with tempfile.NamedTemporaryFile() as tf:
            tf.write(data)
            tf.flush()
            return transform(tf.name) if transform else lingpy.Wordlist(
                tf.name)
    return data.decode("utf-8")
def cmd_makecldf(self, args):
    """Convert the raw Ob-Ugrian wordlist into a CLDF dataset.

    After registering concepts, a block of hard-coded aliases maps the
    gloss variants used in the raw data onto the concept-list glosses.
    """
    concepts = {}
    wl = lp.Wordlist(self.raw_dir.joinpath("OUG.csv").as_posix())
    for concept in self.conceptlists[0].concepts.values():
        idx = "{0}_{1}".format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.english] = idx
    # Aliases: raw-data glosses -> concept-list glosses.
    concepts["bite (V)"] = concepts["bite"]
    concepts["burn (V)"] = concepts["burn tr."]
    concepts["claw"] = concepts["claw (nail)"]
    concepts["come (V)"] = concepts["come"]
    concepts["die (V)"] = concepts["die"]
    concepts["drink (V)"] = concepts["drink"]
    concepts["eat (V)"] = concepts["eat"]
    concepts["fat"] = concepts["fat n."]
    concepts["fly (V)"] = concepts["fly v."]
    concepts["give (V)"] = concepts["give"]
    concepts["hear (V)"] = concepts["hear"]
    concepts["kill (V)"] = concepts["kill"]
    concepts["know (V)"] = concepts["know"]
    concepts["lie (V)"] = concepts["lie"]
    concepts["rain (V)"] = concepts["rain"]
    concepts["say (V)"] = concepts["say"]
    concepts["see (V)"] = concepts["see"]
    concepts["sit (V)"] = concepts["sit"]
    concepts["sleep (V)"] = concepts["sleep"]
    concepts["stand (V)"] = concepts["stand"]
    concepts["swim (V)"] = concepts["swim"]
    concepts["warm (hot)"] = concepts["warm"]
    concepts["walk (go)"] = concepts["walk(go)"]
    languages = args.writer.add_languages(
        lookup_factory="Name", id_factory=lambda x: slug(x["Name"]))
    args.writer.add_sources()
    for idx in wl:
        lexeme = args.writer.add_form(
            Language_ID=languages[wl[idx, "language"]],
            Parameter_ID=concepts[wl[idx, "concept"]],
            Value=wl[idx, "ipa"],
            # Tokens are dot-joined; '#' boundaries are rendered as '-'.
            Form=".".join(wl[idx, "tokens"]).replace("#", "-"),
            # Segments=wl[idx, 'tokens'],
            Source="Zhivlov2011",
            # Negative cognate IDs flag borrowings.
            Loan=True if wl[idx, "cogid"] < 0 else False,
        )
        args.writer.add_cognate(
            lexeme=lexeme,
            Cognateset_ID=wl[idx, "cogid"],
            Cognate_Detection_Method="expert",
            Source=["Zhivlov2011"],
        )
def cmd_makecldf(self, args):
    """Convert the raw Bruzzi/Granadillo wordlist into a CLDF dataset.

    Known-bad segments in the raw tokens are patched via a hard-coded
    replacement table before writing forms and cognate judgements.
    """
    # add sources
    args.writer.add_sources()
    # add languages
    languages = args.writer.add_languages(lookup_factory="Name")
    # add concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda cpt: "%s_%s" % (cpt.id.split("_")[0],
                                          slug(cpt.english)),
        lookup_factory="Name",
    )
    # Hard-coded fixes to segment errors in raw source
    segments = {
        "áː": "áː/aː",
        "âː": "âː/aː",
        "aʰ": "a h",
        "ɐ̃ʰ": "ɐ̃ h",
        "í": "í/i",
        "íː": "íː/iː",
        "iʰ": "i h",
        "i̥": "i̥/i",
        "ka": "k a",
        "kw": "kʷ",  # the single instance is a labialized velar
        "nⁱ": "n i",
        "óː": "óː/oː",
        "teː": "t eː",
        "ú": "u/u",
        '#': '+'
    }
    # read wordlist with lingpy
    wl_file = self.raw_dir / "Bruzzi_Granadillo.txt"
    wl = lingpy.Wordlist(wl_file.as_posix())
    # iterate over wordlist
    for idx in progressbar(wl, desc="makecldf"):
        # write lexemes; the join/split round-trip applies the multi-char
        # replacements and then re-tokenizes on whitespace.
        lex = args.writer.add_form_with_segments(
            Language_ID=languages[wl[idx, "doculect"]],
            Parameter_ID=concepts[wl[idx, "concept"]],
            Value=wl[idx, "entrj_in_source"],
            Form=wl[idx, "ipa"],
            Segments=" ".join(
                [segments.get(x, x) for x in wl[idx, "tokens"]]).split(),
            Source=[
                "granadillo_ethnographic_2006", "silva_discoteca_1961"
            ],
        )
        args.writer.add_cognate(
            lexeme=lex,
            Cognateset_ID=wl[idx, "cogid"],
            Source=["Chacon2019"],
        )
def cmd_makecldf(self, args):
    """Convert the raw Suansu wordlist into a CLDF dataset.

    Concepts are created lazily from the wordlist rows; a hard-coded
    converter patches known segment errors in the raw tokens.
    """
    wl = lingpy.Wordlist(self.raw_dir.joinpath("suansu.tsv").as_posix())
    # Hard-coded corrections of segment errors in the raw tokens.
    converter = {
        "ll": "lː",
        "ddʑ": "dʑː",
        "mm": "mː",
        "nn": "nː",
        "ss": "sː",
        "tts": "tsː",
        "tʂ": "ʈʂː",
        "bb": "bː",
        "dd": "dː",
        "pp": "pː",
        "tt": "tː",
        "ttʰ": "tʰː",
        "ɹɹ": "ɹː",
        "ff": "fː",
        "je": "j e",
        "oj": "oi",
        "ph": "pʰ",
        "th": "tʰ",
        "ttɕ": "tɕː",
        "ttʃ": "tʃː",
        "ma": "m a",
        "ē": "e",
        "ê": "e",
        "ʈʈʂ": "ʈʂː",
        "I": "ɪ",
        "ʷ": "w",
    }
    args.writer.add_sources()
    concepts = {}
    args.writer.add_languages()
    for k in progressbar(wl, desc="wl-to-cldf", total=len(wl)):
        # Bug fix: the guard used to test wl[k, "concepticon_id"] against
        # a dict keyed by concept glosses, so it could never match and a
        # repeated concept would have been added twice.
        if wl[k, "concept"] not in concepts:
            cid = "{0}_{1}".format(wl[k, "concepticon_id"],
                                   slug(wl[k, "concept"]))
            concepts[wl[k, "concept"]] = cid
            args.writer.add_concept(
                ID=cid,
                Name=wl[k, "concept"],
                Concepticon_ID=wl[k, "concepticon_id"],
                Concepticon_Gloss=wl[k, "concepticon_gloss"],
            )
        args.writer.add_form_with_segments(
            Language_ID="Suansu",
            Parameter_ID=concepts[wl[k, "concept"]],
            Value="".join(wl[k, "tokens"]),
            Form="".join(wl[k, "tokens"]),
            # join/split applies multi-char replacements, then re-tokenizes.
            Segments=" ".join(
                [converter.get(x, x) for x in wl[k, "tokens"]]).split(),
            Source=["Ivani2019"],
        )
def cmd_install(self, **kw):
    """Install the Arawakan Swadesh-100 dataset as CLDF (legacy API).

    Source annotations in the raw data are inconsistent, so they are
    mapped to BibTeX keys through a hard-coded table first; unmapped
    entries fall back to an empty source.
    """
    # sources are poorly annotated, so we need to correct manually
    src = {
        "H&R92": "huber_vocabulario_1991",
        "Klumpp95": "",
        "H&R 1992": "huber_vocabulario_1991",
        "None": "",
        "Melendez 2011": "melendez_lozano_diccionario_2011",
        "Epps": "",
        "Schauer2005": "",
        "Allin 1979": "allin_vocabulario_1979",
        "Aikhenvald": "",
        "dp91": "",
        "Aikhenvald 2012": "aikhenvald_dicionario_2012",
        "Aikenvald2001": "aihenvald_dicionario_2001",
        "Oliveira 93": "cunha_de_oliveira_uma_1993",
        "Ramirez2001": "ramirez_dicionario_2001",
        "Ramirez 2001": "ramirez_dicionario_2001",
        "Schauer 2005": "schauer_diccionario_2005",
        "Aikhenvald 2001": "aikhenvald_dicionario_2001"
    }
    wl = lingpy.Wordlist(
        self.raw.posix('arawakan_swadesh_100_edictor.tsv'))
    with self.cldf as ds:
        ds.add_sources(*self.raw.read_bib())
        for l in self.languages:
            ds.add_language(ID=slug(l['Name']),
                            Name=l['Name'],
                            Glottocode=l['Glottocode'])
        for c in self.concepts:
            ds.add_concept(ID=slug(c['ENGLISH']),
                           Name=c['ENGLISH'],
                           Concepticon_ID=c['CONCEPTICON_ID'],
                           Portuguese_Gloss=c['PORTUGUESE'])
        for k in pb(wl, desc='wl-to-cldf'):
            # Skip rows without a value.
            if wl[k, 'value']:
                for row in ds.add_lexemes(
                        Language_ID=slug(wl[k, 'doculect']),
                        Parameter_ID=slug(wl[k, 'concept']),
                        Value=wl[k, 'value'],
                        Form=wl[k, 'form'],
                        Segments=wl[k, 'segments'],
                        Source=src.get(wl[k, 'source'], '')):
                    # Cognate sets are concept-local: "<concept>-<cogid>".
                    cid = slug(wl[k, 'concept'] + '-' +
                               '{0}'.format(wl[k, 'cogid']))
                    ds.add_cognate(lexeme=row,
                                   Cognateset_ID=cid,
                                   Source=['Chacon2017'],
                                   Alignment=wl[k, 'alignment'],
                                   Alignment_Source='Chacon2017')
def cldf(dataset, concepticon, **kw):
    """Build one CLDF (sub)dataset per raw wordlist (legacy API).

    Concept and language names are corrected via lookup tables; rows
    whose (corrected) concept is unknown are collected and reported at
    the end of each subset.
    """
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        # Tokenize on the fly when the raw file has no segments.
        if 'tokens' not in wl.header:
            wl.add_entries('tokens', 'ipa', lp.ipa2tokens,
                           merge_vowels=False, expand_nasals=True)
        src = getEvoBibAsSource(srckey)
        with CldfDataset(
                ('ID', 'Language_ID', 'Language_name', 'Language_iso',
                 'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                 'Segments', 'Cognacy', 'Loan'),
                dataset, subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                # Strip a trailing " (V)" marker before correction.
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                # Negative cognate IDs flag borrowings.
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])
                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                # NOTE(review): the row looks up the *uncorrected* concept
                # in gloss2con and stores the raw 'loan' column, while the
                # cognate row uses the sign-of-cogid flag — confirm both
                # are intended.
                ds.add_row([
                    wid, lang2glot[doculect], wl[k, 'doculect'], '',
                    gloss2con.get(wl[k, 'concept'], ''), wl[k, 'concept'],
                    wl[k, 'ipa'], srckey,
                    ' '.join(wl[k, 'tokens'] or ['']), cogid,
                    wl[k, 'loan']
                ])
                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey,
                    '', '', ''
                ])
            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates,
                                method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
def cmd_makecldf(self, args):
    """Convert the Arawakan Swadesh-100 wordlist into a CLDF dataset.

    Source annotations in the raw data are inconsistent, so they are
    mapped to BibTeX keys through a hard-coded table; unmapped entries
    fall back to "Chacon2017".
    """
    # sources are poorly annotated, so we need to correct manually
    src = {
        "H&R92": "huber_vocabulario_1992",
        "H&R 1992": "huber_vocabulario_1992",
        "Melendez 2011": "melendez_lozano_diccionario_2011",
        "Allin 1979": "allin_vocabulario_1979",
        "Aikhenvald 2012": "aikhenvald_dicionario_2012",
        "Aikenvald2001": "aihenvald_dicionario_2001",
        "Oliveira 93": "cunha_de_oliveira_uma_1993",
        "Ramirez2001": "ramirez_dicionario_2001",
        "Ramirez 2001": "ramirez_dicionario_2001",
        "Schauer 2005": "schauer_diccionario_2005",
        "Aikhenvald 2001": "aikhenvald_dicionario_2001",
    }
    # add source
    args.writer.add_sources()
    # add languages
    languages = args.writer.add_languages(lookup_factory="Name")
    # add concepts
    concepts = args.writer.add_concepts(
        id_factory=lambda c: "%s_%s" % (c.id.split("-")[-1],
                                        slug(c.english)),
        lookup_factory="Name",
    )
    # read raw wordlist add lexemes
    wl_file = self.raw_dir / "arawakan_swadesh_100_edictor.tsv"
    wl = lingpy.Wordlist(wl_file.as_posix())
    for idx in progressbar(wl, desc="makecldf"):
        if wl[idx, "value"]:
            # Drop a spurious leading '_' segment from the raw data.
            if wl[idx, 'segments'][0] == '_':
                wl[idx, 'segments'] = wl[idx, 'segments'][1:]
            lex = args.writer.add_form_with_segments(
                Language_ID=languages[wl[idx, "doculect"]],
                Parameter_ID=concepts[wl[idx, "concept"]],
                Value=wl[idx, "value"],
                Form=wl[idx, "form"],
                Segments=wl[idx, "segments"],
                Source=src.get(wl[idx, "source"], "Chacon2017"),
            )
            # add cognate
            args.writer.add_cognate(lexeme=lex,
                                    Cognateset_ID=wl[idx, "cogid"],
                                    Source=["Chacon2017"])
def cmd_makecldf(self, args):
    """Convert the raw wordlist into a CLDF dataset."""
    args.writer.add_sources()
    lang_by_name = args.writer.add_languages(lookup_factory="Name")
    concept_by_name = args.writer.add_concepts(
        id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                              slug(c.english)),
        lookup_factory="Name")
    wordlist = lingpy.Wordlist(
        self.raw_dir.joinpath("HSH-SCL.csv").as_posix())
    # One (possibly multi-form) entry per wordlist row.
    for row in pylexibank.progressbar(wordlist):
        args.writer.add_forms_from_value(
            Language_ID=lang_by_name[wordlist[row, "language"]],
            Parameter_ID=concept_by_name[wordlist[row, "concept"]],
            Value=wordlist[row, "reflex"],
            Source=["SoHartmann1988"],
        )
def prepare(dataset):
    """Convert the raw Starostin spreadsheet into a LingPy wordlist.

    Each character row may carry readings for up to three historical
    stages (Classical Old Chinese, Eastern Han, Middle Chinese); one
    wordlist row is emitted per non-empty reading. The result is written
    out as the "characters" wordlist.
    """
    data = lingpy.csv2list(dataset.get_path('raw', 'data-starostin.tsv'),
                           strip_lines=False)
    out = {}
    idx = 1
    for line in data[1:]:
        # Only the columns actually used are bound; the raw sheet has
        # many more (unused ones were dead locals before).
        char = line[0]
        coc = line[2]
        lhc = line[7]
        gloss = line[8]
        kg = line[14]
        mch = line[16]
        pinyin = line[18]
        rad = line[20]
        # One row per attested historical reading; kg[:4] (the Karlgren
        # series prefix) serves as the phonetic class.
        for doculect, doculect_in_source, reading in (
                ('Old_Chinese', 'Classical Old Chinese', coc),
                ('Late_Han_Chinese', 'Eastern Han Chinese', lhc),
                ('Middle_Chinese', 'Middle Chinese', mch),
        ):
            if reading.strip():
                out[idx] = [
                    char, pinyin, doculect, doculect_in_source, reading,
                    rad, kg[:4], kg, gloss
                ]
                idx += 1
    out[0] = [
        'character', 'pinyin', 'doculect', 'doculect_in_source', 'reading',
        'semantic_class', 'phonetic_class', 'karlgren_id', 'gloss'
    ]
    dataset.write_wordlist(lingpy.Wordlist(out, row='character'),
                           'characters')
def cmd_makecldf(self, args):
    """Convert the raw Chinese wordlists into a CLDF dataset.

    The main wordlist provides dialect forms with expert cognate
    judgements; Old Chinese forms come from a separate CSV and are linked
    to the existing cognate sets via their proto-forms, opening fresh
    sets for unknown proto-forms.
    """
    wl = lingpy.Wordlist(self.raw_dir.joinpath("chinese.tsv").as_posix())
    maxcogid = 0
    args.writer.add_sources()
    args.writer.add_languages(id_factory=lambda l: l["Name"])
    args.writer.add_concepts(
        id_factory=lambda c: slug(c.label, lowercase=False))
    # Map (concept, proto-form) -> cognate-set ID.
    p2c = {}
    for k in wl:
        for row in args.writer.add_lexemes(
                Language_ID=wl[k, "doculect"],
                Parameter_ID=slug(wl[k, "concept"], lowercase=False),
                Value=wl[k, "ipa"],
                Source="Hamed2006",
                Cognacy=wl[k, "COGID"],
        ):
            args.writer.add_cognate(lexeme=row,
                                    Cognateset_ID=wl[k, "cogid"],
                                    Source=["Hamed2006", "List2015"])
        maxcogid = max([maxcogid, int(wl[k, "cogid"])])
        p2c[wl[k, "concept"], wl[k, "proto"]] = wl[k, "cogid"]
    for line in lingpy.csv2list(
            self.raw_dir.joinpath("old_chinese.csv").as_posix()):
        # line[0] is the concept, line[1] a ", "-separated list of
        # Old Chinese proto-forms.
        for val in line[1].split(", "):
            cogid = p2c.get((line[0], val))
            if not cogid:
                # Unknown proto-form: open a fresh cognate set.
                maxcogid += 1
                cogid = p2c[line[0], val] = maxcogid
            for row in args.writer.add_lexemes(
                    Language_ID="OldChinese",
                    Parameter_ID=slug(line[0], lowercase=False),
                    Value=val,
                    Source="Hamed2006",
                    # Bug fix: was p2c.get(val, val), which always missed
                    # (p2c is keyed by (concept, proto) tuples) and thus
                    # stored the proto-form string instead of the id.
                    Cognacy=cogid,
            ):
                args.writer.add_cognate(lexeme=row,
                                        Cognateset_ID=cogid,
                                        Source=["Hamed2006", "List2015"])
def cmd_makecldf(self, args):
    """Convert the raw wordlist into a CLDF dataset."""
    # Read the raw data.
    wordlist = lingpy.Wordlist(
        self.raw_dir.joinpath("YN-RGLD.csv").as_posix())
    args.writer.add_sources()
    concept_by_srcid = args.writer.add_concepts(
        id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                              slug(c.english)),
        lookup_factory="SrcId")
    lang_by_name = args.writer.add_languages(lookup_factory="Name")
    # Add lexemes; rows with unknown languages or concepts are skipped.
    for row, doculect, srcid, reflex in pylexibank.progressbar(
            wordlist.iter_rows("doculect", "srcid", "reflex"),
            desc="make-cldf"):
        if doculect not in lang_by_name or srcid not in concept_by_srcid:
            continue
        args.writer.add_forms_from_value(
            Language_ID=lang_by_name[doculect],
            Parameter_ID=concept_by_srcid[srcid],
            Value=reflex,
            Source=["Nagano2013"],
        )
def cmd_makecldf(self, args):
    """Convert the raw Sino-Tibetan wordlist into a CLDF dataset.

    Languages carry per-doculect source keys; forms with empty or
    boundary-only token strings are skipped, and every written form gets
    a concept-local cognate judgement.
    """
    wl = lingpy.Wordlist(str(self.raw_dir / "sino-tibetan-raw.tsv"))
    args.writer.add_sources()
    concepts = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.id.split("-")[-1] + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            TBL_ID=concept.attributes["huang_1992_1820"],
            Name=concept.english,
            Coverage=concept.attributes["coverage"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.english] = idx
    # Map in-source doculect names to language IDs and source keys.
    languages, sources = {}, {}
    for language in self.languages:
        args.writer.add_language(**language)
        languages[language["Name_in_Source"]] = language["ID"]
        sources[language["Name_in_Source"]] = language["Source"]
    for idx in pylexibank.progressbar(wl, desc="cldfify"):
        # Skip rows whose tokens are empty or consist only of '+'.
        if wl[idx, "tokens"] and " ".join(wl[idx, "tokens"]).strip("+"):
            row = args.writer.add_form(
                Language_ID=languages[wl[idx, "doculect"]],
                Local_ID=idx,
                Parameter_ID=concepts[wl[idx, "concept"]],
                # Prefer the source entry, then tokens, then raw IPA.
                Value=wl[idx, "entry_in_source"].strip()
                or "".join(wl[idx, "tokens"]) or wl[idx, "ipa"],
                Form=".".join(wl[idx, "tokens"]),
                # Source keys may be comma-separated lists.
                Source=sources[wl[idx, "doculect"]].split(","),
                Comment=wl[idx, "note"],
                Cognacy=wl[idx, "cogid"],
                Loan=True if wl[idx, "borrowing"].strip() else False,
            )
            args.writer.add_cognate(
                lexeme=row,
                # Cognate sets are concept-local: "<cogid>-<concept>".
                Cognateset_ID="{0}-{1}".format(wl[idx, "cogid"],
                                               slug(wl[idx, "concept"])),
                Source="Sagart2018",
                Alignment="",
                Alignment_Source="",
            )
def cldf(dataset, concepticon, **kw):
    """Build the CLDF dataset from the raw wordlist (legacy API)."""
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name',
             'Value', 'Segments', 'Source'),
            dataset) as ds:
        ds.sources.add(src)
        for k in wl:
            # NOTE(review): `not in '---'` is a substring test, so it also
            # filters the bare values '-' and '--' — presumably intended as
            # missing-data markers; confirm.
            if wl[k, 'value'] not in '---' and wl[k, 'value'].strip():
                ds.add_row([
                    wl[k, 'lid'],
                    gcode[wl[k, 'doculect']],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'chinese'],
                    wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0],
                    SOURCE
                ])
def cmd_makecldf(self, args):
    """Convert the old CLICS wordlist into a CLDF dataset.

    Languages and concepts are created lazily while iterating; Concepticon
    glosses are resolved through a full Concepticon instance.
    """
    wl = lingpy.Wordlist(
        self.raw_dir.joinpath('D_old-clics.tsv').as_posix())
    args.log.info('loaded wordlist')
    # Map in-data source labels to BibTeX keys.
    src = {
        'wold': 'Wold2009',
        'ids': 'Key2007',
        'logos': 'Logos2008',
        'Språkbanken': 'Saxena2013'
    }
    args.writer.add_sources()
    concepts = set()
    languages = set()
    # Concepticon-ID -> gloss, for filling in Concepticon_Gloss.
    concepticon = {
        c.id: c.gloss for c in Concepticon().conceptsets.values()
    }
    args.log.info('added concepticon')
    for k in progressbar(wl, desc='wl-to-cldf'):
        # Rows without a value are skipped.
        if wl[k, 'value']:
            if wl[k, 'doculect'] not in languages:
                args.writer.add_language(
                    ID=slug(wl[k, 'doculect'], lowercase=False),
                    Name=wl[k, 'doculect'],
                    Glottocode=wl[k, 'glottolog'])
                languages.add(wl[k, 'doculect'])
            if wl[k, 'concept'] not in concepts:
                args.writer.add_concept(
                    ID=slug(wl[k, 'concept'], lowercase=False),
                    Name=wl[k, 'concept'],
                    Concepticon_ID=wl[k, 'concepticon_id'],
                    Concepticon_Gloss=concepticon.get(
                        wl[k, 'concepticon_id'], ''))
                concepts.add(wl[k, 'concept'])
            args.writer.add_lexemes(
                Language_ID=slug(wl[k, 'doculect'], lowercase=False),
                Parameter_ID=slug(wl[k, 'concept'], lowercase=False),
                Value=wl[k, 'value'],
                Source=src.get(wl[k, 'source'], ''))
def cmd_makecldf(self, args):
    """Convert the raw Sinitic wordlist into a CLDF dataset.

    The wordlist uses a 'languages' column as its doculect column; a few
    raw glosses are aliased onto the concept-list glosses.
    """
    concepts = {}
    wl = lp.Wordlist(self.raw_dir.joinpath('SIN.csv').as_posix(),
                     col='languages')
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.gloss))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.gloss,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.gloss] = idx
    # Aliases: raw-data glosses -> concept-list glosses.
    concepts['thunder'] = concepts['thunder (verb)']
    concepts['lightning'] = concepts['flash (verb)']
    concepts['soja sauce'] = concepts['soya sauce']
    concepts['light'] = concepts['watery']
    concepts['two pairs'] = concepts['two ounces']
    languages = args.writer.add_languages(
        lookup_factory="ID", id_factory=lambda x: x['ID'])
    args.writer.add_sources()
    for idx in wl:
        lexeme = args.writer.add_form(
            Language_ID=languages[wl[idx, 'languages']],
            Parameter_ID=concepts[wl[idx, 'concept']],
            Value=wl[idx, 'ortho'],
            # '#' marks boundaries in the raw IPA; render them as '-'.
            Form=wl[idx, 'ipa'].replace('#', '-'),
            Source='Hou2004',
            # Negative cognate IDs flag borrowings.
            Loan=True if wl[idx, 'cogid'] < 0 else False
        )
        args.writer.add_cognate(
            lexeme=lexeme,
            Cognateset_ID=wl[idx, 'cogid'],
            Cognate_Detection_Method='expert',
            Source=['List2014d']
        )
def cmd_makecldf(self, args):
    """Convert the raw sign-language alphabet data into a CLDF dataset.

    Concepts are the wordlist's row glosses, numbered from 1; the
    misspelled doculect 'Ukranian_SL' is mapped to the corrected
    language ID and given an explicit source.
    """
    wl = lingpy.Wordlist(str(self.raw_dir / 'signalphabets.tsv'))
    concepts, sources = {}, {}
    for i, c in enumerate(wl.rows):
        args.writer.add_concept(
            ID=str(i + 1),
            Name=c,
        )
        concepts[c] = str(i + 1)
    for language in self.languages:
        args.writer.add_language(
            ID=language['Name_in_Database'],
            Name=language['Name'],
            Latitude=language['Latitude'],
            Longitude=language['Longitude'],
            Glottocode=language['Glottolog'],
            SubGroup=language['SubGroup'],
        )
        sources[language['Name_in_Database']] = language['Source']
    # The raw data's misspelled doculect needs its own source entry …
    sources['Ukranian_SL'] = 'Lydell2018'
    # … and is mapped onto the corrected language ID.
    languages = {language: language for language in sources}
    languages['Ukranian_SL'] = 'Ukrainian_SL'
    # Only sources actually referenced are added from the bib file.
    args.writer.add_sources(
        *[x for x in self.raw_dir.read_bib() if x.id in sources])
    for i, c, l, h1, h2, t, cid in progressbar(
            wl.iter_rows('concept', 'doculect', 'handshape_1',
                         'handshape_2', 'tokens', 'cogid'),
            desc='makecldf'):
        # Value is the pair of handshapes; Form the joined tokens.
        row = args.writer.add_form(Value=h1 + ' ' + h2,
                                   Language_ID=languages[l],
                                   Parameter_ID=concepts[c],
                                   Form=' '.join(t),
                                   Source=sources[l])
        args.writer.add_cognate(
            lexeme=row,
            Cognateset_ID=cid,
        )
def cmd_makecldf(self, args):
    """Convert the old CLICS wordlist into a CLDF dataset."""
    wordlist = lingpy.Wordlist((self.raw_dir / "D_old-clics.tsv").as_posix())
    # Map in-data source labels to BibTeX keys.
    source_map = {"logos": "Logos2008"}
    args.writer.add_sources(*self.raw_dir.read_bib())
    concept_map = args.writer.add_concepts(
        id_factory=lambda c: "{0}_{1}".format(c.id.split("-")[-1],
                                              slug(c.english)),
        lookup_factory="Name")
    for row in progressbar(wordlist):
        # Rows without a value are skipped.
        if not wordlist[row, "value"]:
            continue
        doculect = wordlist[row, "doculect"]
        lang_id = slug(doculect, lowercase=False)
        args.writer.add_language(
            ID=lang_id,
            Name=doculect,
            Glottocode=wordlist[row, "glottolog"],
        )
        args.writer.add_form(
            Language_ID=lang_id,
            Parameter_ID=concept_map[wordlist[row, "concept"]],
            Value=wordlist[row, "value"],
            Form=wordlist[row, "value"],
            Source=source_map.get(wordlist[row, "source"], ""),
        )
def cmd_makecldf(self, args):
    """Convert the raw Huber wordlist into a CLDF dataset.

    Concepts are matched via their alternative "lexibank_gloss" entries;
    the meta-language rows (English, Español) and values listed in the
    module-level ``exclude`` set are skipped.
    """
    # column "counterpart_doculect" gives us the proper names of the doculects
    wl = lingpy.Wordlist((self.raw_dir / self.DSETS[0]).as_posix(),
                         col="counterpart_doculect")
    args.writer.add_sources()
    language_lookup = args.writer.add_languages(
        lookup_factory="Name_in_Source")
    concept_lookup = {}
    for concept in self.conceptlists[0].concepts.values():
        idx = concept.number + "_" + slug(concept.english)
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        # Every alternative gloss maps onto the same concept ID.
        for lg in concept.attributes["lexibank_gloss"]:
            concept_lookup[lg] = idx
    rows = [(doculect, concept, value, qlcid)
            for (idx, doculect, concept, value, qlcid) in wl.iter_rows(
                "counterpart_doculect", "concept", "counterpart", "qlcid")
            if doculect not in ["English", "Español"]]
    for doculect, concept, value, qlcid in rows:
        if value in exclude:
            continue
        else:
            args.writer.add_form(
                Language_ID=language_lookup[doculect],
                Parameter_ID=concept_lookup[concept],
                Value=value,
                Form=value,
                Source=["Huber1992"],
                Local_ID=qlcid,
            )
def cmd_install(self, **kw):
    """Install the Huber basic/cultural vocabulary as CLDF (legacy API).

    Only wordlist rows whose concept appears among the dataset's
    registered in-source glosses are written; each lexeme also gets a
    concept-local cognate judgement with its alignment.
    """
    # column "counterpart_doculect" gives us the proper names of the doculects
    wl = lingpy.Wordlist(
        self.raw.posix('Huber_filtered_130_basic_cult_voc'))
    with self.cldf as ds:
        ds.add_sources(*self.raw.read_bib())
        for l in self.languages:
            ds.add_language(ID=slug(l['Name']),
                            Name=l['Name'],
                            Glottocode=l['Glottocode'])
        for c in self.concepts:
            ds.add_concept(ID=slug(c['GLOSS_IN_SOURCE']),
                           Name=c['ENGLISH'],
                           Concepticon_ID=c['CONCEPTICON_ID'] or '',
                           Spanish_Gloss=c['SPANISH'])
        # specify valid entries in the data
        valid_entries = [c['GLOSS_IN_SOURCE'] for c in self.concepts]
        for k in pb(wl, desc='wl-to-cldf'):
            if wl[k, 'concept'] in valid_entries:
                for row in ds.add_lexemes(
                        Language_ID=slug(wl[k, 'doculect']),
                        Parameter_ID=slug(wl[k, 'concept']),
                        Value=wl[k, 'counterpart'],
                        Form=wl[k, 'counterpart'],
                        Segments=wl[k, 'tokens'],
                        Source='Huber1992'):
                    # Cognate sets are concept-local: "<concept>-<cogid>".
                    cid = slug(wl[k, 'concept'] + '-' +
                               '{0}'.format(wl[k, 'cogid']))
                    ds.add_cognate(lexeme=row,
                                   Cognateset_ID=cid,
                                   Source=['Chacon2017'],
                                   Alignment=wl[k, 'alignment'],
                                   Alignment_Source='Chacon2017')
def cmd_install(self, **kw):
    """Install the dataset as CLDF (legacy pylexibank API).

    Languages carry per-doculect source keys; only rows with segmented
    tokens are written.
    """
    wl = lingpy.Wordlist(self.raw.posix('D_subset-300-22.tsv'))
    # doculect name -> [source key, language ID]
    source_dict = {}
    # concept gloss -> concept ID
    concept_dict = {}
    sources = {source.id: source for source in self.raw.read_bib()}
    with self.cldf as ds:
        ds.add_sources(sources[SOURCE])
        for l in self.languages:
            ds.add_language(ID=l['ID'],
                            Name=l['Name'],
                            Glottocode=l['Glottocode'])
            source_dict[l['Name']] = [l['Source'], l['ID']]
            # Each language's own source is registered as well.
            ds.add_sources(sources[l['Source']])
        for c in self.concepts:
            ds.add_concept(
                ID=c['ID'],
                Concepticon_ID=c['Concepticon_ID'],
                Concepticon_Gloss=c['Concepticon_Gloss'],
                Name=c['Gloss'],
            )
            concept_dict[c['Gloss']] = c['ID']
        for k in pb(wl, desc='wl-to-cldf', total=len(wl)):
            if wl[k, 'tokens']:
                ds.add_lexemes(
                    Language_ID=source_dict[wl[k, 'doculect']][1],
                    Parameter_ID=concept_dict[wl[k, 'concept']],
                    # Fall back to the joined tokens when IPA is empty.
                    Value=wl[k, 'ipa'].strip() or ''.join(wl[k, 'tokens']),
                    Form=wl[k, 'ipa'],
                    Segments=wl[k, 'tokens'],
                    Source=[source_dict[wl[k, 'doculect']][0]],
                    Comment=wl[k, 'note'])
def cmd_makecldf(self, args):
    """Convert the raw Indo-European wordlist into a CLDF dataset."""
    concepts = {}
    wl = lp.Wordlist(self.raw_dir.joinpath('IEL.csv').as_posix())
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Number=concept.number,
            Name=concept.english,
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
        )
        concepts[concept.english] = idx
    languages = args.writer.add_languages(
        lookup_factory="Name", id_factory=lambda x: slug(x['Name']))
    args.writer.add_sources()
    for idx in wl:
        lexeme = args.writer.add_form(
            Language_ID=languages[wl[idx, 'language']],
            Parameter_ID=concepts[wl[idx, 'concept']],
            Value=wl[idx, 'originalform'],
            # Tokens are dot-joined; '#' boundaries are rendered as '-'.
            Form='.'.join(wl[idx, 'tokens']).replace('#', '-'),
            #Segments=wl[idx, 'tokens'],
            Source='Dunn2012',
            # Negative cognate IDs flag borrowings.
            Loan=True if wl[idx, 'cogid'] < 0 else False
        )
        args.writer.add_cognate(
            lexeme=lexeme,
            Cognateset_ID=wl[idx, 'cogid'],
            Cognate_Detection_Method='expert',
            Source=['Dunn2012']
        )