def cldf(dataset, concepticon, **kw):
    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        lid = ods.name.split('-')[-1]
        fields = list(ods.fields) + [
            'Language_local_ID', 'Parameter_local_ID', 'Loan', 'Context'
        ]
        with CldfDataset(fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                clld.url(__name__, path='/meaning/{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                clld.url(__name__, path='/language/{Language_local_ID}')
            ds.table.schema.columns['Word_ID'].valueUrl = \
                clld.url(__name__, path='/word/{Word_ID}')
            ds.metadata.update(
                {k: v for k, v in ods.metadata.items() if k.startswith('dc:')})
            ds.sources.add(*ods.sources.items())
            for row in ods.rows:
                if row['Language_ID'] == 'None':
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                keys = list(row.keys())
                for i, (form, context) in enumerate(split(row['Value'])):
                    _row = row.to_list()
                    _row[keys.index('Value')] = form
                    _row[keys.index('ID')] = '%s-%s' % (row['ID'], i + 1)
                    # Note: We count words marked as "probably borrowed" as loans.
                    _row.extend([
                        lid,
                        row['WOLD_Meaning_ID'],
                        float(row['Borrowed_score']) > 0.6,
                        context,
                    ])
                    ds.add_row(_row)
    assert not unmapped
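# The `split()` helper used above (and in several converters below) is assumed
# to break a raw value like "kora (arch.)" into a bare form plus an optional
# context annotation. The following is only a hypothetical sketch of such a
# splitter; the real implementation lives elsewhere in this repo and may differ:
#
# import re
#
# def split(value):
#     """Yield (form, context) pairs from a raw lexical value."""
#     for chunk in re.split(r'[,;]', value):
#         chunk = chunk.strip()
#         if not chunk:
#             continue
#         match = re.match(r'(?P<form>[^(]+)(\((?P<context>[^)]*)\))?\s*$', chunk)
#         if match:
#             yield match.group('form').strip(), match.group('context')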
def cldf(dataset, concepticon, **kw):
    for dset, srckey in zip(DSETS, SOURCES):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        src = getEvoBibAsSource(srckey)
        with CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'CLPA', 'Cognacy', 'Partial_cognacy'),
                dataset, subset=dset.split('-')[0]) as ds:
            ds.sources.add(src)
            for k in wl:
                ds.add_row([
                    '{0}-{1}'.format(srckey, k),
                    wl[k, 'glottolog'],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens']),
                    ' '.join(wl[k, 'clpa']),
                    wl[k, 'cogid'],
                    ' '.join([str(x) for x in wl[k, 'partialids']]),
                ])
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                idf = '-'.join([slug(concept), '%s' % wl[k, 'cogid']])
                cognates += [[
                    '{0}-{1}'.format(srckey, k), ds.name, wl[k, 'ipa'], idf,
                    '', 'expert', srckey, '', '', ''
                ]]
            dataset.cognates.extend(
                iter_alignments(wl, cognates, method='progressive',
                                prefix=srckey + '-'))
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    with UnicodeReader(
            dataset.raw.joinpath(FILENAME.replace('xls', 'Sheet1.csv'))) as r:
        rows = [row for row in r]
    concepts = [(i, rows[0][i].replace('_', ' ').strip())
                for i in range(1, len(rows[0]), 2)]
    assert all(concept in concepticon for _, concept in concepts)

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments', 'Source'),
                     dataset) as ds:
        ds.table.schema.columns['Value']['dc:format'] = 'IPA'
        ds.sources.add(getEvoBibAsSource(SOURCE))
        for row in rows[3:]:
            row = [col.strip() for col in row]
            if not row[0]:
                continue
            lname = row[0]
            for i, concept in concepts:
                for j, form in iterforms(row[i]):
                    if form != '?' and form.strip():
                        ds.add_row([
                            '%s-%s-%s' % (slug(lname), (i + 1) // 2, j),
                            language_map[lname],
                            lname.replace('_', ' '),
                            concepticon[concept],
                            concept,
                            form,
                            ' '.join(clean_string(form)),
                            SOURCE,
                        ])
        # Three methods for cognate detection: turchin, sca, lexstat;
        # turchin is fast and does not need a threshold.
        cognates = iter_cognates(ds, column='Segments', method='turchin',
                                 threshold=0.55)
        # Two methods for alignments: progressive or library.
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments',
                            method='progressive'))
        dataset.write_cognates()
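# The comments above name the available options. As a sketch only (assuming the
# same `iter_cognates`/`iter_alignments` signatures used in the function above),
# the slower LexStat variant with an explicit threshold and library-based
# alignment would look like this:
#
# cognates = iter_cognates(ds, column='Segments', method='lexstat', threshold=0.55)
# dataset.cognates.extend(
#     iter_alignments(ds, cognates, column='Segments', method='library'))
# dataset.write_cognates()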
def cldf(dataset, concepticon, **kw):
    gloss2con = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}
    lang2glot = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}

    for dset, srckey in zip(DSETS, sources):
        wl = lp.Wordlist(dataset.raw.joinpath(dset).as_posix())
        if 'tokens' not in wl.header:
            wl.add_entries('tokens', 'ipa', lp.ipa2tokens,
                           merge_vowels=False, expand_nasals=True)
        src = getEvoBibAsSource(srckey)
        with CldfDataset((
                'ID', 'Language_ID', 'Language_name', 'Language_iso',
                'Parameter_ID', 'Parameter_name', 'Value', 'Source',
                'Segments', 'Cognacy', 'Loan'),
                dataset, subset=dset.split('.')[0]) as ds:
            ds.sources.add(src)
            errors = []
            cognates = []
            for k in wl:
                concept = wl[k, 'concept']
                if '(V)' in concept:
                    concept = concept[:-4]
                concept = correct_concepts.get(concept, concept)
                if concept not in gloss2con:
                    errors += [concept]
                doculect = correct_languages.get(wl[k, 'doculect'],
                                                 wl[k, 'doculect'])
                loan = wl[k, 'cogid'] < 0
                cogid = abs(wl[k, 'cogid'])
                wid = '{0}-{1}'.format(dset.split('.')[0], k)
                ds.add_row([
                    wid,
                    lang2glot[doculect],
                    wl[k, 'doculect'],
                    '',
                    gloss2con.get(wl[k, 'concept'], ''),
                    wl[k, 'concept'],
                    wl[k, 'ipa'],
                    srckey,
                    ' '.join(wl[k, 'tokens'] or ['']),
                    cogid,
                    wl[k, 'loan'],
                ])
                cognates.append([
                    wid, ds.name, wl[k, 'ipa'], cogid,
                    'borrowed' if loan else '', 'expert', srckey, '', '', ''
                ])
            dataset.cognates.extend(
                iter_alignments(lp.Alignments(wl), cognates, method='library'))
            for er in sorted(set(errors)):
                print(er, dset)
def cldf(dataset, concepticon, **kw):
    gcode = {x['ID']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()
    }
    data = defaultdict(dict)
    for fname in dataset.raw.glob('*.csv'):
        read_csv(fname, data)

    cognatesets = []
    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'),
                     dataset) as ds:
        for doculect, wl in data.items():
            for concept, (form, loan, cogset) in wl.items():
                wid = '%s-%s' % (slug(doculect), slug(concept))
                if concept in ccode:
                    csid = ccode[concept]
                elif concept.startswith('to ') and concept[3:] in ccode:
                    csid = ccode[concept[3:]]
                else:
                    csid = None
                ds.add_row([
                    wid,
                    gcode[doculect.split('-')[0]],
                    doculect,
                    csid,
                    concept,
                    form,
                    '',
                ])
                if cogset:
                    cognatesets.append([
                        wid,
                        ds.name,
                        form,
                        '%s-%s' % (slug(concept), cogset),
                        False,
                        'expert',
                        '', '', '', '',
                    ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))
def cldf(dataset, concepticon, **kw): """ Implements the conversion of the raw data to CLDF dataset(s). :param dataset: provides access to the information in supplementary files as follows:\ - the JSON object from `metadata.json` is available as `dataset.md`\ - items from languages.csv are available as `dataset.languages`\ - items from concepts.csv are available as `dataset.concepts`\ - if a Concepticon conceptlist was specified in metadata.json, its ID is available\ as `dataset.conceptlist` :param glottolog: a pyglottolog.api.Glottolog` instance. :param concepticon: a pyconcepticon.api.Concepticon` instance. :param kw: All arguments passed on the command line. """ with CldfDataset(REQUIRED_FIELDS, dataset) as ds: pass
def cldf(dataset, concepticon, **kw):
    with UnicodeReader(dataset.raw.joinpath('Wang2004.csv'),
                       delimiter='\t') as reader:
        lines = list(reader)
    lmap = dict([(x['ABBREVIATION'], (x['GLOTTOCODE'], x['ISO'], x['NAME']))
                 for x in dataset.languages])
    cmap = {c.english: c.concepticon_id
            for c in dataset.conceptlist.concepts.values()}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        idx = 1
        cogids = {0: 0}
        for i, line in enumerate(lines[1:]):
            concept = line[0]
            cid = cmap[concept]
            for t, cogs in zip(lines[0][1:], line[1:]):
                glottocode, iso, taxon = lmap[t]
                for cog in cogs.split('/'):
                    if cog in cogids:
                        cogid = cogids[cog]
                    else:
                        cogid = max(list(cogids.values()) or [0]) + 1
                        cogids[cog] = cogid
                    ds.add_row((
                        idx, glottocode, taxon, iso, cid, concept, cog,
                        SOURCE, cogid))
                    dataset.cognates.append([
                        idx, ds.name, cog,
                        '-'.join([slug(concept), str(cogid)]),
                        '', 'expert', SOURCE, '', '', ''])
                    idx += 1
def cldf(dataset, concepticon, **kw):
    concepts = {x['GLOSS']: x['CONCEPTICON_ID'] for x in dataset.concepts}

    # dictionary to be passed to lingpy
    D = {}
    D[0] = [
        'doculect', 'glottolog', 'concept', 'concepticon', 'ipa', 'segments',
        'cogid', 'alignment'
    ]
    idx = 1
    for f in FILES:
        msa = lp.MSA(
            dataset.raw.joinpath('phonalign_{0}.msa'.format(f)).as_posix())
        # strip quotation marks from concept
        concept = msa.seq_id[1:-1]
        cid = concepts.get(concept, '')
        for i, taxon in enumerate(msa.taxa):
            if taxon in languages:
                tid = languages[taxon]
                alignment = ' '.join(msa.alignment[i])
                tokens = ' '.join([x for x in msa.alignment[i] if x != '-'])
                ipa = tokens.replace(' ', '')
                cogid = '{0}-{1}'.format(concept, f)
                D[idx] = [
                    taxon, tid, concept, cid, ipa, tokens, cogid, alignment
                ]
                idx += 1

    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_iso',
             'Parameter_ID', 'Parameter_name', 'Value', 'Segments',
             'Cognacy', 'Source'),
            dataset) as ds:
        src = getEvoBibAsSource('Heggarty2007')
        ds.sources.add(src)
        src = getEvoBibAsSource('List2014e')
        ds.sources.add(src)
        alm = lp.Alignments(D)
        for k in alm:
            ds.add_row(
                ['Heggarty2007-{0}'.format(k)] +
                [alm[k, x] or '' for x in
                 ['glottolog', 'taxon', 'iso', 'concepticon', 'concept', 'ipa']] +
                [' '.join(alm[k, 'tokens']), alm[k, 'cogid'], 'Heggarty2007'])
            dataset.cognates += [[
                'Heggarty2007-{0}'.format(k), ds.name, alm[k, 'ipa'],
                alm[k, 'cogid'], '', 'expert', 'Heggarty2007',
                alm[k, 'alignment'], 'expert', 'List2014e'
            ]]
        dataset.write_cognates()
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    src = getEvoBibAsSource(SOURCE)
    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Language_iso',
                      'Parameter_ID', 'Parameter_name',
                      'Parameter_Chinese_name', 'Value', 'Segments',
                      'Source'),
                     dataset) as ds:
        ds.sources.add(src)
        for k in wl:
            if wl[k, 'value'] not in '---' and wl[k, 'value'].strip():
                ds.add_row([
                    wl[k, 'lid'],
                    gcode[wl[k, 'doculect']],
                    wl[k, 'doculect'],
                    '',
                    wl[k, 'concepticon_id'],
                    wl[k, 'concept'],
                    wl[k, 'chinese'],
                    wl[k, 'value'],
                    clean_string(wl[k, 'value'])[0],
                    SOURCE,
                ])
def cldf(dataset, concepticon, **kw):
    wl = lp.Alignments(dataset.raw.joinpath('tukano.tsv').as_posix())
    src1 = getEvoBibAsSource('Chacon2014')
    src2 = getEvoBibAsSource('Chacon2015')
    gloss2conc = {r['GLOSS']: r['CONCEPTICON_ID'] for r in dataset.concepts}

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Segments',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(src1)
        ds.sources.add(src2)
        for k in wl:
            lid = wl[k, 'language']
            concept = wl[k, 'concept']
            segments = wl[k, 'tokens']
            value = wl[k, 'ipa']
            cogid = wl[k, 'cogid']
            alignment = wl[k, 'alignment']
            name, iso = abbr2lang[lid]
            cid = gloss2conc.get(concept)
            ds.add_row((
                'Chacon2014-' + str(k),
                dataset.glottocode_by_iso.get(iso, ''),
                name,
                iso,
                cid,
                concept,
                value,
                'Chacon2014',
                ' '.join(segments),
                str(cogid)))
            cogid = '-'.join([slug(wl[k, 'concept']), '%s' % cogid])
            dataset.cognates.append([
                'Chacon2014-' + str(k), ds.name, wl[k, 'ipa'], cogid, '',
                'expert', 'Chacon2014', alignment, 'expert', 'Chacon2015'
            ])
def cldf(dataset, concepticon, **kw):
    unmapped = set()
    for ods in clld.itercldf(dataset, __name__):
        lid = ods.name.split('-')[-1]
        fields = list(ods.fields) + [
            'Language_local_ID', 'Parameter_local_ID', 'Value_in_source'
        ]
        with CldfDataset(fields, dataset, subset=lid) as ds:
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                clld.url(__name__, path='/parameters/{Parameter_local_ID}')
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                clld.url(__name__, path='/contributions/{Language_local_ID}')
            ds.metadata.update(
                {k: v for k, v in ods.metadata.items() if k.startswith('dc:')})
            ds.sources.add(*ods.sources.items())
            for row in ods.rows:
                if row['Language_ID'] == 'None':
                    row['Language_ID'] = None
                    unmapped.add((row['Language_name'], lid))
                val, row['Value'] = row['Value'], clean_form(row['Value'])
                ds.add_row(row.to_list() +
                           [lid, '-'.join(row['ID'].split('-')[:2]), val])
def cldf(dataset, concepticon, **kw):
    data = get_all(dataset)
    gl_map = {k: v.id for k, v in dataset.glottolog_languoids.items()}
    gl_map.update(dataset.glottocode_by_iso)

    swadesh_concepts = {
        k: v for k, v in data['word'].items()
        if v['id'] in data['concept_ids']
    }

    def normalized_gloss(gloss):
        if gloss.startswith('to '):
            gloss = gloss[3:].strip()
        if '/' in gloss:
            gloss = gloss.split('/')[0].strip()
        if '(' in gloss:
            gloss = gloss.split('(')[0].strip()
        if gloss.endswith('?'):
            gloss = gloss[:-1]
        return gloss

    swadesh2concepticon = {
        'right (hand)': '2183',
        'we incl. (pronoun d:1p, incl)': '1131',
        'left (hand)': '2182',
        'right (correct, true)': '1725',
        'in, inside': '1460',
        'to lie down': '215',
    }
    for conceptlist in [
            'Swadesh-1960-200', 'Swadesh-1971-100', 'Swadesh-1955-100',
            'Swadesh-1950-215', 'Swadesh-1955-215'
    ]:
        for d in concepticon.conceptlists[conceptlist].concepts.values():
            swadesh2concepticon.setdefault(d.english, d.concepticon_id)

    concept_map = {}
    for concept in swadesh_concepts.values():
        gloss = normalized_gloss(concept['word'])
        if gloss in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[gloss]
        elif concept['word'] in swadesh2concepticon:
            concept_map[concept['id']] = swadesh2concepticon[concept['word']]
        else:
            raise ValueError(concept['word'])
    assert len(concept_map) == len(set(concept_map.values()))

    for c in dataset.concepts:
        if c['CONCEPTICON_ID']:
            concept_map[int(c['ID'])] = c['CONCEPTICON_ID'] or None

    uc = Counter()
    unmapped = Unmapped(lambda r: int(r[0]))
    for language_url, words in groupby(
            sorted(data['lexicon'].values(), key=lambda i: i['language']),
            lambda i: i['language']):
        contribution = data['language'][language_url]
        with CldfDataset((
                'ID',
                'Language_ID',
                'Language_iso',
                'Language_name',
                'Language_local_ID',
                'Parameter_ID',
                'Parameter_name',
                'Parameter_local_ID',
                'Value',
                'Source',
                'Cognate_Set',
                'Comment',
                'Loan',
        ), dataset, subset=contribution['id']) as ds:
            cname = contribution['language']
            if contribution['dialect']:
                cname += ' (%s Dialect)' % contribution['dialect']
            lid = gl_map.get(contribution['glottocode'])
            if not lid:
                lid = gl_map.get(contribution['isocode'])
            if not lid:
                unmapped.languages.add(
                    (contribution['id'], cname, contribution['isocode']))
            if contribution['information']:
                ds.metadata['dc:description'] = contribution['information']
            ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
            ds.table.schema.columns['Loan'].datatype = 'boolean'
            ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
                '%s/word/{Parameter_local_ID}' % BASE_URL
            ds.table.schema.columns['Language_local_ID'].valueUrl = \
                '%s/language/{Language_local_ID}' % BASE_URL
            for word in words:
                concept = data['word'][word['word']]
                if concept['id'] not in concept_map:
                    unmapped.concepts.add((concept['id'], concept['word']))
                    uc.update([concept['word']])
                src = data['source'].get(word['source'])
                if src:
                    ds.sources.add(
                        Source('misc', src['slug'],
                               author=src['author'],
                               year=src['year'],
                               transnewguinea_id=BASE_URL + '/source/' + src['slug'],
                               title=src['reference']))
                ds.add_row([
                    word['id'],
                    lid,
                    contribution['isocode'],
                    cname,
                    contribution['slug'],
                    concept_map.get(concept['id']),
                    concept['word'],
                    concept['slug'],
                    word['entry'],
                    src['slug'] if src else None,
                    None,
                    word['annotation'],
                    word['loan'],
                ])
    unmapped.pprint()
def cldf(dataset, concepticon, **kw):
    concept_map = {
        int(c['GLOSS']): c['CONCEPTICON_ID'] or None
        for c in dataset.concepts
    }
    gc_pattern = re.compile('[a-z0-9]{4}[1-9][0-9]{3}$')

    meta = {}
    for row in read_csv(dataset, 'META'):
        meta[(row[5], row[9])] = dict(
            zip(
                'NAME,COUNTRY,ISO,GLOTTO_NAME,GLOTTO_CODE,LG_LINK,AUDIO,SOURCE,NR_SETS,VARIANT'
                .lower().split(','),
                row))

    sources = {}
    sid = 0
    for spec in meta.values():
        if spec['source'] and spec['source'] not in sources:
            sid += 1
            sources[spec['source']] = Source('misc', 's%s' % sid,
                                             title=spec['source'])

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Comment',
    ), dataset) as ds:
        for key, items in groupby(
                sorted(read_csv(dataset, 'NUMERAL'),
                       key=lambda r: (r[2], r[3], r[0])),
                lambda r: (r[2], r[3])):
            if key not in meta:
                continue
            if int(float(key[1])) > 1:
                continue
            md = meta[key]
            source, ref = sources.get(md['source']), None
            if source:
                ds.sources.add(source)
                ref = source.id
            if gc_pattern.match(md['glotto_code']):
                for concept, rows in groupby(items, lambda k: k[0]):
                    if not concept.endswith('.0'):
                        continue
                    iconcept = int(float(concept))
                    if iconcept not in concept_map:
                        unmapped.concepts.add((iconcept, iconcept))
                    for k, row in enumerate(rows):
                        ds.add_row([
                            '%s-%s-%s' % (lgid(row[2]), iconcept, k + 1),
                            md['glotto_code'],
                            md['name'],
                            concept_map.get(iconcept),
                            '%s' % iconcept,
                            row[1],
                            ref,
                            row[4] or None,
                        ])
    unmapped.pprint()
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }

    header, rows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Wordlists.ActualWordlists.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                header = row
            if i > 0:
                rows.append(row)

    cheader, crows = None, []
    with UnicodeReader(
            dataset.raw.joinpath(
                'Semitic.Codings.Multistate.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            row = [c.strip() for c in row]
            if i == 0:
                cheader = row
            if i > 0:
                crows.append(row)

    langs = header[1:]
    clean_langs = {
        """Gɛ'ɛz""": "Ge'ez",
        "Tigrɛ": "Tigre",
        'ʷalani': "Walani",
        "Ogadɛn Arabic": "Ogaden Arabic",
        "Mɛhri": "Mehri",
        "Gibbali": "Jibbali",
    }
    correct_concepts = {
        'Cold (air)': 'Cold (of air)',
    }
    src = getEvoBibAsSource('Kitchen2012')

    with CldfDataset(('ID', 'Language_ID', 'Language_name', 'Parameter_ID',
                      'Parameter_name', 'Value', 'Segments'),
                     dataset) as ds:
        D = {0: ['doculect', 'concept', 'ipa', 'tokens']}
        idx = 1
        ds.sources.add(src)
        for row in rows:
            concept = row[0]
            for i, col in enumerate(row[1:]):
                lang = langs[i]
                if col != '---':
                    cleaned_string = clean_string(
                        col,
                        merge_vowels=False,
                        preparse=PREPARSE,
                        rules=CONVERSION,
                        semi_diacritics='')[0]
                    ds.add_row([
                        'Kitchen2012-' + str(idx),
                        language_map[lang],
                        clean_langs.get(lang, lang),
                        concepticon[concept],
                        concept,
                        col,
                        cleaned_string,
                    ])
                    D[idx] = [
                        clean_langs.get(lang, lang), concept, col,
                        cleaned_string
                    ]
                    idx += 1

        wl = lp.Wordlist(D)
        id2cog = {}
        errors = []
        for row in crows:
            taxon = row[0]
            for i, (concept, cog) in enumerate(zip(cheader[1:], row[1:])):
                nconcept = rows[i][0]
                if cog != '-':
                    idxs = wl.get_dict(taxon=taxon)
                    if idxs.get(nconcept, ''):
                        id2cog[idxs[nconcept][0]] = concept + '-' + cog
                    else:
                        errors += [(concept, nconcept, taxon)]

        # assign fresh cognate identifiers to words without an expert coding
        bad_cogs = 1
        cognates = []
        for k in wl:
            if k in id2cog:
                cogid = id2cog[k]
            else:
                cogid = str(bad_cogs)
                bad_cogs += 1
                id2cog[k] = cogid
        wl.add_entries('cog', id2cog, lambda x: x)
        wl.renumber('cog')
        for k in wl:
            cognates += [[
                'Kitchen2012-' + str(k), ds.name, wl[k, 'ipa'],
                wl[k, 'concept'] + '-' + str(wl[k, 'cogid']), '', 'expert',
                'Kitchen2012', '', '', ''
            ]]
        dataset.cognates.extend(iter_alignments(lp.Alignments(wl), cognates))
def cldf(dataset, concepticon, **kw):
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    glotto_map = {c['NAME']: c['GLOTTOCODE'] for c in dataset.languages}

    # retrieve coordinates
    coords = {}
    langs = []
    # language map, as the names are not identical
    language_map = {
        "Namhsan": "Nam Hsan",
        "Pangkham": "Pang Kham",
        "Xiang Zhai Tang (Xiang Cai Tang)": "Xiang Zhai Tang"
    }
    with UnicodeReader(
            dataset.raw.joinpath('100item-phylo.Sheet2.csv')) as reader:
        for i, (num, lat, lon, village, country) in enumerate(reader):
            if i >= 1:
                coords[language_map.get(village, village)] = (lat, lon)
                langs.append(language_map.get(village, village))

    cognates = []
    idx = 1
    with UnicodeReader(dataset.raw.joinpath('100item-phylo.Sheet1.csv'),
                       delimiter=',') as reader, \
            CldfDataset((
                'ID',
                'Language_ID',
                'Language_name',
                'Language_iso',
                'Parameter_ID',
                'Parameter_name',
                'Value',
                'Source',
                'Segments',
                'Cognacy'
            ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource('Deepadung2015'))
        ds.metadata['coordinates'] = coords
        data = list(reader)
        header = data[2][2:]
        for i, row in enumerate(data[5:]):
            row = [c.strip() for c in row]
            concept = row[1]
            cid = concept_map[concept]
            for j in range(0, len(header), 2):
                lang = language_map.get(header[j], header[j])
                gcid = glotto_map[lang]
                cog = slug(concept) + '-' + row[2:][j + 1]
                certainty = 0
                if ' or ' in cog:
                    cog = cog.split(' ')[0]
                    certainty = 1
                word = CORRECT.get(row[2:][j], row[2:][j])
                if word.strip() and ''.join(set(word.strip())) != '-':
                    segments = lp.sequence.sound_classes.clean_string(
                        word,
                        splitters=',',
                        rules=CONVERSION,
                        preparse=PREPARSE,
                        semi_diacritics="")[0]
                    cogid = slug(concept) + '-' + cog
                    ds.add_row([
                        idx, gcid, lang, '', cid, concept, word, PROVIDER,
                        segments, cogid
                    ])
                    cognates.append([
                        idx, ds.name, word, cogid, str(certainty), 'expert',
                        PROVIDER, '', '', ''
                    ])
                    idx += 1
        dataset.cognates.extend(
            iter_alignments(ds, cognates, method='progressive'))
def cldf(dataset, concepticon, **kw):
    concept_map = {
        re.sub(r'^(\*|\$)', '', c.english): c.concepticon_id
        for c in dataset.conceptlist.concepts.values()}
    for c in dataset.concepts:
        concept_map[(c['ID'], c['GLOSS'])] = c['CONCEPTICON_ID'] or None
    language_map = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}

    concepts = []
    languages = {}
    for path in dataset.raw.glob('languages-language-*.json'):
        data = jsonlib.load(path)
        data['glottocode'] = language_map[data['id']]
        languages[data['id']] = data
    for path in sorted(
            dataset.raw.glob('lexical-feature-*.json'),
            key=lambda p: int(p.stem.split('-')[-1])):
        data = jsonlib.load(path)
        data['concepticon'] = concept_map.get(data['concept'])
        if not data['concepticon']:
            data['concepticon'] = concept_map[(data['id'], data['concept'])]
        concepts.append(data)

    fields = defaultdict(lambda: Counter())
    sources = {}
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Semantic_field',
            'Value',
            'Context',
            'Loan',
            'Phonemic',
            'Source',
            'Creator',
            'Comment',
    ), dataset) as ds:
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/lexical/feature/{Parameter_local_ID}'
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            'https://huntergatherer.la.utexas.edu/languages/language/{Language_local_ID}'
        for param in concepts:
            for lid, items in groupby(
                    sorted(param['items'], key=lambda i: i['Language']),
                    lambda i: i['Language']):
                lid = lid.split('/')[-1]
                if lid in missing_languages:
                    continue
                lang = languages[lid]
                i = 0
                for item in items:
                    form = item['Orthographic Form'].strip()
                    refs = [ref for ref in itersources(item, lang, sources) if ref]
                    ds.sources.add(*[ref.source for ref in refs])
                    for k, v in item.items():
                        if v:
                            fields[k].update([v])
                    for fform, context in split(form):
                        i += 1
                        ds.add_row([
                            '%s-%s-%s' % (lid, param['id'], i),
                            lang['glottocode'],
                            lang['ISO 639-3'],
                            lang['name'],
                            lang['id'],
                            param['concepticon'],
                            param['concept'],
                            param['id'],
                            param['Semantic Field'],
                            fform,
                            context,
                            bool(item['Loan Source'] or item['Wanderwort Status']),
                            item['Phonemicized Form'] or None,
                            ';'.join('%s' % ref for ref in refs),
                            item.get('Created By'),
                            item.get('General Notes'),
                        ])
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id
             for x in dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        ds.sources.add(src, src2)
        # map proto-forms to cognate sets
        p2c = {}
        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'cogid'],
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k), ds.name, wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 'expert', SOURCE, '', '', ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']

        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Old Chinese',
                    '',
                    ccode[line[0]],
                    line[0],
                    val,
                    SOURCE,
                    p2c.get(val, val),
                ))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx), ds.name, val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '', 'expert', SOURCE, '', '', ''
                ]]
                idx += 1
def cldf(dataset, concepticon, **kw):
    concepticon = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()}
    lmap = {l['ID']: l['GLOTTOCODE'] or None for l in dataset.languages}
    lmap_name = {l['ID']: l['NAME'] or None for l in dataset.languages}

    cognate_sets = defaultdict(list)
    for (cid, c), w, missing in parse(dataset.raw.joinpath('galucio-tupi.txt'),
                                      lmap):
        assert c in concepticon
        if c in LANGUAGE_ID_FIXES:
            f, t = LANGUAGE_ID_FIXES[c]
            w = re.sub(f + r'\s+', t + ' ', w, count=1)
            missing = re.sub(f + r'\s+', t + ' ', missing, count=1)
        if missing:
            assert re.match(
                r'((?P<lid>%s)\s*\?\s*)+$' % '|'.join(list(lmap.keys())),
                missing)
            missing = missing.replace('?', ' ').split()
        lids = set(missing[:])
        for m in re.finditer(r'(?P<lid>[A-Z][a-z])\s+', w):
            lids.add(m.group('lid'))
        # make sure all language IDs are valid
        assert not lids.difference(set(lmap.keys()))
        nlids = missing[:]
        for cs in iter_cogsets(w, lmap):
            cognate_sets[(cid, c)].append(cs)
            nlids.extend(list(cs.keys()))
        nlids = set(nlids)
        # make sure we found all expected language IDs
        assert nlids == lids

    cognatesets = []
    with CldfDataset(
            ('ID', 'Language_ID', 'Language_name', 'Language_local_ID',
             'Parameter_ID', 'Parameter_name', 'Parameter_local_ID', 'Value',
             'Segments'),
            dataset) as ds:
        for (cid, concept), cogsets in cognate_sets.items():
            for j, cogset in enumerate(cogsets):
                for lid, words in sorted(cogset.items(), key=lambda k: k[0]):
                    for i, word in enumerate(words):
                        wid = '%s-%s-%s-%s' % (lid, cid, j + 1, i + 1)
                        ds.add_row([
                            wid,
                            lmap[lid],
                            lmap_name[lid],
                            lid,
                            concepticon[concept],
                            concept,
                            cid,
                            word,
                            '',
                        ])
                        cognatesets.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (cid, j + 1),
                            False,
                            'expert',
                            '', '', '', '',
                        ])
        segmentize(ds, clean=lambda s: s.split(' ~ ')[0])
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))
def to_cldf(self, concept_map, unmapped, citekey=None, source=None,
            concept_key=None):
    if concept_key is None:
        concept_key = lambda entry: entry.word_id

    if not self.language.glottocode:
        unmapped.languages.add(
            (self.language.id, self.language.name, self.language.iso))

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_iso',
            'Language_name',
            'Language_local_ID',
            'Parameter_ID',
            'Parameter_name',
            'Parameter_local_ID',
            'Value',
            'Value_in_source',
            'Segments',
            'Context',
            'Source',
            'Cognate_Set',
            'Comment',
            'Loan',
    ), self.dataset, subset=self.language.id) as ds:
        ds.metadata['dc:creator'] = self.language.author
        ds.metadata['dc:identifier'] = self.url('language.php?id=%s' %
                                                self.language.id)
        if self.language.typedby:
            ds.metadata['dc:contributor'] = self.language.typedby
        if self.language.checkedby:
            ds.metadata['dc:contributor'] = self.language.checkedby
        if self.language.notes:
            ds.metadata['dc:description'] = self.language.notes

        ds.table.schema.aboutUrl = '%s.csv#{ID}' % ds.name
        ds.table.schema.columns['Loan'].datatype = 'boolean'
        ds.table.schema.columns['Parameter_local_ID'].valueUrl = \
            self.url('word.php?v=1{Parameter_local_ID}')
        ds.table.schema.columns['Language_local_ID'].valueUrl = \
            self.url('language.php?id={Language_local_ID}')

        ref = None
        if citekey and source:
            ref = citekey
            ds.sources.add(Source('misc', citekey, title=source))

        for entry in self.entries:
            if entry.name == '?':
                continue
            if not (citekey and source):
                src = entry.e.find('source')
                if src and getattr(src, 'text'):
                    ref = slug(text_type(src.text))
                    ds.sources.add(Source('misc', ref, title=src.text))
            cid = concept_map.get(concept_key(entry))
            if not cid:
                unmapped.concepts.add((entry.word_id, entry.word))
            for i, (form, context) in enumerate(util.split(entry.name)):
                ds.add_row([
                    '{0}-{1}'.format(entry.id, i + 1),
                    self.language.glottocode,
                    self.language.iso,
                    self.language.name,
                    self.language.id,
                    cid,
                    entry.word,
                    entry.word_id,
                    util.clean_form(form),
                    form,
                    '',
                    context,
                    ref,
                    entry.cognacy,
                    entry.comment or '',
                    entry.loan == 'L',
                ])
        segmentize(ds)
    return ds
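# A hedged usage sketch for `to_cldf`; `wordlists`, the citekey 'Author2008' and
# the source title are placeholders (not values from this repo), and
# `concept_map` is assumed to be keyed by whatever `concept_key(entry)` returns
# (entry.word_id by default):
#
# unmapped = Unmapped()
# for wordlist in wordlists:
#     wordlist.to_cldf(
#         concept_map,
#         unmapped,
#         citekey='Author2008',
#         source='Primary source title')
# unmapped.pprint()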
def cldf(dataset, concepticon, **kw):
    concepticon = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concepticon['you (sing.)'] = concepticon['you (sing.) (thou)']
    concepticon['you (pl.)'] = concepticon['you (pl.) (ye)']
    concepticon['to itch/itchy'] = concepticon['to itch/to be itchy']
    concepticon['medicine'] = concepticon['medicine/juice']
    concepticon['excrement/shit'] = concepticon['feces/excrement/shit']

    language_map = {
        'Tampuon': 'Tampuan',
        'Palaung-Namhsan-Taunggyi': 'Palaung-Namhsan',
        'Jru-Laven\u02d0': 'Jru-Laven',
        'Pnar-Jaintia': 'Pnar',
        'K-Surin': 'Khmer-Surin',
    }

    languages = {}
    words = []
    with UnicodeReader(dataset.raw.joinpath('ds.Sheet1.csv')) as reader:
        for i, row in enumerate(reader):
            if 3 <= i < 125:
                languages[row[1]] = row
            elif i > 334:
                words.append(row)
    lids = [int(float(r[0])) for r in languages.values()]
    assert min(lids) == 1 and max(lids) == 122

    glottolog = dataset.glottocode_by_iso
    glottolog.update(
        {l['NAME']: l['GLOTTOCODE'] or None for l in dataset.languages})

    sources = {}
    for src, langs in groupby(sorted(languages.values(), key=lambda r: r[6]),
                              lambda r: r[6]):
        langs = [l[1] for l in langs]
        src = Source('misc', '_'.join(map(slug, langs)), title=src)
        for lang in langs:
            sources[lang] = src
    sources['cognates'] = getEvoBibAsSource(SOURCE)

    unmapped = Unmapped()
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Language_iso',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(*sources.values())
        D = {0: ['lid', 'doculect', 'concept', 'ipa', 'tokens', 'cog']}
        for i, row in enumerate(words):
            form = row[4]
            if not form or form in '*-':
                continue
            assert row[1] in concepticon
            lang = language_map.get(row[3], row[3].strip())
            assert lang in languages
            gc = glottolog.get(glottolog.get(languages[lang][7]), lang)
            if not gc:
                unmapped.languages.add(('', lang, languages[lang][7]))
            # get segments
            segments = clean_string(form)[0]
            # get cognate identifier
            cogid = row[5] if row[5].strip() and row[5].strip() != '*' else (
                'e%s' % i)
            cogid = row[1] + '-' + cogid
            lid = '{0}-{1}'.format(ds.name, i + 1)
            ds.add_row([
                lid,
                glottolog.get(lang, glottolog.get(languages[lang][7])),
                lang,
                languages[lang][7],
                concepticon[row[1]],
                row[1],
                form,
                segments,
                sources[lang].id,
                None,
            ])
            D[i + 1] = [lid, lang, row[1], form, segments, cogid]

        wl = lp.Wordlist(D)
        wl.renumber('cog')
        alm = lp.Alignments(wl)
        dataset.cognates.extend(
            iter_alignments(alm, wordlist2cognates(wl, ds, SOURCE)))
    unmapped.pprint()
def cldf(dataset, concepticon, **kw): """ Implements the conversion of the raw data to CLDF dataset(s). :param dataset: provides access to the information in supplementary files as follows:\ - the JSON object from `metadata.json` is available as `dataset.md`\ - items from languages.csv are available as `dataset.languages`\ - items from concepts.csv are available as `dataset.concepts`\ - if a Concepticon conceptlist was specified in metadata.json, its ID is available\ as `dataset.conceptlist` :param glottolog: a pyglottolog.api.Glottolog` instance. :param concepticon: a pyconcepticon.api.Concepticon` instance. :param kw: All arguments passed on the command line. """ wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix()) # get language identifiers lids, cids, coords = {}, {}, {} for row in dataset.languages: language = row['NAME'] lids[language] = row['GLOTTOCODE'] coords = dict([wl.coords[taxon] for taxon in lids]) modify = { 'thunder (verb)': 'thunder', 'flash (verb)': 'lightning', 'room': 'flat', 'have diarrea': 'have diarrhoea', 'watery': 'light' } for row in dataset.concepts: concept = modify[row['CONCEPT']] if row['CONCEPT'] in modify else \ row['CONCEPT'] cids[concept] = row['CONCEPT_SET'] # language ids src = getEvoBibAsSource(SOURCE) src2 = getEvoBibAsSource('List2014b') # get partial identifiers partial_ids = defaultdict(list) partial_converter = {} idx = 1 for k in wl: for char in wl[k, 'counterpart']: if char in partial_converter: pidx = partial_converter[char] else: pidx = idx partial_converter[char] = idx idx += 1 partial_ids[k] += [pidx] # trace if proto-langugages was visited visited = [] idx = max([k for k in wl]) + 1 with CldfDataset( ('ID', 'Language_ID', 'Language_name', 'Language_iso', 'Parameter_ID', 'Parameter_name', 'Parameter_Chinese_name', 'Value', 'Value_Chinese_characters', 'Source', 'Segments', 'Cognacy', 'Rank', 'Comment'), dataset) as ds: ds.sources.add(src) ds.sources.add(src2) D = {0: ['doculect', 'concept', 'ipa', 'tokens', 'cogid']} for k in wl: tokens = lp.ipa2tokens(wl[k, 'ipa'], merge_vowels=False, expand_nasals=True) # remove sandhi-annotation in tokens, as it is confusing clpa for i, t in enumerate(tokens): if '⁻' in t: tokens[i] = t[:t.index('⁻')] ds.add_row([ '{0}-{1}'.format(SOURCE, k), lids[wl[k, 'doculect']], wl[k, 'doculect'], '', cids[wl[k, 'concept']], wl[k, 'concept'], wl[k, 'mandarin'], wl[k, 'ipa'], wl[k, 'counterpart'], SOURCE, ' '.join(tokens), wl[k, 'cogid'], wl[k, 'order'], wl[k, 'note'] if wl[k, 'note'] != '-' else '', ]) D[k] = [ wl[k, 'doculect'], wl[k, 'concept'], wl[k, 'ipa'], tokens, wl[k, 'cogid'] ] if wl[k, 'cogid'] not in visited: # we need to add new tones, otherwise it won't work, so we # split syllables first, then check if the syllable ends with # tone or not and add a '1' if this is not the case syllables = wl[k, 'mch'].split('.') for i, s in enumerate(syllables): if s[-1] not in '²³': if s[-1] not in 'ptk': syllables[i] += '¹' else: syllables[i] += '⁴' tokens = lp.ipa2tokens(''.join(syllables)) ds.add_row([ '{0}-{1}'.format(wl[k, 'concept'], idx), 'sini1245', 'Middle Chinese', '', cids[wl[k, 'concept']], wl[k, 'concept'], '', wl[k, 'proto'], wl[k, 'counterpart'], SOURCE, ' '.join(tokens), wl[k, 'cogid'], '', '' ]) D[idx] = [ 'Middle Chinese', wl[k, 'concept'], wl[k, 'mch'], tokens, wl[k, 'cogid'] ] idx += 1 visited += [wl[k, 'cogid']] alms = lp.Alignments(D) cognates = [[ '{0}-{1}'.format(SOURCE, k), ds.name, alms[k, 'ipa'], '-'.join([slug(alms[k, 'concept']), str(alms[k, 'cogid'])]), '', 'expert', SOURCE, '', '', '' ] 
for k in alms] dataset.cognates.extend( iter_alignments(alms, cognates, method='library'))
def cldf(dataset, concepticon, **kw):
    language_map = {l['NAME']: l['GLOTTOCODE'] or None
                    for l in dataset.languages}
    concept_map = {
        x.english: x.concepticon_id
        for x in dataset.conceptlist.concepts.values()}

    data = OrderedDict()
    # The English concept labels in the two Excel sheets differ in one place:
    gloss_map = {'road/path': 'road'}

    header, rows = read_csv(dataset, 'Data')
    for row in rows:
        data[row[0]] = {
            'language': row[0],
            'source': row[-1],
            'items': OrderedDict(zip(header[1:-2], row[1:-2])),
        }
    ids = [slug(l['language']) for l in data.values()]
    assert len(set(ids)) == len(ids)

    header, rows = read_csv(dataset, 'Multistate')
    for row in rows:
        ldata = data[row[0]]
        for j, csid in enumerate(row[1:]):
            concept = header[j + 1]
            try:
                csid = '%s' % int(float(csid))
            except ValueError:
                assert csid == '?'
            ldata['items'][gloss_map.get(concept, concept)] = (
                ldata['items'][gloss_map.get(concept, concept)], csid)

    unmapped = Unmapped()
    sources = {}
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Cognacy',
    ), dataset) as ds:
        for lang in data.values():
            if not language_map[lang['language']]:
                unmapped.languages.add((lang['language'], lang['language'], ''))
            ref = ''
            if lang['source']:
                ref = get_ref(lang, sources)
                if ref:
                    ds.sources.add(ref.source)
                    ref = '%s' % ref
            for concept, item in lang['items'].items():
                if concept not in concept_map:
                    unmapped.concepts.add((slug(concept), concept))
                wid = '%s-%s' % (slug(lang['language']), slug(concept))
                if ds.add_row([
                        wid,
                        language_map[lang['language']],
                        lang['language'],
                        concept_map.get(concept),
                        concept,
                        item[0] if clean_string_with_validation(item[0]) else None,
                        clean_string_with_validation(item[0]),
                        ref,
                        item[1],
                ]) and item[1] != '?':
                    dataset.cognates.append([
                        wid,
                        ds.name,
                        item[0],
                        '%s-%s' % (slug(concept), item[1]),
                        False,
                        'expert',
                        '', '', '', '',
                    ])
    dataset.write_cognates()
    unmapped.pprint()
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }
    concept_map['year'] = '1226'  # dunno why this is missing, it's 200 words...

    wordlists = list(read_csv(dataset))
    cogsets = defaultdict(lambda: defaultdict(list))
    for wl in wordlists:
        for concept, (words, cogids) in wl.words.items():
            if len(cogids) == 1:
                cogsets[concept][cogids[0]].append(words[0])

    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'Source',
            'Comment',
    ), dataset) as ds:
        ds.sources.add(getEvoBibAsSource(SOURCE))
        cognates = []
        for wl in wordlists:
            for concept, (words, cogids) in wl.words.items():
                if len(cogids) > 1:
                    if len(words) < len(cogids):
                        if len(words) == 1:
                            if ':' in words[0]:
                                words = words[0].split(':')
                            if ',' in words[0]:
                                words = words[0].split(',')
                        assert len(words) >= len(cogids)
                        assert (wl.language, concept) in COGSET_MAP
                    if len(words) > len(cogids):
                        assert (wl.language, concept) in COGSET_MAP
                if (wl.language, concept) in COGSET_MAP:
                    word_to_cogid = COGSET_MAP[(wl.language, concept)]
                else:
                    word_to_cogid = dict(izip_longest(words, cogids))
                for i, word in enumerate(words):
                    if word.startswith('(') and word.endswith(')'):
                        word = word[1:-1].strip()
                    wid = '%s-%s-%s' % (slug(wl.language), slug(concept), i + 1)
                    ds.add_row([
                        wid,
                        '',
                        wl.language,
                        concept_map.get(concept, ''),
                        concept,
                        word,
                        clean_string(word, splitters='?')[0],
                        SOURCE,
                        '',
                    ])
                    if word_to_cogid.get(word):
                        cognates.append([
                            wid,
                            ds.name,
                            word,
                            '%s-%s' % (slug(concept), word_to_cogid[word]),
                            False,
                            'expert',
                            SOURCE,
                            '', '', '',
                        ])
        dataset.cognates.extend(
            iter_alignments(ds, cognates, column='Segments'))
def cldf(dataset, concepticon, **kw):
    language_map = {
        l['NAME']: l['GLOTTOCODE'] or None
        for l in dataset.languages
    }
    concept_map = {
        c.english: c.concepticon_id
        for c in dataset.conceptlist.concepts.values()
    }

    wordsh, words = read_csv(dataset, 'supplementary.Sheet1.csv', 0)
    cognatesh, cognates = read_csv(dataset, 'Japonic_recovered.Sheet1.csv', 1)

    def concepts(h, step):
        l = h[2:]
        return {i + 2: l[i] for i in range(0, len(l), step)}

    word_index_to_concept = concepts(wordsh, 1)
    assert all(c in concept_map for c in word_index_to_concept.values())
    assert len(words) == len(cognates)

    def sorted_(l):
        return sorted(l, key=lambda r: r[:2])

    cognatesets = []
    with CldfDataset((
            'ID',
            'Language_ID',
            'Language_name',
            'Parameter_ID',
            'Parameter_name',
            'Value',
            'Segments',
            'AltTranscription',
    ), dataset) as ds:
        for i, (word, cognate) in enumerate(zip(sorted_(words),
                                                sorted_(cognates))):
            if not word[1]:
                continue
            if word[1] == 'Nigata':
                word[1] = 'Niigata'
            assert word[:2] == cognate[:2]
            lname = word[1]
            lid = slug(lname)
            for index, concept in word_index_to_concept.items():
                if word[index] == '?':
                    continue
                wid = '%s-%s' % (lid, index - 1)
                cindex = (index - 1) * 2
                assert cognatesh[cindex] == concept
                ds.add_row([
                    wid,
                    language_map[lname],
                    lname,
                    concept_map[concept],
                    concept,
                    word[index],
                    '',
                    cognate[cindex],
                ])
                cs = cognate[cindex + 1]
                for css in cs.split('&'):
                    css = css.strip()
                    if css != '?':
                        css = int(float(css))
                        cognatesets.append([
                            wid,
                            ds.name,
                            word[index],
                            '%s-%s' % (index - 1, css),
                            False,
                            'expert',
                            '', '', '', '',
                        ])
        segmentize(ds)
        dataset.cognates.extend(
            iter_alignments(ds, cognatesets, column='Segments'))