def check(self, column='TOKENS', rules=False, clpa=None):
    """Validate the segmented forms in *column* against CLPA.

    If *rules* is given, it is loaded as an alias mapping and applied to
    the tokens in *column* in place before checking.  Each row gets two
    new fields, ``CLPA_TOKENS`` and ``CLPA_IDS``, holding the checked
    tokens and their CLPA identifiers.

    Returns the ``(sounds, errors)`` pair accumulated by
    ``clpa.check_sequence`` across all rows.
    """
    clpa = clpa or get_clpa()
    if rules:
        rules = load_alias(rules)
        # Rewrite each sequence through the alias table before checking.
        for row in self:
            remapped = [rules.get(token, token) for token in split(row[column])]
            row[column] = join(remapped)
    sounds, errors = {}, Counter({'convertable': 0, 'non-convertable': 0})
    for row in self:
        checked, sounds, errors = clpa.check_sequence(
            split(row[column]), sounds=sounds, errors=errors)
        clpa_ids = [clpa.segment2clpa(token) for token in checked]
        row['CLPA_TOKENS'] = join(checked)
        row['CLPA_IDS'] = join(clpa_ids)
    return sounds, errors
def check(self, column='TOKENS', rules=False, clpa=None):
    """Run a CLPA conformance check over every row of this collection.

    When *rules* is truthy it is passed through ``load_alias`` and the
    resulting mapping is used to rewrite the tokens stored in *column*
    before checking.  The checked tokens and their CLPA ids are written
    back to each row under ``CLPA_TOKENS`` and ``CLPA_IDS``.

    Returns ``(sounds, errors)`` as accumulated by ``check_sequence``.
    """
    if clpa is None:
        clpa = get_clpa()
    if rules:
        alias = load_alias(rules)
        for entry in self:
            entry[column] = join(
                [alias[seg] if seg in alias else seg
                 for seg in split(entry[column])])
    sounds = {}
    errors = Counter({'convertable': 0, 'non-convertable': 0})
    for entry in self:
        segments, sounds, errors = clpa.check_sequence(
            split(entry[column]), sounds=sounds, errors=errors)
        entry['CLPA_TOKENS'] = join(segments)
        entry['CLPA_IDS'] = join([clpa.segment2clpa(seg) for seg in segments])
    return sounds, errors
def inventories(dataset):
    """Build per-dialect sound inventories from the dataset's raw data.

    Reads ``raw/inventories.tsv``, normalizes each sound value through the
    orthography profile in ``raw/profile.prf`` (NFC-normalized), and writes
    the collected inventories via ``dataset.write_inventories``.

    A warning is printed for every row whose STRUCTURE and CDDB
    tokenizations disagree in length.
    """
    # Called for its potential loading/caching side effect; the return
    # value itself is not needed here.  (Previously bound to an unused
    # local, alongside unused `dialects` and `sounds` locals — removed.)
    get_clpa()

    files = glob(dataset.get_path('raw', 'inventories.tsv'))
    t = Tokenizer(dataset.get_path('raw', 'profile.prf'))

    def transform(value, column):
        """Tokenize *value* with profile column *column*, NFC-normalized."""
        return unicodedata.normalize('NFC', t.transform(value, column))

    invs = {language: [] for language in dataset.languages}
    for f in files:
        for i, line in enumerate(csv2list(f)):
            number, dialect, page, sound, value, *rest = line
            if not rest:
                rest = ['']
            cddb = transform(value, 'CDDB')
            src = transform(value, 'SOURCE')
            struct = ' '.join(t.transform(value, 'STRUCTURE'))
            invs[dialect].append(
                [src.replace(' ', ''), cddb, struct, ', '.join(rest)])
            # STRUCTURE and CDDB should tokenize to the same length;
            # report the (1-based) row number when they do not.
            if len(struct.split()) != len(cddb.split()):
                print(i + 1, 'warn', struct, ' | ', cddb)
    dataset.write_inventories(invs)
def setUp(self):
    """Provide a default CLPA instance plus one with a custom alias rule."""
    self.clpa = get_clpa()
    self.clpa2 = CLPA(rules={'th': 't'})
# coding=utf-8 from __future__ import unicode_literals, print_function from collections import defaultdict, Counter from clldutils.misc import slug from six.moves.urllib.request import urlopen from lingpy.sequence.sound_classes import clean_string, tokens2class import lingpy as lp from pyclpa.base import get_clpa from pybtex import database clpa = get_clpa() def getEvoBibAsSource(key): """Download bibtex format and parse it from EvoBib""" return database.parse_string( urlopen("http://bibliography.lingpy.org/raw.php?key=" + key).read().decode('utf-8'), bib_format='bibtex') def wordlist2cognates(wordlist, dataset, source, expert='expert', ref='cogid'): """Turn a wordlist into a cognate set list, using the cldf parameters.""" return [[ wordlist[k, 'lid'], dataset.name, wordlist[k, 'ipa'], '{0}-{1}'.format(slug(wordlist[k, 'concept']), wordlist[k, ref]), '', expert, source, '', '', '' ] for k in wordlist]