def prepare(ds):
    """Build a segmented wordlist from ``bds.tsv`` and write it through *ds*.

    For every row with a non-empty ``value``, the row's tokens are grouped
    into morphemes using the ``(start, end)`` index pairs returned by
    ``_get_slices``. The tokens of each morpheme are concatenated and the
    morphemes are joined with single spaces. That string is passed to
    ``ds.transform`` twice, once with ``'CLPA'`` and once with
    ``'Structure'``, and both results are stored in the output row.

    Two conditions are reported on stdout and counted in a running error
    counter (the counter is only printed, never returned):

    * the CLPA and structure strings differ in their number of
      space-separated units;
    * the CLPA string contains ``'«'``.

    Rows are still written when either condition is reported.

    :param ds: object providing ``raw(name)``, ``transform(text, kind)`` and
        ``write_wordlist(wordlist)``, as used below.
    :return: ``None``; the result is handed to ``ds.write_wordlist``.
    """
    errs = 0
    wl = Wordlist(ds.raw('bds.tsv'))
    W = {}
    for k in wl:
        value = wl[k, 'value']
        if not value:
            # Rows without a value are left out of the output entirely.
            continue

        tokens = wl[k, 'tokens']
        doc = wl[k, 'doculect']

        # One concatenated string per morpheme, morphemes separated by spaces.
        morphemes = [''.join(tokens[a:b]) for a, b in _get_slices(tokens)]
        ipa = ' '.join(morphemes)
        clpa = ds.transform(ipa, 'CLPA')
        struc = ds.transform(ipa, 'Structure')

        # Compare explicitly (not via ``assert``) so that the check still
        # runs under ``python -O`` and unrelated errors are not swallowed.
        if len(clpa.split(' ')) != len(struc.split(' ')):
            errs += 1
            print(errs, clpa, struc)

        # NOTE(review): '«' presumably marks a segment the CLPA transform
        # could not convert -- confirm against the transform's output format.
        if '«' in clpa:
            errs += 1
            print(errs, ipa, clpa, struc)

        W[k] = [
            doc,
            wl[k, 'concept'],
            wl[k, 'concepticon_id'],
            value,
            clpa,
            struc,
            wl[k, 'partial_ids'],
        ]

    # Key 0 holds the column names of the new wordlist.
    W[0] = [
        'doculect',
        'concept',
        'concepticon_id',
        'value',
        'segments',
        'structure',
        'cogids',
    ]
    ds.write_wordlist(Wordlist(W))
def get_inventories(wordlist, segments='tokens'):
    """Tabulate, per doculect, which sounds occur in which syllable slot.

    Every entry's segment list is cut into morphemes with ``_get_slices``
    and each morpheme is passed to ``sinopy.parse_chinese_morphemes``, which
    is expected to return exactly five values. These are recorded, in order,
    under the labels ``'initial'``, ``'medial'``, ``'nucleus'``, ``'final'``
    and ``'tone'``. Values equal to ``'-'`` are skipped. Each occurrence is
    stored as ``'<entry id>:<morpheme index>'``.

    :param wordlist: object exposing ``header``, ``taxa``,
        ``get_list(taxon=..., flat=True)`` and ``wordlist[idx, column]``.
    :param segments: name of the column holding the segmented forms.
    :return: list of tuples; the first is the header
        ``('ID', 'DOCULECT', 'CONTEXT', 'VALUE', 'OCCURRENCES', 'CROSSREF')``,
        followed by one row per (doculect, slot, sound) with a running ID
        starting at ``'1'``, the occurrence count, and the space-joined
        occurrence references.
    :raises AssertionError: if *segments* is not a column of *wordlist*.
    """
    assert segments in wordlist.header

    slots = ('initial', 'medial', 'nucleus', 'final', 'tone')
    D = {taxon: defaultdict(list) for taxon in wordlist.taxa}
    for taxon in wordlist.taxa:
        for idx in wordlist.get_list(taxon=taxon, flat=True):
            tokens = wordlist[idx, segments]
            # NOTE(review): progress/debug output kept as in the existing
            # behaviour; consider routing it through logging instead.
            print(' '.join(tokens))
            for jdx, (sA, sB) in enumerate(_get_slices(tokens)):
                # Strict five-way unpacking: a parser result of any other
                # length raises instead of being silently truncated.
                i, m, n, f, t = sinopy.parse_chinese_morphemes(tokens[sA:sB])
                pos = '{0}:{1}'.format(idx, jdx)
                for slot, sound in zip(slots, (i, m, n, f, t)):
                    if sound != '-':
                        D[taxon][slot, sound].append(pos)

    I = [('ID', 'DOCULECT', 'CONTEXT', 'VALUE', 'OCCURRENCES', 'CROSSREF')]
    idx = 1
    for taxon in wordlist.taxa:
        ordered = sorted(
            D[taxon].items(),
            key=lambda item: (item[0][0], item[0][1], len(item[1])),
        )
        for (slot, sound), occ in ordered:
            I.append((str(idx), taxon, slot, sound, len(occ), ' '.join(occ)))
            # Advance the running ID so that every row gets a unique one.
            idx += 1
    return I
def test__get_slices(self):
    """Check where the first slice ends, with and without tone splitting."""
    word = 'ba²te²'

    # Each call receives its own freshly built list of characters.
    split_slices = _get_slices(list(word))
    unsplit_slices = _get_slices(list(word), split_on_tones=False)

    # By default the first slice ends at index 3, right after the first '²'.
    first_split_end = split_slices[0][1]
    assert first_split_end == 3

    # With split_on_tones=False the first slice runs to index 6, the length
    # of the word.
    first_unsplit_end = unsplit_slices[0][1]
    assert first_unsplit_end == 6
# Collect the distinct morpheme parts found in 'bds.tsv' and print them,
# one per line, followed by a tab and the tag 'i' or 'f'.
from lingpy import *
from lingpy.compare.partial import _get_slices

wl = Wordlist('bds.tsv')
segments = set()

for key in wl:
    tokens = wl[key, 'tokens']

    # Skip entries whose tokens concatenate to an empty string.
    if not ''.join(tokens):
        continue

    for start, stop in _get_slices(tokens):
        morpheme = tokens[start:stop]
        cv_classes = tokens2class(morpheme, 'cv')

        if cv_classes[0].lower() != 'c':
            # The morpheme does not start with a 'c'-class token: the whole
            # morpheme is stored under the 'f' tag.
            segments.add((' '.join(morpheme), 'f'))
            continue

        # First token is tagged 'i', the remaining tokens are tagged 'f'.
        # The first token is a single item, so joining it puts a space
        # between its characters; spaces are removed again when printing.
        onset, rest = morpheme[0], morpheme[1:]
        segments.update([(' '.join(onset), 'i'), (' '.join(rest), 'f')])

for form, tag in segments:
    print(form.replace(' ', ''), '\t', tag)
def slice_word(word):
    """Lazily yield the sub-sequences of *word* delimited by ``_get_slices``.

    ``_get_slices(word)`` supplies ``(start, stop)`` index pairs; each pair
    is applied as ``word[start:stop]`` and the pieces are yielded in the
    order the pairs are returned.
    """
    yield from (word[start:stop] for start, stop in _get_slices(word))