import pytest

from lingpy.sequence.sound_classes import tokens2morphemes


def test_tokens2morphemes():
    seq1 = "t i a o ¹ b u ² d a o".split(' ')
    seq2 = "t i a o ¹ + b u ² # d a o".split(' ')
    seq3 = "t i a o ¹ b u _ d a o".split(' ')
    seq4 = "t i a o murks w a o".split(' ')
    assert len(tokens2morphemes(seq1)) == 3
    assert len(tokens2morphemes(seq2)) == 3
    assert len(tokens2morphemes(seq3)) == 2
    assert len(tokens2morphemes(seq4, sep='murks')) == 2
    assert len(tokens2morphemes(seq1, split_on_tones=False)) == 1
    with pytest.raises(ValueError):
        tokens2morphemes("t i a o")
    assert len(tokens2morphemes(list("b++t"))) == 2
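# For reference, a minimal usage sketch of the behavior the test asserts.
# Assumptions: tokens2morphemes is imported from lingpy.sequence.sound_classes
# as above, and tone numerals act as morpheme boundaries by default.
tokens = "t i a o ¹ b u ² d a o".split(' ')
for morpheme in tokens2morphemes(tokens):
    print(morpheme)
# expected: three token lists, one per tone-delimited syllable (the exact
# grouping of the tone marks depends on lingpy's tone handling)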
import lingpy
from lingpy.sequence.sound_classes import tokens2class, tokens2morphemes
from sinopy import parse_chinese_morphemes  # assumed: the sinopy package


def get_structure(word, sep='+', zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(word, expand_nasals=True,
                                 merge_vowels=False,
                                 semi_diacritics=semi_diacritics)

    # check for unknown characters
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return  # a bare return ends the generator

    # get the morphemes
    if sep in word:
        words = tokens2morphemes(word, cldf=True)
        morphemes = []
        for w in words:
            morphemes += tokens2morphemes(w, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)

    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            continue  # skip to the next morpheme if parsing failed
        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments) if x[1] != '-'],
                   morpheme)
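# A hedged usage sketch of get_structure; the input string is hypothetical.
# The structure labels 'i m n c t' stand for initial, medial, nucleus,
# coda, and tone.
for structure in get_structure('p i a o ⁵⁵ + m i n ³⁵'):
    print(structure)
# e.g. ['i', 'm', 'n', 't'] for an open syllable without a coda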
wl = Wordlist('../tsv/burmish.tsv')
G = nx.Graph()
txt = ''
for taxon in wl.taxa:
    M = {}
    data = wl.get_dict(taxon=taxon, flat=True)
    words = []
    syllables = []
    for c, idxs in data.items():
        for idx in idxs:
            tokens = wl[idx, 'tokens']
            words += [' '.join(tokens)]
            morphemes = tokens2morphemes(tokens, output='nested')
            for i, morpheme in enumerate(morphemes):
                syllables += [' '.join(morpheme)]
                try:
                    M[' '.join(morpheme)] += [(c, idx, i)]
                except KeyError:
                    M[' '.join(morpheme)] = [(c, idx, i)]
    # add morpheme information
    for m, v in M.items():
        for i, (c1, idx1, pos1) in enumerate(v):
            for j, (c2, idx2, pos2) in enumerate(v):
                if i < j:
                    try:
                        G.edge[c1][c2]['weight'] += 1
                        G.edge[c1][c2]['words'] += [(idx1, pos1, idx2, pos2)]
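# G.edge[...] is the NetworkX 1.x API; under NetworkX 2.x the same
# edge-update pattern would look roughly like this sketch with toy data:
import networkx as nx

G2 = nx.Graph()
c1, c2 = 'arm', 'hand'   # hypothetical concept pair
link = (1, 0, 2, 0)      # hypothetical (idx1, pos1, idx2, pos2) tuple
if G2.has_edge(c1, c2):
    G2[c1][c2]['weight'] += 1
    G2[c1][c2]['words'].append(link)
else:
    G2.add_edge(c1, c2, weight=1, words=[link])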
from lingpy import Alignments, Wordlist
from lingpy.basic.ops import iter_rows
from lingpy.compare.partial import Partial
from lingpy.sequence.sound_classes import tokens2morphemes
from pyconcepticon.api import Concepticon


def prepare(ds):
    # steps:
    # parse characters (numbers, zeros)
    # check for number
    # recreate partial cognate identifiers
    # create strict cognate identifiers
    # code everything as CLDF-like file
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)  # project-local helper
        if len(tokens) != len(benzi):
            print(doculect, glossid, benzi, tokens)
        pids = []
        for char in benzi:
            if char == '囗':
                # unknown character: always gets a fresh partial cognate id
                pids += [str(pidx)]
                pidx += 1
            else:
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)

        # retrieve the correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id,
            concept.attributes['chinese'],
            concept.attributes['pageno'],
            concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'

    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        # bind the loop variable as a default argument
        words.add_entries(entry, concepts, lambda x, i=i: x[i])
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])

    # determine the order of entries per concept
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))
    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')

    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')

    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'), segments='segments',
                     ref='cogids', alignment='alignments')
    alm.align()
    alm.output('tsv', filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept',
                   'concepticon_id', 'chinese', 'pinyin', 'benzi',
                   'benzi_in_source', 'value', 'form', 'segments', 'cogid',
                   'cogids', 'note', 'source', 'beida_id', 'page', 'order',
                   'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)

    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments',
            'Order', 'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin',
                    'value', 'form', 'segments', 'cogid', 'cogids',
                    'alignments', 'order', 'beida_id', 'page', 'benzi',
                    'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                # minimal CSV escaping: double the quotes, then quote any
                # field that contains a quote or a comma
                value = value.replace('"', '""')
                if '"' in value or ',' in value:
                    value = '"' + value + '"'
                out += [value]
            f.write(','.join(out) + '\n')
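# The partial cognate logic above gives each attested benzi character one
# shared id and every unknown-character slot (囗) a fresh id. A minimal
# self-contained sketch of that assignment, with hypothetical input:
partialids, pidx = {}, 1

def assign_pids(benzi):
    global pidx
    pids = []
    for char in benzi:
        if char == '囗':              # unknown: never matches anything else
            pids.append(str(pidx))
            pidx += 1
        elif char not in partialids:  # first sighting: register a new id
            partialids[char] = str(pidx)
            pidx += 1
            pids.append(partialids[char])
        else:                         # known character: reuse its id
            pids.append(partialids[char])
    return pids

print(assign_pids('石頭'))  # ['1', '2']
print(assign_pids('石囗'))  # ['1', '3']: 石 reuses id 1, 囗 gets a new one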
import re

from lingpy import Wordlist
from lingpy.basic.ops import iter_rows
from lingpy.sequence.sound_classes import tokens2morphemes


def prepare_old2(ds):
    # normalize spellings that would otherwise break the character count
    converter = {
        '豬肉': '肉',
        '豬艤': '艤',
        '! □水': '口水',
        '! 一□水': '一口水',
        '星〔星兒〕': '星',
        '一串兒葡萄': '一串葡萄',
        '一小片兒草': '一小片草',
        '一抓兒葡萄': '一抓葡萄',
        '手套兒': '手套',
        '茄兒如': '茄如',
        '前兒日': '前日',
        '前兒個': '前個',
        '明兒個': '明個',
        '今兒個': '今個',
        '今兒日': '今日',
        '黃花兒魚': '黃花魚',
        '大前兒個': '大前個',
        '大前兒日': '大前日',
        '大後兒個': '大後個',
    }
    bad_list = []
    visited = []
    inv = ds.sounds
    words = Wordlist(ds.raw('words-2017-06-16.tsv'))
    weilist = []
    pids = {}
    pidx = 1
    characters, partialcogs = {}, {}
    blacklist = []
    for idx, bid, segments, chars, note in iter_rows(
            words, 'beida_id', 'segments', 'hanzi', 'note'):
        if 'ignore' in note:
            blacklist += [idx]
        else:
            ochars = chars
            chars = converter.get(chars, chars)
            chars = re.sub('〔[^〕]+〕', '', chars)
            chars = re.sub('<[^>]+>', '', chars)
            chars = chars.replace('□', '囗')
            chars = chars.replace('?', '')
            # sp: the sinopy package, assumed to be imported as sp
            chars = ''.join(
                [c for c in chars.split(',')[0] if sp.is_chinese(c)])
            tks = tokens2morphemes(segments)
            partials = []
            if len(tks) == len(chars):
                for char in chars:
                    if char in pids and char != '囗':
                        partials += [str(pids[char])]
                    else:
                        pids[char] = pidx
                        pidx += 1
                        partials += [str(pids[char])]
            elif chars.endswith('兒'):
                # erhua suffix: the 兒 has no syllable of its own
                if len(chars) - 1 == len(tks):
                    for char in chars[:-1]:
                        if char in pids and char != '囗':
                            partials += [str(pids[char])]
                        else:
                            pids[char] = pidx
                            pidx += 1
                            partials += [str(pids[char])]
                else:
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    bad_list += [idx]
                    print(len(bad_list), chars, len(tks), bid)
            elif not chars:
                weilist += [idx]
                for tk in tks:
                    partials += [str(pidx)]
                    pidx += 1
                chars = '?' + chars
            elif '囗' in chars:
                weilist += [idx]
                for tk in tks:
                    partials += [str(pidx)]
                    pidx += 1
                chars = '!' + chars
            else:
                for tk in tks:
                    partials += [str(pidx)]
                    pidx += 1
                bad_list += [idx]
                print(len(bad_list), ochars, '|', '\t|', chars,
                      len(tks), bid)
                chars = ':' + chars
            characters[idx] = chars
            partialcogs[idx] = ' '.join(partials)
    print(len(weilist))
    words.output('tsv', filename=ds.raw('words.tmp'), subset=True,
                 rows=dict(ID='not in ' + str(blacklist)))
    words = Wordlist(ds.raw('words.tmp.tsv'))
    words.add_entries('benzi', characters, lambda x: x)
    words.add_entries('cogids', partialcogs, lambda x: x)
    ds.write_wordlist(words)
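# The character clean-up above can be checked in isolation; a small sketch
# with a hypothetical entry:
chars = '星〔星兒〕'
chars = re.sub('〔[^〕]+〕', '', chars)  # drop the bracketed variant reading
chars = chars.replace('□', '囗')         # normalize the unknown-char box
print(chars)  # -> '星'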
if tmp in cogids:
    pass
else:
    cogids[tmp] = cidx
    cidx += 1

for idx in wl:
    cogids_ = wl[idx, 'cogids']
    print(wl[idx, 'concept'], cogids_)
    wl[idx, 'cogids'] = [
        cogids[wl[idx, 'concept'] + '-' + str(x)] for x in cogids_
    ]
    print(cogids_, wl[idx, 'cogids'])

for idx, cogids, tokens, doculect, concept in iter_rows(
        wl, 'cogids', 'tokens', 'doculect', 'concept'):
    print(idx, doculect, concept, tokens)
    cls = tokens2class(tokens, 'cv')
    morphemes = tokens2morphemes(tokens, sep="+")

wl.output('tsv', filename=burmish_path('burmish-proto'))
alm = Alignments(burmish_path('burmish-proto.tsv'), ref='cogids',
                 alignment='alignments')
etd = alm.get_etymdict(ref='cogids')
for k, vals in etd.items():
    idxs = [v[0] for v in vals if v]
    concept = alm[idxs[0], 'concept']
    proto_concepts[concept].add(k)
alm.align()
cons = alm.get_consensus(ref='cogids', counterpart='tokens',
                         return_data=True, gaps=True)
proto = {}
idx = max(wl) + 1
for concept, vals in proto_concepts.items():
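# The first block above maps concept-local cognate ids to globally unique
# ones by keying them on '<concept>-<local id>'. A toy sketch of the effect:
cogids, cidx = {}, 1
rows = {1: ('arm', [1, 2]), 2: ('arm', [1]), 3: ('ash', [1])}  # toy data
for row_idx, (concept, local_ids) in sorted(rows.items()):
    for x in local_ids:
        tmp = concept + '-' + str(x)
        if tmp not in cogids:
            cogids[tmp] = cidx
            cidx += 1
print({i: [cogids[c + '-' + str(x)] for x in l]
       for i, (c, l) in sorted(rows.items())})
# -> {1: [1, 2], 2: [1], 3: [3]}; 'arm-1' and 'ash-1' stay distinct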
G = nx.Graph()
txt = ''
MO = {}
for taxon in wl.taxa:
    M = {}
    data = wl.get_dict(taxon=taxon, flat=True)
    words = []
    syllables = []
    for c, idxs in data.items():
        for idx in idxs:
            tokens = wl[idx, 'tokens']
            words += [' '.join(tokens)]
            morphemes = tokens2morphemes(tokens, output='nested')
            for i, morpheme in enumerate(morphemes):
                syllables += [' '.join(morpheme)]
                try:
                    M[' '.join(morpheme)] += [(c, idx, i)]
                except KeyError:
                    M[' '.join(morpheme)] = [(c, idx, i)]
    MO[taxon] = M
    # add morpheme information
    for m, v in M.items():
        for i, (c1, idx1, pos1) in enumerate(v):
            for j, (c2, idx2, pos2) in enumerate(v):
                if i < j:
                    try:
                        G.edge[c1][c2]['weight'] += 1
                        G.edge[c1][c2]['words'] += [(idx1, pos1, idx2, pos2)]
from collections import defaultdict

from lingpy.sequence.sound_classes import prosodic_string, tokens2morphemes


def cv_templates(wordlist, language, segments='tokens', converter=None,
                 cutoff=0.1, output='markdown', examples=3, scoredict=None,
                 splitter=False):
    """Create CV templates from wordlist data."""
    templates = defaultdict(list)
    idxs = wordlist.get_list(col=language, flat=True)
    sounds = defaultdict(list)

    def str_(list_):
        return ', '.join([' '.join(l) for l in list_[:examples]])

    if not converter:
        converter = lambda x: prosodic_string(x, _output='CcV')
    scoredict = scoredict or _scorer()  # _scorer: project-local helper
    if not splitter:
        splitter = lambda x: filter(None, tokens2morphemes(x))

    for idx in idxs:
        segs = wordlist[idx, segments]
        for word in splitter(segs):
            cv = converter(word)
            templates[cv] += [word]
            for sound, symbol in zip(word, cv):
                sounds[sound, symbol] += [word]

    # retrieve the percentile: keep the most frequent templates until the
    # cutoff share of the data is reached
    lengths = sum([len(v) for v in templates.values()])
    perc = lengths - (cutoff * lengths)
    patterns, ignored = [], []
    score = 0
    for k, v in sorted(templates.items(), key=lambda x: len(x[1]),
                       reverse=True):
        l = len(v)
        if score + l > perc:
            ignored += [[k, l, v]]
        else:
            patterns += [[k, l, v]]
            score += l

    # compute the pattern consensus (pattern_consensus: project-local helper)
    consensus = pattern_consensus([list(p[0]) for p in patterns], scoredict)

    # extract the sound table
    sound_table = []
    for k, v in sorted(sounds.items(), key=lambda x: (x[0][1], len(x[1]))):
        sound_table += [(k[0], k[1], len(v), v)]

    if output == 'markdown':
        out = 'Pattern | Frequency | Examples\n --- | --- | --- \n'
        score = 0
        for i, (p, l, v) in enumerate(patterns):
            out += '{0:15} | {1:5} | {2}\n'.format(p, l, str_(v))
            score += l
        count = 1
        out += '\nSound | Context | Frequency | Examples\n'
        out += ' --- | --- | --- | --- \n'
        for sound, context, l, vals in sound_table:
            out += '{0} | {1} | {2} | {3} \n'.format(sound, context, l,
                                                     str_(vals))
        out += '\n* **coverage:** {0} out of {1} patterns in the data\n'.format(
            score, lengths)
        out += '* **pattern consensus:** {0}\n'.format(' '.join(consensus))
        return out
    return patterns, ignored, sound_table
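# A hedged usage sketch for cv_templates, assuming a lingpy Wordlist with a
# 'tokens' column, a doculect named 'Burmese' (both hypothetical), and the
# project-local helpers _scorer and pattern_consensus in scope:
from lingpy import Wordlist

wl = Wordlist('some-wordlist.tsv')  # hypothetical input file
report = cv_templates(wl, 'Burmese', cutoff=0.1, output='markdown')
print(report)  # markdown tables of CV patterns, frequencies, and examples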