Example #1
from lingpy.sequence.sound_classes import tokens2morphemes
from nose.tools import assert_raises


def test_tokens2morphemes():

    seq1 = "t i a o ¹ b u ² d a o".split(' ')
    seq2 = "t i a o ¹ + b u ² # d a o".split(' ')
    seq3 = "t i a o ¹ b u _ d a o".split(' ')
    seq4 = "t i a o murks w a o".split(' ')

    assert len(tokens2morphemes(seq1)) == 3
    assert len(tokens2morphemes(seq2)) == 3
    assert len(tokens2morphemes(seq3)) == 2
    assert len(tokens2morphemes(seq4, sep='murks')) == 2
    assert_raises(ValueError, tokens2morphemes, "t i a o")
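The first assert in isolation, as a runnable sketch (the import path and the exact output shape are my reading of lingpy's tests, not its documented API):

from lingpy.sequence.sound_classes import tokens2morphemes

# tone letters close a morpheme by default, so this yields three morphemes,
# roughly [['t', 'i', 'a', 'o', '¹'], ['b', 'u', '²'], ['d', 'a', 'o']]
print(tokens2morphemes("t i a o ¹ b u ² d a o".split(' ')))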
Example #3
import pytest

from lingpy.sequence.sound_classes import tokens2morphemes


def test_tokens2morphemes():
    seq1 = "t i a o ¹ b u ² d a o".split(' ')
    seq2 = "t i a o ¹ + b u ² # d a o".split(' ')
    seq3 = "t i a o ¹ b u _ d a o".split(' ')
    seq4 = "t i a o murks w a o".split(' ')

    assert len(tokens2morphemes(seq1)) == 3
    assert len(tokens2morphemes(seq2)) == 3
    assert len(tokens2morphemes(seq3)) == 2
    assert len(tokens2morphemes(seq4, sep='murks')) == 2
    assert len(tokens2morphemes(seq1, split_on_tones=False)) == 1
    with pytest.raises(ValueError):
        tokens2morphemes("t i a o")
    assert len(tokens2morphemes(list("b++t"))) == 2
Example #4
import lingpy
from lingpy.sequence.sound_classes import tokens2class, tokens2morphemes

# `parse_chinese_morphemes` is a helper defined in the surrounding module


def get_structure(word,
                  sep='+',
                  zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(word,
                                 expand_nasals=True,
                                 merge_vowels=False,
                                 semi_diacritics=semi_diacritics)

    # check for unknown chars
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return []

    # get the morphemes
    if sep in word:
        words = tokens2morphemes(word, cldf=True)
        morphemes = []
        for w in words:
            morphemes += tokens2morphemes(w, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)
    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            # without this continue, a failed parse falls through and yields
            # a stale or undefined `segments` below
            continue
        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments)
                    if x[1] != '-'], morpheme)
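A hypothetical call (the input form is made up; the slot letters i/m/n/c/t presumably stand for initial, medial, nucleus, coda and tone, as implied by the zip with 'imnct'):

# two morphemes separated by '+'; passing a pre-segmented list skips the
# ipa2tokens branch
for structure in get_structure('tʂʰ w a n ⁵¹ + m e n ³⁵'.split(), zipped=True):
    print(structure)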
Example #5
    def test_tokens2morphemes(self):
        seq1 = "t i a o ¹ b u ² d a o".split(' ')
        seq2 = "t i a o ¹ + b u ² # d a o".split(' ')
        seq3 = "t i a o ¹ b u _ d a o".split(' ')
        seq4 = "t i a o murks w a o".split(' ')

        assert len(tokens2morphemes(seq1)) == 3
        assert len(tokens2morphemes(seq2)) == 3
        assert len(tokens2morphemes(seq3)) == 2
        assert len(tokens2morphemes(seq4, sep='murks')) == 2
        assert len(tokens2morphemes(seq1, split_on_tones=False)) == 1
        assert_raises(ValueError, tokens2morphemes, "t i a o")
        assert len(tokens2morphemes(list("b++t"))) == 2
Example #6
import networkx as nx

from lingpy import Wordlist
from lingpy.sequence.sound_classes import tokens2morphemes

wl = Wordlist('../tsv/burmish.tsv')

G = nx.Graph()
txt = ''
for taxon in wl.taxa:

    M = {}
    data = wl.get_dict(taxon=taxon, flat=True)
    words = []
    syllables = []
    for c, idxs in data.items():

        for idx in idxs:
            tokens = wl[idx, 'tokens']
            words += [' '.join(tokens)]
            morphemes = tokens2morphemes(tokens, output='nested')
            for i, morpheme in enumerate(morphemes):
                syllables += [' '.join(morpheme)]
                try:
                    M[' '.join(morpheme)] += [(c, idx, i)]
                except KeyError:
                    M[' '.join(morpheme)] = [(c, idx, i)]

    # add morpheme information
    for m, v in M.items():
        for i, (c1, idx1, pos1) in enumerate(v):
            for j, (c2, idx2, pos2) in enumerate(v):
                if i < j:
                    try:
                        G.edge[c1][c2]['weight'] += 1
                        G.edge[c1][c2]['words'] += [(idx1, pos1, idx2, pos2)]
                    except KeyError:
                        # the snippet breaks off mid-try in the source; a
                        # plausible completion creates the edge on first
                        # co-occurrence (networkx 1.x `G.edge` API)
                        G.add_edge(c1, c2, weight=1,
                                   words=[(idx1, pos1, idx2, pos2)])
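Note that G.edge is the networkx 1.x accessor and was removed in networkx 2.0; under 2.x the try/except above becomes roughly this sketch:

# networkx >= 2.0 equivalent of the edge update
if G.has_edge(c1, c2):
    G[c1][c2]['weight'] += 1
    G[c1][c2]['words'].append((idx1, pos1, idx2, pos2))
else:
    G.add_edge(c1, c2, weight=1, words=[(idx1, pos1, idx2, pos2)])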
Example #8
def prepare(ds):

    # steps:
    # parse characters (numbers, zeros)
    # check for number
    # recreate partial cognate identifiers
    # create strict cognate identifiers
    # encode everything as a CLDF-like file
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)
        if len(tokens) != len(benzi):
            print(doculect, glossid, benzi, tokens)
        pids = []
        for char in benzi:
            if char == '囗':
                pids += [str(pidx)]
                pidx += 1
            else:
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)

        # retrieve correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id, concept.attributes['chinese'],
            concept.attributes['pageno'], concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'
    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        # bind i explicitly; add_entries applies the function immediately,
        # but the default argument guards against late binding
        words.add_entries(entry, concepts, lambda x, i=i: x[i])
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])

    # determine order of entries
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))

    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')
    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')
    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'),
                     segments='segments',
                     ref='cogids',
                     alignment='alignments')
    alm.align()
    alm.output('tsv',
               filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept', 'concepticon_id',
                   'chinese', 'pinyin', 'benzi', 'benzi_in_source', 'value',
                   'form', 'segments', 'cogid', 'cogids', 'note', 'source',
                   'beida_id', 'page', 'order', 'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)
    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments', 'Order',
            'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin', 'value',
                    'form', 'segments', 'cogid', 'cogids', 'alignments',
                    'order', 'beida_id', 'page', 'benzi', 'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                if '"' in value:
                    value = value.replace('"', '""')
                if ',' in value:
                    value = '"' + value + '"'
                out += [value]
            f.write(','.join(out) + '\n')
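The manual quoting above (double inner quotes, wrap fields containing commas) is what the standard library's csv module does automatically; a sketch of the same export with csv.writer, where header and entries stand in for the two literal lists above:

import csv

with open('cldf/beijingdaxue1964.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)        # the ID/Language_name/... list above
    for idx in words:
        row = [str(idx)]
        for entry in entries:      # the doculect/glottolog/... list above
            value = words[idx, entry]
            if isinstance(value, list):
                value = ' '.join(str(x) for x in value)
            row.append(str(value))
        writer.writerow(row)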
Example #9
def prepare_old2(ds):

    converter = {
        '豬肉': '肉',
        '豬艤': '艤',
        '! □水': '口水',
        '! 一□水': '一口水',
        '星〔星兒〕': '星',
        "一串兒葡萄": "一串葡萄",
        "一小片兒草": "一小片草",
        "一串兒葡萄": "一串葡萄",
        "一抓兒葡萄": "一抓葡萄",
        "手套兒": "手套",
        "茄兒如": "茄如",
        "前兒日": "前日",
        "前兒個": "前個",
        "明兒個": "明個",
        "明兒個": "明個",
        "今兒個": "今個",
        "今兒日": "今日",
        "黃花兒魚": "黃花魚",
        "大前兒個": "大前個",
        "大前兒日": "大前日",
        "大後兒個": "大後個",
    }
    bad_list = []
    visited = []
    inv = ds.sounds
    words = Wordlist(ds.raw('words-2017-06-16.tsv'))
    weilist = []
    pids = {}
    pidx = 1
    characters, partialcogs = {}, {}
    blacklist = []
    for idx, bid, segments, chars, note in iter_rows(words, 'beida_id',
                                                     'segments', 'hanzi',
                                                     'note'):
        if 'ignore' in note:
            blacklist += [idx]
        else:
            ochars = chars
            chars = converter.get(chars, chars)
            chars = re.sub('〔[^〕]+〕', '', chars)
            chars = re.sub('<[^>]+>', '', chars)
            chars = chars.replace('□', '囗')
            chars = chars.replace('?', '')
            chars = ''.join(
                [c for c in chars.split(',')[0] if sp.is_chinese(c)])
            tks = tokens2morphemes(segments)
            partials = []
            if len(tks) == len(chars):
                for char in chars:
                    if char in pids and char != '囗':
                        partials += [str(pids[char])]
                    else:
                        pids[char] = pidx
                        pidx += 1
                        partials += [str(pids[char])]
            else:
                if chars.endswith('兒'):
                    if len(chars) - 1 == len(tks):
                        for char in chars[:-1]:
                            if char in pids and char != '囗':
                                partials += [str(pids[char])]
                            else:
                                pids[char] = pidx
                                pidx += 1
                                partials += [str(pids[char])]
                    else:
                        for tk in tks:
                            partials += [str(pidx)]
                            pidx += 1
                        bad_list += [idx]
                        print(len(bad_list), chars, len(tks), bid)
                elif not chars:
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '?' + chars
                elif '囗' in chars:
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '!' + chars
                else:
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    bad_list += [idx]
                    print(len(bad_list), ochars, '|', '\t|', chars, len(tks),
                          bid)
                    chars = ':' + chars
            characters[idx] = chars
            partialcogs[idx] = ' '.join(partials)
    print(len(weilist))
    words.output('tsv',
                 filename=ds.raw('words.tmp'),
                 subset=True,
                 rows=dict(ID='not in ' + str(blacklist)))
    words = Wordlist(ds.raw('words.tmp.tsv'))
    words.add_entries('benzi', characters, lambda x: x)
    words.add_entries('cogids', partialcogs, lambda x: x)
    ds.write_wordlist(words)
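The 兒-branch above is worth isolating: a trailing 兒 (the erhua suffix) is written as its own character but does not surface as a separate syllable, so the code skips it when the character count exceeds the syllable count by one. A toy sketch with made-up values:

chars = '手套兒'
tks = ['ʂ o u ²¹⁴', 'tʰ a u ⁵¹']    # two syllables, three characters
if chars.endswith('兒') and len(chars) - 1 == len(tks):
    chars = chars[:-1]              # the suffix fuses into the final syllable
print(chars)                        # -> '手套'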
Example #10
        if tmp not in cogids:
            cogids[tmp] = cidx
            cidx += 1
for idx in wl:
    cogids_ = wl[idx, 'cogids']
    print(wl[idx, 'concept'], cogids_)
    wl[idx, 'cogids'] = [
        cogids[wl[idx, 'concept'] + '-' + str(x)] for x in cogids_
    ]
    print(cogids_, wl[idx, 'cogids'])

for idx, cogids, tokens, doculect, concept in iter_rows(
        wl, 'cogids', 'tokens', 'doculect', 'concept'):
    print(idx, doculect, concept, tokens)
    cls = tokens2class(tokens, 'cv')
    morphemes = tokens2morphemes(tokens, sep="+")
wl.output('tsv', filename=burmish_path('burmish-proto'))
alm = Alignments(burmish_path('burmish-proto.tsv'),
                 ref='cogids',
                 alignment='alignments')
etd = alm.get_etymdict(ref='cogids')
for k, vals in etd.items():
    idxs = [v[0] for v in vals if v]
    concept = alm[idxs[0], 'concept']
    proto_concepts[concept].add(k)

alm.align()
cons = alm.get_consensus(ref='cogids',
                         counterpart='tokens',
                         return_data=True,
                         gaps=True)
proto = {}
idx = max(wl)+1
for concept, vals in proto_concepts.items():
Example #11
wl = Wordlist('../tsv/burmish.tsv')

G = nx.Graph()
txt = ''
MO = {}
for taxon in wl.taxa:

    M = {}
    data = wl.get_dict(taxon=taxon, flat=True)
    words = []
    syllables = []
    for c, idxs in data.items():

        for idx in idxs:
            tokens = wl[idx, 'tokens']
            words += [' '.join(tokens)]
            morphemes = tokens2morphemes(tokens, output='nested')
            for i, morpheme in enumerate(morphemes):
                syllables += [' '.join(morpheme)]
                try:
                    M[' '.join(morpheme)] += [(c, idx, i)]
                except KeyError:
                    M[' '.join(morpheme)] = [(c, idx, i)]
    MO[taxon] = M
    # add morpheme information
    for m, v in M.items():
        for i, (c1, idx1, pos1) in enumerate(v):
            for j, (c2, idx2, pos2) in enumerate(v):
                if i < j:
                    try:
                        G.edge[c1][c2]['weight'] += 1
                        G.edge[c1][c2]['words'] += [(idx1, pos1, idx2, pos2)]
                    except KeyError:
                        # truncated mid-try in the source; plausible
                        # completion as in Example #6 above
                        G.add_edge(c1, c2, weight=1,
                                   words=[(idx1, pos1, idx2, pos2)])
Example #13
from collections import defaultdict

from lingpy.sequence.sound_classes import prosodic_string, tokens2morphemes

# `pattern_consensus` and `_scorer` are helpers defined in the surrounding
# module


def cv_templates(wordlist,
                 language,
                 segments='tokens',
                 converter=None,
                 cutoff=0.1,
                 output='markdown',
                 examples=3,
                 scoredict=None,
                 splitter=False):
    """Create CV templates from wordlist data."""
    templates = defaultdict(list)
    idxs = wordlist.get_list(col=language, flat=True)
    sounds = defaultdict(list)

    def str_(list_):
        return ', '.join([' '.join(l) for l in list_[:examples]])

    if not converter:
        converter = lambda x: prosodic_string(x, _output='CcV')
    scoredict = scoredict or _scorer()
    if not splitter:
        splitter = lambda x: filter(None, tokens2morphemes(x))

    for idx in idxs:
        segs = wordlist[idx, segments]
        for word in splitter(segs):
            cv = converter(word)
            templates[cv] += [word]
            for sound, symbol in zip(word, cv):
                sounds[sound, symbol] += [word]

    # retrieve percentile
    lengths = sum([len(v) for v in templates.values()])
    perc = lengths - (cutoff * lengths)

    patterns, ignored = [], []
    score = 0
    for k, v in sorted(templates.items(),
                       key=lambda x: len(x[1]),
                       reverse=True):
        l = len(v)
        if score + l > perc:
            ignored += [[k, l, v]]
        else:
            patterns += [[k, l, v]]
        score += l

    # compute pattern consensus
    consensus = pattern_consensus([list(p[0]) for p in patterns], scoredict)

    # extract initials
    sound_table = []
    for k, v in sorted(sounds.items(), key=lambda x: (x[0][1], len(x[1]))):
        sound_table += [(k[0], k[1], len(v), v)]

    if output == 'markdown':
        out = 'Pattern | Frequency | Examples\n --- | --- | --- \n'
        score = 0
        for i, (p, l, v) in enumerate(patterns):
            out += '{0:15} | {1:5} | {2}\n'.format(p, l, str_(v))
            score += l
        count = 1
        out += '\nSound | Context | Frequency | Examples\n --- | --- | --- | --- \n'
        for sound, context, l, vals in sound_table:
            out += '{0} | {1} | {2} | {3} \n'.format(sound, context, l,
                                                     str_(vals))

        out += '\n* **coverage:** {0} out of {1} patterns in the data\n'.format(
            score, lengths)
        out += '* **pattern consensus:** {0}\n'.format(' '.join(consensus))
        return out

    return patterns, ignored, sound_table
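A hypothetical call, assuming wl is a lingpy Wordlist with a 'tokens' column and 'Burmese' one of its doculects:

from lingpy import Wordlist

wl = Wordlist('burmish.tsv')   # made-up input file
print(cv_templates(wl, 'Burmese', cutoff=0.1, output='markdown'))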