def test_ipa2tokens():

    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'

    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'

    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'

    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'

    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far, but only on cases where
    # tokenization doesn't require the merge_vowels = False flag
    tokens = csv2list(test_data('test_tokenization.tsv'))

    for a, b in tokens:

        tks = ' '.join(ipa2tokens(a))

        # the default tokenization must match the expected output exactly
        assert tks == b

    # now test on smaller set with unmerged vowels
    tokens = csv2list(test_data('test_tokenization_mv.tsv'))

    for a, b in tokens:

        tks = ' '.join(ipa2tokens(a, merge_vowels=False,
                                  merge_geminates=False))

        # with merging switched off, the result must still match the expected
        # tokenization exactly
        assert tks == b

    tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
    for a, b in tokens:
        tks = ' '.join(
            ipa2tokens(a,
                       merge_vowels=True,
                       merge_geminates=True,
                       expand_nasals=True,
                       semi_diacritics='h'))
        assert tks == b
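These test snippets assume a small amount of setup that the excerpt does not show; a minimal sketch, assuming a local test-data directory (the test_data helper and its layout are assumptions, not part of lingpy), could look like this:

import os

from lingpy import csv2list, ipa2tokens


def test_data(fname):
    # resolve a fixture file inside a local test-data directory (assumed layout)
    return os.path.join(os.path.dirname(__file__), 'test_data', fname)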
Example #2
def test_ipa2tokens():

    seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'

    assert len(ipa2tokens(seq)) != len(list(seq))

    seq = 'ʰto͡i'

    assert len(ipa2tokens(seq)) == 2

    seq = 'th o x t a'

    assert len(ipa2tokens(seq)) == len(seq.split(' '))

    seq = '# b l a #'

    assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

    # now check with all possible data we have so far
    tokens = csv2list(test_data('test_tokenization.csv'))

    for a, b in tokens:

        tks1 = ' '.join(ipa2tokens(a))
        tks2 = ' '.join(ipa2tokens(a, merge_vowels=False))

        # we check for two variants, since we don't know whether vowels are
        # merged or not in the test data
        assert tks1 == b or tks2 == b
Example #5
def get_unihan():
    _unihan = lingpy.csv2list(_path('Unihan_Readings.txt'))
    unihan = defaultdict(dict)
    for line in _unihan:
        # interpret the raw key as a Python string literal, so that escape
        # sequences such as \uXXXX become the actual characters
        key = eval('"""' + line[0] + '"""')
        if line[1] == 'kHanyuPinyin':
            unihan[key]['pinyin'] = line[2].split(':')[1]
        else:
            unihan[key][line[1][1:].lower()] = line[2]
    return unihan
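A hypothetical lookup against the mapping returned by get_unihan could look as follows; the character key is purely illustrative and depends on how Unihan_Readings.txt was preprocessed:

# illustrative usage; the key shown is an assumption about the file's contents
unihan = get_unihan()
entry = unihan.get('㐀', {})
# non-pinyin fields are stored under the field name without the leading 'k', lower-cased
print(entry.get('pinyin'), entry.get('mandarin'))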
Example #6
def get_transformer(profile, exception=None):
    
    profile = lp.csv2list(cddb_path('profiles', profile), strip_lines=False)
    for i, line in enumerate(profile):
        profile[i] = [unicodedata.normalize('NFD', clpa.normalize(x)) for x in line]
    tokenizer = Tokenizer(profile, errors_replace=lambda x: "«{0}»".format(x))
    
    return lambda x, y: unicodedata.normalize(
            'NFC',
            tokenizer.transform(clpa.normalize(x), column=y, separator=' + ')
            )
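A hypothetical call of the returned function might look like this; the profile file name, the input string, and the 'IPA' column name are assumptions for illustration only:

# sketch of how the transformer could be used (all names are assumed)
transform = get_transformer('Chinese_Dialects.tsv')
print(transform('tsʰau³¹', 'IPA'))  # NFC-normalized output with ' + ' between units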
Example #7
def get_ids():
    _ids = lingpy.csv2list(_path('ids.txt'))
    ids = {}
    for line in _ids:
        char = line[1]
        motivations = line[2:]
        for motivation in motivations:
            if '[' in motivation:
                motivation = motivation[:motivation.index('[')]
            ids[motivation] = char
    return ids
Example #8
def inventories(ds):
    data = csv2list(ds.raw('inv.tsv'))
    header = data[0]
    invs = {l: [] for l in ds.languages}
    for i, line in enumerate(data[1:]):
        stype, sis, ipa, struc = line[1:5]
        if len(struc.split()) != len(ipa.split()):
            print(i + 2, 'warn', struc, '  |  ', ipa)
        for l, n in zip(header[5:], line[5:]):
            if n:
                note = '' if n == 'X' else n
                invs[l] += [[sis, ipa, struc, stype, note]]
    ds.write_inventories(invs)
Example #9
def prepare(dataset):

    data = lingpy.csv2list(dataset.get_path('raw', 'data-starostin.tsv'),
                           strip_lines=False)
    header = [h.lower() for h in data[0]]
    out = {}
    idx = 1
    for line in data[1:]:
        char = line[0]
        coc = line[2]
        bijiang = line[1]
        note = line[3]
        dali = line[4]
        doc_url = line[5]
        lhc = line[7]
        gloss = line[8]
        jianchuan = line[12]
        kg = line[14]
        mch = line[16]
        pinyin = line[18]
        rad = line[20]
        shijing = line[21]

        if coc.strip():
            out[idx] = [
                char, pinyin, 'Old_Chinese', 'Classical Old Chinese', coc, rad,
                kg[:4], kg, gloss
            ]
            idx += 1
        if lhc.strip():
            out[idx] = [
                char, pinyin, 'Late_Han_Chinese', 'Eastern Han Chinese', lhc,
                rad, kg[:4], kg, gloss
            ]
            idx += 1
        if mch.strip():
            out[idx] = [
                char, pinyin, 'Middle_Chinese', 'Middle Chinese', mch, rad,
                kg[:4], kg, gloss
            ]
            idx += 1
    out[0] = [
        'character', 'pinyin', 'doculect', 'doculect_in_source', 'reading',
        'semantic_class', 'phonetic_class', 'karlgren_id', 'gloss'
    ]
    dataset.write_wordlist(lingpy.Wordlist(out, row='character'), 'characters')
Example #10
    def cmd_makecldf(self, args):
        wl = lingpy.Wordlist(self.raw_dir.joinpath("chinese.tsv").as_posix())
        maxcogid = 0

        args.writer.add_sources()
        args.writer.add_languages(id_factory=lambda l: l["Name"])
        args.writer.add_concepts(
            id_factory=lambda c: slug(c.label, lowercase=False))

        # store list of proto-form to cognate set
        p2c = {}

        for k in wl:
            for row in args.writer.add_lexemes(
                    Language_ID=wl[k, "doculect"],
                    Parameter_ID=slug(wl[k, "concept"], lowercase=False),
                    Value=wl[k, "ipa"],
                    Source="Hamed2006",
                    Cognacy=wl[k, "COGID"],
            ):
                args.writer.add_cognate(lexeme=row,
                                        Cognateset_ID=wl[k, "cogid"],
                                        Source=["Hamed2006", "List2015"])
            maxcogid = max([maxcogid, int(wl[k, "cogid"])])
            p2c[wl[k, "concept"], wl[k, "proto"]] = wl[k, "cogid"]
        idx = max([k for k in wl]) + 1
        for line in lingpy.csv2list(
                self.raw_dir.joinpath("old_chinese.csv").as_posix()):
            for val in line[1].split(", "):
                cogid = p2c.get((line[0], val))
                if not cogid:
                    maxcogid += 1
                    cogid = p2c[line[0], val] = maxcogid
                for row in args.writer.add_lexemes(
                        Language_ID="OldChinese",
                        Parameter_ID=slug(line[0], lowercase=False),
                        Value=val,
                        Source="Hamed2006",
                        Cognacy=p2c.get(val, val),
                ):
                    args.writer.add_cognate(lexeme=row,
                                            Cognateset_ID=cogid,
                                            Source=["Hamed2006", "List2015"])
                idx += 1
Example #11
def write_map(varieties, outfile):
    languages = lp.csv2list(varieties)
    colors = ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
            '#040404', '#F6E3CE', '#81F79F', '#8A0808', '#FA58F4', '#0489B1',
            '#088A08']
    points = []
    header = [x.strip() for x in languages[0]]
    nidx = header.index('NAME')
    latidx = header.index('LATITUDE')
    lonidx = header.index('LONGITUDE')
    pinidx = header.index('PINYIN')
    hanidx = header.index('HANZI')
    groupidx = header.index("SUBGROUP")
    pinidx = header.index("PINYIN")
    famidx = header.index('FAMILY')

    groups = sorted(set([line[groupidx] for line in languages[1:]]))
    for line in languages[1:]:
        name = line[nidx]
        pinyin = line[pinidx]
        hanzi = line[hanidx]
        lat, lon = line[latidx], line[lonidx]
        group = line[groupidx]
        family = line[famidx]
        if lat.strip() and lat != '?':
            lat, lon = float(lat), float(lon)
            if lat > 400 or lon > 400:
                raise ValueError("Coords for {0} are wrong.".format(name))
            point = geojson.Point((lon, lat))
            feature = geojson.Feature(
                geometry=point,
                properties={
                    "Family": family,
                    "Variety": name,
                    "Pinyin": pinyin,
                    "Chinese": hanzi,
                    "Group": group,
                    "marker-color": colors[groups.index(group)]
                })
            points += [feature]
    with open(outfile, 'w') as f:
        f.write(json.dumps(geojson.FeatureCollection(points)))
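write_map relies on module-level imports and the lp alias that the excerpt does not show; a minimal, assumed setup and call would be (the file names below are illustrative):

# assumed imports for write_map; the file names are placeholders
import json

import geojson
import lingpy as lp

write_map('varieties.tsv', 'varieties.geojson')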
Example #12
def cldf(dataset, concepticon, **kw):
    wl = lp.Wordlist(dataset.raw.joinpath(DSET).as_posix())
    gcode = {x['NAME']: x['GLOTTOCODE'] for x in dataset.languages}
    ccode = {x.english: x.concepticon_id for x in
             dataset.conceptlist.concepts.values()}
    src = getEvoBibAsSource(SOURCE)
    src2 = getEvoBibAsSource('List2015d')

    with CldfDataset((
        'ID',
        'Language_ID',
        'Language_name',
        'Language_iso',
        'Parameter_ID',
        'Parameter_name',
        'Value',
        'Source',
        'Cognacy',
        ), dataset) as ds:
        
        ds.sources.add(src, src2)

        # store list of proto-form to cognate set
        p2c = {}

        for k in wl:
            ds.add_row([
                '{0}-{1}'.format(SOURCE, k),
                gcode[wl[k, 'doculect']],
                wl[k, 'doculect'],
                '',
                ccode[wl[k, 'concept']],
                wl[k, 'concept'],
                wl[k, 'ipa'],
                SOURCE,
                wl[k, 'COGID']
            ])
            dataset.cognates += [[
                '{0}-{1}'.format(SOURCE, k),
                ds.name,
                wl[k, 'ipa'],
                '-'.join([slug(wl[k, 'concept']), str(wl[k, 'cogid'])]),
                '', 
                'expert',
                SOURCE,
                '',
                '',
                ''
            ]]
            p2c[wl[k, 'proto']] = wl[k, 'cogid']
        idx = max([k for k in wl]) + 1
        for line in lp.csv2list(dataset.raw.joinpath('old_chinese.csv').as_posix()):
            for val in line[1].split(', '):
                ds.add_row((
                    '{0}-{1}'.format(SOURCE, idx),
                    'sini1245',
                    'Old Chinese',
                    '',
                    ccode[line[0]],
                    line[0],
                    val,
                    SOURCE,
                    p2c.get(val, val)
                ))
                dataset.cognates += [[
                    '{0}-{1}'.format(SOURCE, idx),
                    ds.name,
                    val,
                    '-'.join([slug(line[0]), text_type(p2c.get(val, val))]),
                    '',
                    'expert',
                    SOURCE,
                    '',
                    '',
                    '']]
                idx += 1
Example #13
    def cmd_install(self, **kw):
        data = json.load(open(self.raw.posix('data.json')))
        wl = lingpy.Wordlist(self.raw.posix('sino-tibetan-raw.tsv'))
        profile = {
            l[0]: l[1]
            for l in lingpy.csv2list(self.raw.posix('profile.tsv'))
        }
        for idx, tokens in pb(wl.iter_rows('tokens'), desc='tokenize'):
            tks = []
            for token in tokens:
                tks += profile.get(
                    token,
                    profile.get(
                        token.split('/')[1] if '/' in token else token,
                        token)).split(' ')
            wl[idx, 'tokens'] = [
                x.strip() for x in tks if x != 'NULL' and x.strip()
            ]

        with self.cldf as ds:
            ds.add_sources(*self.raw.read_bib())
            ds.add_languages()
            for c in self.conceptlist.concepts.values():
                ds.add_concept(ID=c.concepticon_id,
                               TBL_ID=c.attributes['huang_1992_1820'],
                               Name=c.english,
                               Coverage=c.attributes['coverage'],
                               Concepticon_ID=c.concepticon_id)
            concept2id = {
                c.english: c.concepticon_id
                for c in self.conceptlist.concepts.values()
            }
            source_dict, langs_dict = {}, {}
            concept_dict = {}
            for l in self.languages:
                source_dict[l['Name']] = l['Source']
                langs_dict[l['Name']] = l['ID']

            wl.output('tsv',
                      filename=self.raw.posix('sino-tibetan-cleaned'),
                      subset=True,
                      rows={"ID": "not in " + str(data['blacklist'])})

            for k in pb(wl, desc='wl-to-cldf'):
                if wl[k, 'tokens']:
                    row = ds.add_form_with_segments(
                        Language_ID=langs_dict.get(data['taxa'].get(
                            wl[k, 'doculect'], wl[k, 'doculect'])),
                        Parameter_ID=concept2id[wl[k, 'concept']],
                        Value=wl[k, 'entry_in_source'].strip()
                        or ''.join(wl[k, 'tokens']) or wl[k, 'ipa'],
                        Form=wl[k, 'ipa'] or wl[k, 'entry_in_source']
                        or ''.join(wl[k, 'tokens']),
                        Segments=wl[k, 'tokens'],
                        Source=source_dict.get(data['taxa'].get(
                            wl[k, 'doculect'], wl[k, 'doculect'])).split(','),
                        Comment=wl[k, 'note'],
                        Cognacy=wl[k, 'cogid'],
                        Loan=True if wl[k, 'borrowing'].strip() else False)

                    cid = slug(wl[k, 'concept']) + '-' + '{0}'.format(
                        wl[k, 'cogid'])
                    ds.add_cognate(lexeme=row,
                                   Cognateset_ID=cid,
                                   Source='Sagart2018',
                                   Alignment='',
                                   Alignment_Source='')
Example #14
def download(dataset):
    print(HEADER)
    chars = load_characters()
    charset = []
    for k in chars:
        if chars[k, 'source'] in ('Baxter1992', 'Baxter2014'):
            charset += [chars[k, 'character']]
    print('[Loaded Characters]')
    for char in charset:
        _tmp = [
            x[0] for x in lingpy.csv2list(dataset.get_path(
                'raw', 'data-starostin.tsv'),
                                          strip_lines=True) if x
        ]
        if not _tmp:
            out = open(dataset.get_path('raw', 'data-starostin.tsv'), 'w')
            out.write('CHARACTER' + '\t' + '\t'.join(HEADER) + '\n')
            out.close()
        _tmp += [
            x[0] for x in lingpy.csv2list(
                dataset.get_path('raw', 'data-missing.tsv'))
        ]
        if char not in _tmp:
            new_url = URL + parse.quote(char)
            print('[LOADING] ' + char + ' ' + new_url)
            try:
                req = request.urlopen(new_url)
                data = req.read().decode('utf-8')
                found = False
                tmp = {}
                for f in FIELDS:
                    d = re.findall(
                        '<span class="fld">' + f +
                        ':</span>.*?<span class="unicode">(.*?)</span>', data,
                        re.DOTALL)
                    print(f, d)
                    if d:
                        tmp[f] = d[0]
                        found = True
                    else:
                        tmp[f] = ''
                for l in LINKS:
                    d = re.findall(
                        '<span class="fld">' + l +
                        ':</span>.*?<a href="(.*?)"', data, re.DOTALL)
                    print(l, d)
                    tmp[l] = d[0] if d else ''

                if found:
                    print('Found character {0} reading: {1}'.format(
                        char, tmp['Modern .Beijing. reading']))
                    out = open(dataset.get_path('raw', 'data-starostin.tsv'),
                               'a')
                    out.write(char + '\t' + '\t'.join(
                        [tmp[h].strip().replace('\t', '')
                         for h in HEADER]) + '\n')
                    out.close()
                else:
                    print('Problem, ', len(data))
                    out = open(dataset.get_path('raw', 'data-missing.tsv'),
                               'a')
                    out.write(char + '\n')
                    out.close()
            except urllib.error.HTTPError:
                print('[ERROR IN LOADING URL]')
Example #15
# email    : [email protected]
# created  : 2014-03-11 15:50
# modified : 2014-03-11 15:50
"""
<++>
"""

__author__ = "Johann-Mattis List"
__date__ = "2014-03-11"


from lingpy import csv2list
import sqlite3


vals = csv2list('msa_taxa.csv')

conn = sqlite3.connect('../website_new/bdhl.de/data/data.sqlite3')
cursor = conn.cursor()
try:
    cursor.execute('drop table alignments;')
except sqlite3.OperationalError:
    # the table may not exist yet
    pass

cursor.execute('create table alignments(id int, file text, dataset text, sequence text, pid int, seqnum int, uniques int, taxa text);')

for line in vals:
    cursor.execute(
        'insert into alignments values(?,?,?,?,?,?,?,?);',
        tuple(line))

# make the inserted rows persistent
conn.commit()
conn.close()
Example #16
relations = dict(broader='narrower',
                 similar='similar',
                 sameas='sameas',
                 resultof='resultsin',
                 produces='producedby',
                 usedfor='requires',
                 consistsof='',
                 classof='instanceof',
                 intransitiveof='transitiveof',
                 baseof='hasform')
for k, v in list(relations.items()):
    if v and v != k:
        relations[v] = k

# load the concepticon to get the meta-data
_C = lingpy.csv2list('../concepticondata/concepticon.tsv')
C = {}
for line in _C[1:]:
    tmp = dict(zip([x.lower() for x in _C[0]], line))
    C[line[0]] = tmp

with open('../concepticondata/conceptrelations.tsv') as f:

    for line in f.readlines()[1:]:
        print(line.replace('\t', '-x-'))

        a, _a, b, c, _c = [x.strip() for x in line.split('\t')]

        if a and b and relations[b]:
            G.add_edge(a, c, relation=b)
            G.add_edge(c, a, relation=relations[b])
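The fragment above uses a graph object G and the lingpy module without showing where they come from; a minimal setup, assuming the graph is a networkx DiGraph, would be:

# assumed setup for the relation-graph snippet (networkx is an assumption)
import lingpy
import networkx as nx

G = nx.DiGraph()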
Example #18
            "ipa": 'e'
        },
        'ei': {
            "ipa": 'ei'
        },
        'k': {
            "ipa": 'k'
        },
        'p': {
            "ipa": 'p'
        }
    }
    out = segmentize('khaetphaeit', segments, debug=False, column='ipa')
    print(' '.join(out))

    segments = {
        k[0]: {
            'ipa': k[1],
            'structure': k[2]
        }
        for k in csv2list(_path('chinese.tsv'))
    }

    for word in [
            'khap55', 'khuang5', 'kai', 'kiAng', 'thang', 'pfhang35',
            'pfing44fu24', 'mao35tse35doŋ51'
    ]:
        print(' '.join(segmentize(word, segments, column='ipa')))
        print(' '.join(segmentize(word, segments, column='structure')))
        print(' ')