def wikibooks():
    """Parse the wikibooks Karlgren list in ``wikibooks.txt``.

    Writes one TSV row per Old-Chinese entry and per Middle-Chinese
    reading to ``karlgren.tsv`` and returns a ``defaultdict`` mapping
    character -> {reading -> [pinyin, reading, karlgren-id]}.
    """
    with open('wikibooks.txt') as f:
        data = f.readlines()

    out = []
    gsr = defaultdict(dict)
    for line in data:
        # normalize whitespace and drop parenthesized comments
        line = strip_brackets(line.strip().replace('\t', ' '),
                              brackets={'(': ')'})
        # only bullet lines ('* ...') carry entries
        if not line.startswith('*'):
            continue
        # make sure the bullet is separated from the first field
        # (length guard also avoids an IndexError on a bare '*' line)
        if len(line) < 2 or line[1] != ' ':
            line = line.replace('*', '* ')
        elms = line.split(' ')
        if len(elms) <= 1:
            continue
        # field 1 is 'schuessler/karlgren' or a single karlgren id
        kgsc = elms[1].split('/')
        if len(kgsc) == 1:
            schuessler, karlgren = '', kgsc[0]
        elif len(kgsc) == 2:
            schuessler, karlgren = kgsc
        else:
            # BUG FIX: previously fell through with karlgren/schuessler
            # unbound (NameError on first line) or stale from the
            # previous entry; skip the malformed line instead.
            print('[ERROR:schuessler/karlgren] {0}'.format(line))
            continue

        try:
            # the character sits after the '|' of a wiki link
            char = elms[2].split('|')[-1][0]
        except IndexError:
            print('[ERROR:character] {0}'.format(line))
            char = ''

        # remaining fields are Middle-Chinese readings; strip trailing commas
        mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
        if len(karlgren) not in [4, 5, 6]:
            # BUG FIX: the original format string only used {0}, silently
            # dropping the karlgren argument
            print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
        elif not sinopy.is_chinese(char):
            print('[ERROR:char] {0}'.format(line))
        elif char:
            pinyin = sinopy.pinyin(char)
            # sinopy signals failure with '?' or by echoing Chinese text
            if '?' in pinyin or sinopy.is_chinese(pinyin):
                pinyin = ''
            out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                     karlgren, '', 'Karlgren1954')]
            for reading in mch:
                out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                         reading, 'Wikibooks2016a')]
                gsr[char][reading] = [pinyin, reading, karlgren]

    with open('karlgren.tsv', 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')

    return gsr
Beispiel #2
0
def prepare(dataset):
    """Parse the raw wikibooks list into ``characters.tsv``.

    Reads ``raw/wikibooks.txt`` from *dataset*, extracts per-character
    Schuessler/Karlgren identifiers plus Middle-Chinese readings from
    bullet lines, and writes one TSV row per reading.
    """
    with open(dataset.get_path('raw', 'wikibooks.txt')) as f:
        data = f.readlines()
    out = []
    for i, line in enumerate(data):
        line = line.strip().replace('\t', ' ')
        # only bullet lines ('* ...') carry character entries
        if line.startswith('*'):
            # normalize '*foo' to '* foo' so split(' ') yields the fields
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                # field 1 is 'schuessler/karlgren' or a single schuessler id
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = kgsc[0]
                    karlgren = ''
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    # NOTE(review): falls through with karlgren/schuessler
                    # from the previous entry (unbound on the very first
                    # malformed line) — confirm this is intended
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))

                try:
                    # the character sits after the '|' of a wiki link
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''

                # remaining fields are Middle-Chinese readings; strip commas
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                #if len(karlgren) not in [4, 5, 6]:
                #    print('[ERROR:karlgren] {0}'.format(line, karlgren))
                if not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    # sinopy signals failure with '?' or by echoing Chinese
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', schuessler, karlgren,
                             '', 'Schuessler2009')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016b')]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
Beispiel #3
0
def prepare(dataset):
    """Convert the raw Shijing rhyme table (``O_shijing.tsv``) to characters.tsv.

    Each input row describes one rhyming character in a Shijing stanza; the
    output row keeps poem name/number, stanza, verse, rhyme class and the
    character's position inside the raw section text.
    """
    with UnicodeReader(dataset.get_path('raw', 'O_shijing.tsv'), delimiter='\t') as reader:
        data = list(reader)
    header = [h.lower() for h in data[0]]
    C = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'SHIJING_NAME',
        'SHJING_NUMBER', 'STANZA', 'VERSE', 'RHYME_CLASS', 'POSITION', 'TEXT',
        'ORDER', 'SOURCE'
        )]
    for line in data[1:]:
        # map each row onto the lower-cased header
        tmp = dict([(a, b.strip()) for a, b in zip(header, line)])

        poem = '·'.join((tmp['block'], tmp['chapter'], tmp['title']))
        poem_number = tmp['number']
        stanza = tmp['stanza']
        verse = tmp['verse']
        char = tmp['character']

        # get the position
        pos = str(tmp['raw_section'].index(char))
        text = tmp['raw_section'] + tmp['endchar']
        rhymeid = tmp['rhyme']
        pinyin = sinopy.pinyin(char)
        order = tmp['section_number']
        # sinopy signals failure with '?' or by echoing Chinese text
        if '?' in pinyin or sinopy.is_chinese(pinyin):
            pinyin = ''

        C += [[tmp['id'], char, pinyin, 'Old_Chinese', poem, poem_number, stanza,
            verse, rhymeid, pos, text, order, 'Baxter1992']]

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join(line)+'\n')
Beispiel #4
0
 def pinyin(self, chars):
     """Return the space-separated pinyin readings of every char in *chars*.

     A character whose reading is unknown (sinopy returns something with
     '?' in it, or echoes Chinese text back) contributes an empty string.
     """
     def _reading(char):
         p = sinopy.pinyin(char)
         return '' if '?' in p or sinopy.is_chinese(p) else p

     return ' '.join(_reading(char) for char in chars)
Beispiel #5
0
def character_list():
    """Merge the per-source ``characters.tsv`` tables into one master list.

    Collects every character reading from every source dataset, then writes
    ``datasets/characters.tsv``.  Characters attested *only* in Guangyun
    and/or Shuowen are skipped.
    """
    sources = get_sources('characters.tsv')
    master = defaultdict(list)

    # occs: which sources each character occurs in
    occs = defaultdict(list)
    doculects = set()
    for source in sources:
        print('[preparing]', source)
        ds = Dataset(source)
        for char in ds.characters.rows:
            occs[char] += [source]
            tmp = ds.characters.get_dict(row=char)
            readings = []
            for t, chars in tmp.items():
                for c in chars:
                    # (doculect, dataset-internal id, source) + column values
                    _data = (
                        t,
                        c,
                        source,
                    )
                    for h in [
                            'reading', 'fanqie', 'phonetic_class',
                            'semantic_class', 'rhyme_class',
                            'wordfamily_class', 'source'
                    ]:
                        if ds.characters[c, h]:
                            _data += (ds.characters[c, h], )
                        else:
                            _data += ('', )

                    readings += [_data]
            for reading in readings:
                master[char].append(reading)
                # NOTE(review): `t` here is the *last* doculect of the loop
                # above, not the doculect of `reading` — verify intended
                doculects.add(t)

    table, idx = [], 1
    for i, (char, vals) in enumerate(master.items()):
        # skip characters attested only in Guangyun and/or Shuowen
        if len(occs[char]
               ) == 2 and 'Guangyun' in occs[char] and 'Shuowen' in occs[char]:
            pass
        elif len(occs[char]) == 1 and ('Guangyun' in occs[char]
                                       or 'Shuowen' in occs[char]):
            pass
        else:
            pinyin = sinopy.pinyin(char)
            # sinopy signals failure by echoing Chinese or with '?' / '!'
            if sinopy.is_chinese(pinyin) or '?' in pinyin or '!' in pinyin:
                pinyin = ''
            for t, crossref, dataset, reading, fq, pc, sc, rc, wf, src in vals:
                table += [(idx, char, pinyin, t, reading, fq, pc, sc, rc, wf,
                           src, dataset, crossref)]
                idx += 1
    with open(cddb_path('datasets', 'characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tREADING\tFANQIE\tPHONETIC_CLASS\tSEMANTIC_CLASS\tRHYME_CLASS\tWORDFAMILY_CLASS\tSOURCE\tDATASET\tDATASET_ID\n'
        )
        for line in table:
            f.write('\t'.join([str(x) for x in line]) + '\n')
Beispiel #6
0
def aline(line, warnings):
    """Scan one annotated rhyme line and extract its rhyme words.

    Character-level state machine over *line*:
      - ``<...>`` marks a problematic passage (collected into *problems*),
      - ``[...]`` holds a reconstruction, attached to the most recent
        rhyme-word entry (or to a new '???' placeholder entry, with a
        warning, if none exists yet),
      - ``(...)`` is a free-text comment,
      - Latin letters and '?' open a new rhyme-word entry ('¿' placeholder),
      - the first Chinese character after a closing ']' overwrites the
        placeholder of the most recent rhyme-word entry.

    Returns ``(stripped_line, rhymewords, comment, warnings, problems)``
    where *stripped_line* is the bare Chinese text and *problems* is a
    comma-joined string.
    """
    comment = ''
    problem = False
    problems = ['']
    rhymewords = []
    reconstruction = ''
    previous = ''
    inbrackets, incomment, outbracket = False, False, False
    stripped_line = ''
    for char in line:
        if char == ' ':
            pass
        elif char == '<':
            problem = True
        elif char == '>':
            problem = False
            problems += ['']
        elif char == '[':
            inbrackets = True
        elif inbrackets and char != ']':
            reconstruction += char
        elif char == ']':
            inbrackets=False
            if rhymewords:
                # attach the reconstruction to the last rhyme-word entry
                rhymewords[-1] += [reconstruction]
            else:
                # reconstruction with no rhyme word seen yet: placeholder
                rhymewords += [['???', previous, reconstruction]]
                warnings += 1
            reconstruction = ''
            outbracket = True
        # NOTE(review): '((' and '))' look like garbled ASCII/full-width
        # paren pairs; as written each test equals `char == '('` / `')'`
        elif char in '((':
            incomment = True
        elif char in '))':
            incomment = False
        elif incomment:
            comment += char
        else:
            if problem:
                problems[-1] += char
            elif sinopy.is_chinese(char):
                if outbracket:
                    # first Chinese char after ']' replaces the placeholder
                    rhymewords[-1][0] = char
                    outbracket = False
                previous = char
                stripped_line += char
            elif char in '?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz':
                # letter labels open a new rhyme-word entry
                rhymewords += [['¿', char]]
    return stripped_line, rhymewords, comment, warnings, ','.join(problems)
Beispiel #7
0
    def gbk_and_big5(self, chars):
        """Return *chars* converted to (GBK/simplified, BIG5/traditional).

        Non-Chinese characters are dropped; a character found in neither
        of sinopy's GBK/BIG5 tables is copied unchanged to both outputs.
        """
        simplified, traditional = '', ''
        for ch in chars:
            if not sinopy.is_chinese(ch):
                continue
            gbk_pos = sinopy._cd.GBK.find(ch)
            big5_pos = sinopy._cd.BIG5.find(ch)
            if gbk_pos > -1:
                # char is simplified: look up its traditional counterpart
                simplified += ch
                traditional += sinopy._cd.BIG5[gbk_pos]
            elif big5_pos > -1:
                # char is traditional: look up its simplified counterpart
                simplified += sinopy._cd.GBK[big5_pos]
                traditional += ch
            else:
                # unknown to both tables: keep as-is on both sides
                simplified += ch
                traditional += ch

        return simplified, traditional
Beispiel #8
0
def prepare(dataset):
    """Parse the Songben Guangyun XML (``sbgy.xml``) into ``characters.tsv``.

    Scans the XML line-wise, tracking the current volume, rhyme and IPA
    voice part, and emits one row per ``word_head`` element with its
    character, pinyin, fanqie and note.
    """
    with open(dataset.get_path('raw', 'sbgy.xml')) as f:
        data = f.readlines()

    D = [('ID', 'CHARACTER_ID', 'CHARACTER', 'DOCULECT', 'PINYIN', 'READING', 'FANQIE',
        'RHYME_ID', 'RHYME_NUMBER', 'VOLUME', 'NOTE', 'SOURCE')]
    # running state while scanning the XML
    volume, rhyme_id, rhyme, ipa, fanqie, text  = '', '', '', '', '', ''
    idx = 1
    for line in data:
        if '<volume id' in line:
            volume = re.findall('id="(.*?)"', line)[0]

        if 'rhyme id="' in line:
            rhyme_id = re.findall('id="(.*?)"', line)[0]

        if rhyme_id:

            if 'rhyme_num' in line:
                rhyme = re.findall('>(.*?)<', line)[0]

            if 'voice_part ipa' in line:
                ipa = re.findall('ipa="(.*?)"', line)[0]

            # a closing word_head ends the accumulated element text
            if '/word_head' in line and text:
                text = text.replace('\n', ' ').strip()
                charid, char = re.findall(
                        '<word_head id="(.*?)">(.*?)<',
                        text)[0]
                note = re.findall('<note>(.*?)</note>', text)
                note = note[0] if note else ''

                if 'fanqie' in text:
                    fanqie = re.findall(
                            '<fanqie>(.*?)</fanqie>',
                            text)[0]
                # BUG FIX: pinyin was computed only inside the fanqie
                # branch, so the very first fanqie-less word head raised
                # NameError and later ones reused a stale pinyin.  The
                # pinyin depends only on the character, so compute it here.
                pinyin = sinopy.pinyin(char)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                if sinopy.is_chinese(char.strip()):
                    D += [(str(idx),
                        charid, char.strip(), 'Middle_Chinese', pinyin, ipa,
                        fanqie.strip(),
                        rhyme_id.strip(), rhyme.strip(), volume, note.strip(),
                        'Zhou1938')]
                    idx += 1
                else:
                    print('[ERROR]', char, sinopy.is_chinese(char), ipa, fanqie)
                text = ''

            # accumulate lines belonging to an open word_head element
            if text:
                text += line

            if 'word_head id' in line:
                text = line

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in D:
            f.write('\t'.join([l.replace('\t', '') for l in line])+'\n')
Beispiel #9
0
def prepare(dataset):
    """Parse the Shuowen Jiezi XML (``swjz.xml``) into ``characters.tsv``.

    First splits the file into ``<shuowen>`` blocks (tagged with their
    chapter), then extracts per word head the character, its pinyin, the
    structural analysis ('从X。Y聲' -> radical/phonetic) and fanqie/rhyme
    information from Duan's notes.
    """
    # glyph images referenced by the 'img' attribute live at
    # http://kanji-database.sourceforge.net/dict/swjz/swjz-img/
    # (the unused img_url local was removed)

    with open(dataset.get_path('raw', 'swjz.xml')) as f:
        data = f.readlines()

    wordheads = {}
    blocks, chapter, idx = {}, '', 1
    blocks[idx] = ''
    for i, line in enumerate(data):
        if '<chaptertitle id' in line:
            chapter = re.findall('id="(.*?)"', line)[0]

        # inside an open block: accumulate the raw text
        if blocks[idx]:
            blocks[idx]['text'] += '\n' + line.strip()

        if '<shuowen>' in line:
            blocks[idx] = dict(text=line.strip(), chapter=chapter)

        if '</shuowen>' in line:
            idx += 1
            blocks[idx] = False

    for i, block in [(a, b) for a, b in blocks.items() if b]:

        for line in block['text'].split('\n'):
            if 'wordhead' in line:
                wid, img, char = re.findall('id="(.*?)" img="(.*?)">(.*?)<',
                                            line)[0]
                pinyin = sinopy.pinyin(char)
                # sinopy signals failure with '?' or by echoing Chinese
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                wordheads[wid] = dict(explanations=[],
                                      char=char,
                                      notes=[],
                                      img=img,
                                      block=i + 1,
                                      pinyin=pinyin,
                                      doculect='Old_Chinese',
                                      chapter=block['chapter'])
            if 'explanation>' in line:
                wordheads[wid]['explanations'] += [
                    re.findall('>(.*?)<', line)[0]
                ]
                # '从X。Y聲' yields the semantic radical X and phonetic Y
                structure = re.findall('从(.)。(.)聲', line)
                if structure:
                    wordheads[wid]['radical'] = structure[0][0]
                    wordheads[wid]['phonetic'] = structure[0][1]
            if 'duan_note>' in line:
                wordheads[wid]['notes'] += [re.findall('>(.*?)<', line)[0]]
                fq = re.findall('>(..)切。', line)
                if fq and sinopy.is_chinese(fq[0]):
                    wordheads[wid]['fanqie'] = fq[0]
                bu = re.findall('(.)部。', line)
                # BUG FIX: was `sinopy.is_chinese(bu)` on the whole result
                # list; check the matched character, as done for fq above.
                if bu and sinopy.is_chinese(bu[0]):
                    wordheads[wid]['rhyme'] = bu[0]

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER_ID\tGROUP\tCHARACTER\tPINYIN\tDOCULECT\tSEMANTIC_CLASS\tPHONETIC_CLASS\tFANQIE\tRHYME_CLASS\tCHAPTER\tIMAGE\tTEXT\tCOMMENT\n'
        )
        idx = 1
        for k, vals in sorted(wordheads.items(), key=lambda x: x[0]):
            addons = []
            for key in [
                    'block', 'char', 'pinyin', 'doculect', 'radical',
                    'phonetic', 'fanqie', 'rhyme', 'chapter', 'img',
                    'explanation', 'notes'
            ]:
                val = vals.get(key, '')
                if isinstance(val, list):
                    val = ' / '.join(val)
                addons += [str(val).replace('\t', ' ')]
            f.write(str(idx) + '\t' + k + '\t' + '\t'.join(addons) + '\n')
            idx += 1
Beispiel #10
0
def prepare_old2(ds):
    """Clean raw hanzi, align them with segments, add benzi and partial cogids.

    Normalizes the raw hanzi strings (manual *converter* plus regex
    cleanup), aligns them against the morpheme-split segments, assigns a
    partial cognate id per character, and writes a new wordlist with
    ``benzi`` and ``cogids`` columns (blacklisted rows removed).
    """
    # manual corrections of raw hanzi entries
    # NOTE(review): "一串兒葡萄" and "明兒個" each appear twice; Python
    # silently keeps only the last occurrence of a duplicate key
    converter = {
        '豬肉': '肉',
        '豬艤': '艤',
        '! □水': '口水',
        '! 一□水': '一口水',
        '星〔星兒〕': '星',
        "一串兒葡萄": "一串葡萄",
        "一小片兒草": "一小片草",
        "一串兒葡萄": "一串葡萄",
        "一抓兒葡萄": "一抓葡萄",
        "手套兒": "手套",
        "茄兒如": "茄如",
        "前兒日": "前日",
        "前兒個": "前個",
        "明兒個": "明個",
        "明兒個": "明個",
        "今兒個": "今個",
        "今兒日": "今日",
        "黃花兒魚": "黃花魚",
        "大前兒個": "大前個",
        "大前兒日": "大前日",
        "大後兒個": "大後個",
    }
    bad_list = []  # rows where characters and morphemes cannot be aligned
    visited = []  # NOTE(review): never used below
    inv = ds.sounds  # NOTE(review): never used below
    words = Wordlist(ds.raw('words-2017-06-16.tsv'))
    weilist = []  # rows without usable characters
    pids = {}  # character -> partial cognate id
    pidx = 1  # next free partial cognate id
    characters, partialcogs = {}, {}
    blacklist = []  # row ids marked 'ignore' in the note column
    for idx, bid, segments, chars, note in iter_rows(words, 'beida_id',
                                                     'segments', 'hanzi',
                                                     'note'):
        if 'ignore' in note:
            blacklist += [idx]
        else:
            ochars = chars
            # normalize: apply manual corrections, drop 〔...〕 and <...>
            # annotations, unify the 'missing character' box to 囗, and keep
            # only the Chinese characters of the first comma-variant
            chars = converter.get(chars, chars)
            chars = re.sub('〔[^〕]+〕', '', chars)
            chars = re.sub('<[^>]+>', '', chars)
            chars = chars.replace('□', '囗')
            chars = chars.replace('?', '')
            chars = ''.join(
                [c for c in chars.split(',')[0] if sp.is_chinese(c)])
            tks = tokens2morphemes(segments)
            partials = []
            if len(tks) == len(chars):
                # one character per morpheme: reuse ids for known characters
                # (the unknown-character box 囗 always gets a fresh id)
                for char in chars:
                    if char in pids and char != '囗':
                        partials += [str(pids[char])]
                    else:
                        pids[char] = pidx
                        pidx += 1
                        partials += [str(pids[char])]
            else:
                if chars.endswith('兒'):
                    # final 兒 apparently not counted as its own morpheme
                    if len(chars) - 1 == len(tks):
                        for char in chars[:-1]:
                            if char in pids and char != '囗':
                                partials += [str(pids[char])]
                            else:
                                pids[char] = pidx
                                pidx += 1
                                partials += [str(pids[char])]
                    else:
                        # still misaligned: fresh ids, report the row
                        for tk in tks:
                            partials += [str(pidx)]
                            pidx += 1
                        bad_list += [idx]
                        print(len(bad_list), chars, len(tks), bid)
                elif not chars:
                    # no characters at all: mark benzi with leading '?'
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '?' + chars
                elif '囗' in chars:
                    # partially unknown characters: mark with leading '!'
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '!' + chars
                else:
                    # misaligned: mark with leading ':' and report
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    bad_list += [idx]
                    print(len(bad_list), ochars, '|', '\t|', chars, len(tks),
                          bid)
                    chars = ':' + chars
            characters[idx] = chars
            partialcogs[idx] = ' '.join(partials)
    print(len(weilist))
    # drop the blacklisted rows, then attach the new columns
    words.output('tsv',
                 filename=ds.raw('words.tmp'),
                 subset=True,
                 rows=dict(ID='not in ' + str(blacklist)))
    words = Wordlist(ds.raw('words.tmp.tsv'))
    words.add_entries('benzi', characters, lambda x: x)
    words.add_entries('cogids', partialcogs, lambda x: x)
    ds.write_wordlist(words)
Beispiel #11
0
def prepare(ds):
    """Build an orthography profile from Hou's character reading table.

    Counts initial/final/tone occurrences per doculect for every
    non-Middle-Chinese reading and writes them with ``ds.write_profile``.
    """
    # normalization table for look-alike / legacy symbols in the source
    # (auto-formatter-mangled layout; each pair maps source -> target glyph,
    # several pairs normalize visually identical but differently encoded
    # characters)
    convert = {
        k: v
        for k, v in [('ʻ', 'ʻ'), ('‘', '‘'), ('‘', '‘'), ("'", '‘'), (
            '"',
            '""'), ('〜', '~'), ('A', 'ᴀ'), ('□', '口'), ('口', '口'), (
                'Ã', 'ᴀ̃'), ('E', 'ᴇ'), ('Ẽ', 'ᴇ̃'), ('ʔd', 'ʔd'), (
                    'ʔt',
                    'ʔt'), ('ʔp',
                            'ʔp'), ('ʔb',
                                    'ʔb'), ('ʔg',
                                            'ʔg'), ('ʔk',
                                                    'ʔk'), ('I',
                                                            'ɪ'), ('Ɣ', 'ɣ')]
    }

    # alias for a doculect name that differs between source and languages file
    translate = {'中古音韵': '中古汉语'}
    data = csv2list(ds.raw('Hou-2004-characters.corrected.tsv'))
    headers = csv2list(ds.raw('headers.txt'))
    # map Chinese doculect names to CDDB doculect ids
    chin2cddb = {x['hanzi']: y for y, x in ds.languages.items()}
    header = {}
    for line in headers:
        idx, chars = line[0].split(' ')
        lng, srt = ds.gbk_and_big5(chars)
        char = lng[0]
        pinyin = ds.pinyin(char)
        header[line[0][1:-1]] = [char, lng, srt, pinyin]
    D = {
        0: [
            'doculect', 'character', 'pinyin', 'value', 'segments',
            'structure', 'cognate_class'
        ]
    }
    idx = 1
    # prf[(sound, slot)][doculect] = count; slot is 'i'/'f'/'t'/'s'
    # (initial / final / tone / whole syllable)
    prf = defaultdict(lambda: defaultdict(int))
    for line in data:
        if len(line) == 3:
            a, b, c = line
            d = []
        elif len(line) == 4:
            a, b, c, d = line
        else:
            # malformed row: skipped by the `if a` guard below
            a = False
        if a:
            doculect = chin2cddb[translate.get(a, a)]
            cogid = b[1:-1]
            # keep only transcription symbols, normalized via `convert`
            txt = ''.join([
                convert.get(x, x) for x in c
                if x not in '\t \n\r"' and not sinopy.is_chinese(x)
            ])
            if doculect != 'Middle_Chinese':
                for r in slice_word(
                        ipa2tokens(txt,
                                   merge_vowels=False,
                                   semi_diacritics="ɕ‘'ʑsz'ʂʐʃʒf",
                                   expand_nasals=True)):
                    if '~' in r:
                        pass
                    else:
                        tks = ''.join(r)
                        i, f, t = ds.split_initial_final(tks)
                        if f and t:
                            if i:
                                prf[i, 'i'][doculect] += 1
                            prf[f, 'f'][doculect] += 1
                            prf[t, 't'][doculect] += 1
                        elif f:
                            if i: prf[i, 'i'][doculect] += 1
                            prf[f, 'f'][doculect] += 1
                        else:
                            # no separable final: count as whole syllable
                            prf[i + f + t, 's'][doculect] += 1
    ds.write_profile(ds.raw('hou-characters.prf'), prf)
Beispiel #12
0
 def chinese(self, string):
     """Return sinopy's judgement of whether *string* is Chinese text."""
     return sinopy.is_chinese(string)
Beispiel #13
0
def prepare(dataset):
    """Parse the private Schuessler (2007) text dump into ``characters.tsv``.

    The raw file is a sequence of records: an ENTRY line opens a record,
    followed by HEAD / LH / GLOSS / MC / OCB / OCM lines.  A record is
    only written out when the *next* ENTRY line is reached.
    NOTE(review): the final record is therefore never flushed unless the
    file ends with one more ENTRY line — confirm against the raw data.
    """
    with open(dataset.get_path('raw', '__private__schuessler.txt')) as f:
        data = f.readlines()

    D = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'WORDFAMILY_CLASS', 'GLOSS',
          'READING', 'VARIANT_CLASS', 'SOURCE')]
    idf = ''
    idx = 1
    for line in data:
        if line.startswith('ENTRY'):
            # flush the previous record, if it parsed to a Chinese head
            if idf and sinopy.is_chinese(char.strip()):
                # multi-character heads without hyphenated pinyin are
                # treated as graphical variants of the first character
                if len(char) > 1 and not '-' in pinyin:
                    variant = char[0]
                    chars = list(char)
                else:
                    chars = [char]
                    variant = ''
                # one output row per attested reading layer
                for char in chars:
                    if mch:
                        D += [(idx, char, pinyin, 'Middle_Chinese', '', gloss,
                               mch, variant, 'Schuessler2007')]
                        idx += 1
                    if ocb:
                        D += [(idx, char, pinyin, 'Old_Chinese', '', gloss,
                               ocb, variant, 'Baxter1992')]
                        idx += 1
                    if ocm:
                        D += [(idx, char, pinyin, 'Old_Chinese', anc, gloss,
                               ocm, variant, 'Schuessler2007')]
                        idx += 1
                    if lhc:
                        D += [(idx, char, pinyin, 'Late_Han_Chinese', '',
                               gloss, lhc, variant, 'Schuessler2007')]
                        idx += 1

            # start a fresh record
            idf = line[6:].strip()
            gloss, pinyin, anc, char, mch, ocb, ocm, lhc = ('', '', '', '', '',
                                                            '', '', '')
        if line.startswith('HEAD'):
            # a HEAD may carry a word-family relation marker before the head
            if '⪤' in line:
                anc, line = line[5:].split('⪤')
                anc = '⪤ ' + anc
            elif '~' in line:
                anc, line = line[5:].split('~')
                anc = '~ ' + anc
            elif '=' in line:
                anc, line = line[5:].split('=')
                anc = '= ' + anc
            else:
                anc, line = '', line[5:]
            line = line.strip()
            anc = anc.strip()
            if line.count(' ') == 1:
                pinyin, char = line.split(' ')
            else:
                # unparsable head: invalidate the whole record
                idf = ''
                print('[Problem]: {0}'.format(line))
        if line.startswith('LH'):
            lhc = line[14:].strip()
        if line.startswith('GLOSS'):
            # multiple GLOSS lines are joined with '/'
            if gloss:
                gloss += '/' + line[6:].strip()
            else:
                gloss = line[6:].strip()
        if line.startswith('MC'):
            mch = line[18:].strip()
        if line.startswith('OCB'):
            ocb = line[25:].strip()
        if line.startswith('OCM'):
            ocm = line[26:].strip()

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for i, line in enumerate(D):
            f.write('\t'.join([str(x) for x in line]) + '\n')
Beispiel #14
0
def prepare(dataset):
    """Build ``words.tsv`` and ``characters.tsv`` from Wang's (2006) wordlist.

    Adds Concepticon ids, merges in Old Chinese forms, computes partial
    cognate ids per character, and writes one character row per
    (word, character) pair.
    """
    # correct wrong pinyins in sinopy
    # NOTE(review): several values carry a stray '}' (e.g. "wú}", "niǎo}") —
    # presumably leftovers from the error format printed at the bottom of
    # this function; verify against the source
    pinyin = {
        "虱": "shī",
        "咯": "gē",
        "強": "qiáng",
        "哩": "lǐ",
        "喏": "nuò",
        "鳧": "fú",
        "伲": "nǐ",
        "黃": "huáng",
        "哋": "dì",
        "阿": "ā",
        "卵": "luǎn",
        "說": "shuō",
        "喙": "huì",
        "頸": "jǐng",
        "唔": "wú}",
        "雞": "jī",
        "黒": "hēi",
        "哪": "nǎ",
        "麼": "me",
        "蔃": "qiáng",
        "葷": "hūn",
        "鳥": "niǎo}",
        "舌": "huà",
        "吃": "chī",
        "膘": "biǎo}",
        "綠": "lǜ",
        "羽": "yǔ",
        "們": "men",
        "焦": "jiāo",
        "腳": "jiǎo",
        "乜": "miē",
        "即": "jí",
        "佬": "lǎo"
    }

    wl = Wordlist(dataset.get_path('raw', 'D_wang-2006.tsv'))
    # concept label -> Concepticon id for the Wang-2006-200 list
    concepts = dict([
        (x.english, x.concepticon_id)
        for x in Concepticon().conceptlists['Wang-2006-200'].concepts.values()
    ])
    D = {}
    och = csv2list(dataset.get_path('raw', 'D_old_chinese.csv'))
    # first free row id for the appended Old Chinese rows
    nidx = max([k for k in wl]) + 1

    wl.add_entries('concepticon_id', 'concept', lambda x: concepts[x])
    wl.add_entries('doculect_in_source', 'doculect', lambda x: x)

    for k in wl:
        doculect = wl[k, 'doculect'].replace('_B', '')  # NOTE(review): unused
        D[k] = [
            wl[k, h] for h in [
                'doculect', 'doculect_in_source', 'concept', 'concepticon_id',
                'ipa', 'partial'
            ]
        ]

    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id', 'value',
        'characters'
    ]
    # append the Old Chinese character forms as extra rows
    for a, chars in och:
        for char in chars.split(','):
            if char != '-':
                D[nidx] = [
                    'Old_Chinese', 'Old_Chinese', a, concepts[a], char, char
                ]
                nidx += 1

    wl2 = Wordlist(D)
    renumber_partial(wl2, name='cogids', partial_cognates='characters')
    wl2.output('tsv',
               filename=dataset.get_path('words'),
               ignore='all',
               prettify=False)

    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    errors = {}  # char -> bad pinyin, reported once each at the end
    for k in wl2:
        concept = wl2[k, 'concept']
        doculect = wl2[k, 'doculect']
        chars = sinopy.gbk2big5(wl2[k, 'value'])
        cogids = wl2[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                # manual corrections override sinopy's reading
                py = pinyin.get(char, py)
                if '?' in py or '{' in py:
                    if char in errors:
                        pass
                    else:
                        errors[char] = py
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    # print remaining bad pinyins as ready-to-paste dict entries
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')