Example #1
0
def prepare(dataset):
    """Convert the raw Shijing rhyme table into the dataset's characters.tsv.

    Reads ``raw/O_shijing.tsv`` (tab-separated, first row is the header),
    extracts one row per rhyming character, and writes a TSV with pinyin
    readings and positional information.
    """
    with UnicodeReader(dataset.get_path('raw', 'O_shijing.tsv'), delimiter='\t') as reader:
        data = list(reader)
    header = [h.lower() for h in data[0]]
    # BUG FIX: the sixth column header was misspelled 'SHJING_NUMBER'.
    C = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'SHIJING_NAME',
        'SHIJING_NUMBER', 'STANZA', 'VERSE', 'RHYME_CLASS', 'POSITION', 'TEXT',
        'ORDER', 'SOURCE'
        )]
    for line in data[1:]:
        # map lower-cased header names to the stripped cell values
        tmp = dict([(a, b.strip()) for a, b in zip(header, line)])

        poem = '·'.join((tmp['block'], tmp['chapter'], tmp['title']))
        poem_number = tmp['number']
        stanza = tmp['stanza']
        verse = tmp['verse']
        char = tmp['character']

        # position of the rhyme character inside the verse
        # (raises ValueError if char is missing from raw_section)
        pos = str(tmp['raw_section'].index(char))
        text = tmp['raw_section'] + tmp['endchar']
        rhymeid = tmp['rhyme']
        pinyin = sinopy.pinyin(char)
        order = tmp['section_number']
        # sinopy signals unknown readings with '?' or by echoing the character
        if '?' in pinyin or sinopy.is_chinese(pinyin):
            pinyin = ''

        C += [[tmp['id'], char, pinyin, 'Old_Chinese', poem, poem_number, stanza,
            verse, rhymeid, pos, text, order, 'Baxter1992']]

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join(line)+'\n')
Example #2
0
def main():
    """Command-line entry point: pinyin lookup and profile generation.

    Recognized argv tokens: ``debug``/``--debug``, ``pinyin <char>``,
    ``profile <wordlist>`` with options ``--cldf``, ``--column``,
    ``--language``/``-l`` and ``-o``/``--filename``.
    """
    def _value_after(flag):
        # the value of a flag is the argv token right after it
        return argv[argv.index(flag) + 1]

    debug = 'debug' in argv or '--debug' in argv

    if 'pinyin' in argv:
        print(sinopy.pinyin(_value_after('pinyin')))

    if 'profile' in argv:
        if '--cldf' in argv:
            wl = Wordlist.from_cldf(_value_after('profile'),
                                    col='language_id',
                                    row='parameter_id')
            wl.add_entries('doculect', 'language_name', lambda x: x)
        else:
            wl = Wordlist(_value_after('profile'))

        # defaults, overridden by the options below (later flags win)
        column, language, filename = 'ipa', None, 'orthography.tsv'
        if '--column' in argv:
            column = _value_after('--column')
        for flag in ('--language', '-l'):
            if flag in argv:
                language = _value_after(flag)
        for flag in ('-o', '--filename'):
            if flag in argv:
                filename = _value_after(flag)

        segments.write_structure_profile(wl,
                                         column=column,
                                         filename=filename,
                                         debug=debug,
                                         language=language)
Example #3
0
 def pinyin(self, chars):
     """Return the space-joined pinyin readings of *chars*.

     Characters whose reading contains '?' or comes back as a Chinese
     character (sinopy's failure modes) contribute an empty string.
     """
     def _reading(char):
         p = sinopy.pinyin(char)
         return '' if '?' in p or sinopy.is_chinese(p) else p

     return ' '.join(_reading(char) for char in chars)
Example #4
0
def character_list():
    """Merge the per-dataset character tables into one global characters.tsv.

    Iterates all sources returned by ``get_sources('characters.tsv')``,
    collects every (doculect, crossref, source, reading, ...) tuple per
    character, and writes the combined table to
    ``cddb_path('datasets', 'characters.tsv')``.  Characters attested only
    in Guangyun and/or Shuowen are skipped.
    """
    sources = get_sources('characters.tsv')
    # char -> list of reading tuples gathered across all datasets
    master = defaultdict(list)

    # char -> list of datasets the character occurs in
    occs = defaultdict(list)
    doculects = set()
    for source in sources:
        print('[preparing]', source)
        ds = Dataset(source)
        for char in ds.characters.rows:
            occs[char] += [source]
            tmp = ds.characters.get_dict(row=char)
            readings = []
            # t is the doculect, chars the row ids for char in that doculect
            for t, chars in tmp.items():
                for c in chars:
                    _data = (
                        t,
                        c,
                        source,
                    )
                    # append each annotation column, '' when missing
                    for h in [
                            'reading', 'fanqie', 'phonetic_class',
                            'semantic_class', 'rhyme_class',
                            'wordfamily_class', 'source'
                    ]:
                        if ds.characters[c, h]:
                            _data += (ds.characters[c, h], )
                        else:
                            _data += ('', )

                    readings += [_data]
            for reading in readings:
                master[char].append(reading)
                # NOTE(review): `t` leaks from the loop above, so only the
                # LAST doculect per character is recorded here; `doculects`
                # is also never used afterwards -- confirm this is intended.
                doculects.add(t)

    table, idx = [], 1
    for i, (char, vals) in enumerate(master.items()):
        # skip characters attested only in the two reference dictionaries
        if len(occs[char]
               ) == 2 and 'Guangyun' in occs[char] and 'Shuowen' in occs[char]:
            pass
        elif len(occs[char]) == 1 and ('Guangyun' in occs[char]
                                       or 'Shuowen' in occs[char]):
            pass
        else:
            pinyin = sinopy.pinyin(char)
            # blank out sinopy failure markers ('?', '!', echoed character)
            if sinopy.is_chinese(pinyin) or '?' in pinyin or '!' in pinyin:
                pinyin = ''
            for t, crossref, dataset, reading, fq, pc, sc, rc, wf, src in vals:
                table += [(idx, char, pinyin, t, reading, fq, pc, sc, rc, wf,
                           src, dataset, crossref)]
                idx += 1
    with open(cddb_path('datasets', 'characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tREADING\tFANQIE\tPHONETIC_CLASS\tSEMANTIC_CLASS\tRHYME_CLASS\tWORDFAMILY_CLASS\tSOURCE\tDATASET\tDATASET_ID\n'
        )
        for line in table:
            f.write('\t'.join([str(x) for x in line]) + '\n')
def wikibooks():
    """Parse ``wikibooks.txt`` (Karlgren/GSR data) and write ``karlgren.tsv``.

    Each ``*``-prefixed line carries a schuessler/karlgren id, a character,
    and Middle Chinese readings.  Returns a dict mapping
    character -> {reading: [pinyin, reading, karlgren]}.
    """
    with open('wikibooks.txt') as f:
        data = f.readlines()
    out = []
    gsr = defaultdict(dict)
    for i, line in enumerate(data):
        line = strip_brackets(line.strip().replace('\t', ' '),
                              brackets={'(': ')'})
        if line.startswith('*'):
            # normalize '*xxx' to '* xxx' so splitting on spaces works
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = ''
                    karlgren = kgsc[0]
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                    # BUG FIX: karlgren/schuessler previously kept stale
                    # values from the last good line (or were unbound on
                    # the very first line); reset them explicitly.
                    karlgren, schuessler = '', ''

                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''

                # Middle Chinese readings, trailing commas stripped
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                if len(karlgren) not in [4, 5, 6]:
                    # BUG FIX: format() was given karlgren but the string
                    # had no second placeholder, so it was never printed.
                    print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
                elif not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    # blank out sinopy failure markers
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                             karlgren, '', 'Karlgren1954')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016a')]
                        gsr[char][reading] = [pinyin, reading, karlgren]

    with open('karlgren.tsv', 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')

    return gsr
Example #6
0
def prepare(dataset):
    """Parse ``raw/wikibooks.txt`` (Schuessler data) into characters.tsv.

    Same line format as the Karlgren wikibooks dump: ``*``-prefixed lines
    with a schuessler/karlgren id, a character, and Middle Chinese readings.
    """
    with open(dataset.get_path('raw', 'wikibooks.txt')) as f:
        data = f.readlines()
    out = []
    for i, line in enumerate(data):
        line = line.strip().replace('\t', ' ')
        if line.startswith('*'):
            # normalize '*xxx' to '* xxx' so splitting on spaces works
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = kgsc[0]
                    karlgren = ''
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                    # BUG FIX: karlgren/schuessler previously kept stale
                    # values from the last good line (or were unbound on
                    # the very first line); reset them explicitly.
                    karlgren, schuessler = '', ''

                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''

                # Middle Chinese readings, trailing commas stripped
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                #if len(karlgren) not in [4, 5, 6]:
                #    print('[ERROR:karlgren] {0}'.format(line, karlgren))
                if not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    # blank out sinopy failure markers
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', schuessler, karlgren,
                             '', 'Schuessler2009')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016b')]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
Example #7
0
def prepare(dataset):
    """Parse the Guangyun XML (``raw/sbgy.xml``) into characters.tsv.

    Scans the XML line by line, accumulating the text of each
    ``<word_head>`` element and emitting one row per Chinese character
    with its fanqie, rhyme, and volume information.
    """
    with open(dataset.get_path('raw', 'sbgy.xml')) as f:
        data = f.readlines()

    D = [('ID', 'CHARACTER_ID', 'CHARACTER', 'DOCULECT', 'PINYIN', 'READING', 'FANQIE',
        'RHYME_ID', 'RHYME_NUMBER', 'VOLUME', 'NOTE', 'SOURCE')]
    # running state while scanning the XML
    volume, rhyme_id, rhyme, ipa, fanqie, text = '', '', '', '', '', ''
    # BUG FIX: pinyin was previously unbound until the first word head
    # with a fanqie was seen, crashing on files starting without one.
    pinyin = ''
    idx = 1
    for line in data:
        if '<volume id' in line:
            volume = re.findall('id="(.*?)"', line)[0]

        if 'rhyme id="' in line:
            rhyme_id = re.findall('id="(.*?)"', line)[0]

        if rhyme_id:

            if 'rhyme_num' in line:
                rhyme = re.findall('>(.*?)<', line)[0]

            if 'voice_part ipa' in line:
                ipa = re.findall('ipa="(.*?)"', line)[0]

            if '/word_head' in line and text:
                text = text.replace('\n', ' ').strip()
                charid, char = re.findall(
                        '<word_head id="(.*?)">(.*?)<',
                        text)[0]
                note = re.findall('<note>(.*?)</note>', text)
                note = note[0] if note else ''

                # BUG FIX: pinyin was only refreshed when a fanqie was
                # present, so word heads without one silently reused the
                # previous word's pinyin; compute it for every word head.
                pinyin = sinopy.pinyin(char)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                if 'fanqie' in text:
                    fanqie = re.findall(
                            '<fanqie>(.*?)</fanqie>',
                            text)[0]
                # NOTE(review): fanqie still carries over to the next word
                # head when absent -- confirm this is intended behavior.
                if sinopy.is_chinese(char.strip()):
                    D += [(str(idx),
                        charid, char.strip(), 'Middle_Chinese', pinyin, ipa,
                        fanqie.strip(),
                        rhyme_id.strip(), rhyme.strip(), volume, note.strip(),
                        'Zhou1938')]
                    idx += 1
                else:
                    print('[ERROR]', char, sinopy.is_chinese(char), ipa, fanqie)
                text = ''

            # keep accumulating the current word_head element
            if text:
                text += line

            if 'word_head id' in line:
                text = line

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in D:
            f.write('\t'.join([l.replace('\t', '') for l in line])+'\n')
Example #8
0
def prepare(dataset):
    """Parse the Shuowen XML (``raw/swjz.xml``) into characters.tsv.

    Splits the file into ``<shuowen>`` blocks, extracts per-character
    word heads with their explanations, Duan notes, fanqie, and rhyme
    class, and writes one row per character.
    """
    img_url = "http://kanji-database.sourceforge.net/dict/swjz/swjz-img/"

    with open(dataset.get_path('raw', 'swjz.xml')) as f:
        data = f.readlines()

    wordheads = {}
    # blocks collects the raw text of each <shuowen>...</shuowen> span
    blocks, chapter, idx = {}, '', 1
    blocks[idx] = ''
    for i, line in enumerate(data):
        if '<chaptertitle id' in line:
            chapter = re.findall('id="(.*?)"', line)[0]

        if blocks[idx]:
            blocks[idx]['text'] += '\n' + line.strip()

        if '<shuowen>' in line:
            blocks[idx] = dict(text=line.strip(), chapter=chapter)

        if '</shuowen>' in line:
            idx += 1
            blocks[idx] = False

    for i, block in [(a, b) for a, b in blocks.items() if b]:

        for line in block['text'].split('\n'):
            if 'wordhead' in line:
                wid, img, char = re.findall('id="(.*?)" img="(.*?)">(.*?)<',
                                            line)[0]
                pinyin = sinopy.pinyin(char)
                # blank out sinopy failure markers
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                wordheads[wid] = dict(explanations=[],
                                      char=char,
                                      notes=[],
                                      img=img,
                                      block=i + 1,
                                      pinyin=pinyin,
                                      doculect='Old_Chinese',
                                      chapter=block['chapter'])
            if 'explanation>' in line:
                wordheads[wid]['explanations'] += [
                    re.findall('>(.*?)<', line)[0]
                ]
                # '从X。Y聲' gives the semantic radical X and phonetic Y
                structure = re.findall('从(.)。(.)聲', line)
                if structure:
                    wordheads[wid]['radical'] = structure[0][0]
                    wordheads[wid]['phonetic'] = structure[0][1]
            if 'duan_note>' in line:
                wordheads[wid]['notes'] += [re.findall('>(.*?)<', line)[0]]
                fq = re.findall('>(..)切。', line)
                if fq and sinopy.is_chinese(fq[0]):
                    wordheads[wid]['fanqie'] = fq[0]
                bu = re.findall('(.)部。', line)
                # BUG FIX: previously passed the list `bu` instead of the
                # matched character `bu[0]` (compare fq[0] above).
                if bu and sinopy.is_chinese(bu[0]):
                    wordheads[wid]['rhyme'] = bu[0]

    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER_ID\tGROUP\tCHARACTER\tPINYIN\tDOCULECT\tSEMANTIC_CLASS\tPHONETIC_CLASS\tFANQIE\tRHYME_CLASS\tCHAPTER\tIMAGE\tTEXT\tCOMMENT\n'
        )
        idx = 1
        for k, vals in sorted(wordheads.items(), key=lambda x: x[0]):
            addons = []
            # BUG FIX: the key was 'explanation' but the dict stores
            # 'explanations', so the TEXT column was always empty.
            for key in [
                    'block', 'char', 'pinyin', 'doculect', 'radical',
                    'phonetic', 'fanqie', 'rhyme', 'chapter', 'img',
                    'explanations', 'notes'
            ]:
                val = vals.get(key, '')
                if isinstance(val, list):
                    val = ' / '.join(val)
                addons += [str(val).replace('\t', ' ')]
            f.write(str(idx) + '\t' + k + '\t' + '\t'.join(addons) + '\n')
            idx += 1
Example #9
0
def prepare(dataset):
    """Build words.tsv and characters.tsv from the Wang (2006) wordlist.

    Loads ``raw/D_wang-2006.tsv``, links concepts to Concepticon ids,
    appends Old Chinese forms from ``raw/D_old_chinese.csv``, computes
    partial cognate ids, and writes per-character rows with pinyin
    readings (manually corrected where sinopy is wrong).
    """
    # manual corrections for readings that sinopy gets wrong
    # BUG FIX: three values ("wú}", "niǎo}", "biǎo}") carried a stray '}'
    # pasted from sinopy's error markup, which would have been written
    # verbatim into the output table.
    pinyin = {
        "虱": "shī",
        "咯": "gē",
        "強": "qiáng",
        "哩": "lǐ",
        "喏": "nuò",
        "鳧": "fú",
        "伲": "nǐ",
        "黃": "huáng",
        "哋": "dì",
        "阿": "ā",
        "卵": "luǎn",
        "說": "shuō",
        "喙": "huì",
        "頸": "jǐng",
        "唔": "wú",
        "雞": "jī",
        "黒": "hēi",
        "哪": "nǎ",
        "麼": "me",
        "蔃": "qiáng",
        "葷": "hūn",
        "鳥": "niǎo",
        "舌": "huà",
        "吃": "chī",
        "膘": "biǎo",
        "綠": "lǜ",
        "羽": "yǔ",
        "們": "men",
        "焦": "jiāo",
        "腳": "jiǎo",
        "乜": "miē",
        "即": "jí",
        "佬": "lǎo"
    }

    wl = Wordlist(dataset.get_path('raw', 'D_wang-2006.tsv'))
    # concept gloss -> Concepticon id
    concepts = dict([
        (x.english, x.concepticon_id)
        for x in Concepticon().conceptlists['Wang-2006-200'].concepts.values()
    ])
    D = {}
    och = csv2list(dataset.get_path('raw', 'D_old_chinese.csv'))
    # next free row index for the Old Chinese additions
    nidx = max([k for k in wl]) + 1

    wl.add_entries('concepticon_id', 'concept', lambda x: concepts[x])
    wl.add_entries('doculect_in_source', 'doculect', lambda x: x)

    for k in wl:
        D[k] = [
            wl[k, h] for h in [
                'doculect', 'doculect_in_source', 'concept', 'concepticon_id',
                'ipa', 'partial'
            ]
        ]

    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id', 'value',
        'characters'
    ]
    # append Old Chinese forms; '-' marks a missing character
    for a, chars in och:
        for char in chars.split(','):
            if char != '-':
                D[nidx] = [
                    'Old_Chinese', 'Old_Chinese', a, concepts[a], char, char
                ]
                nidx += 1

    wl2 = Wordlist(D)
    renumber_partial(wl2, name='cogids', partial_cognates='characters')
    wl2.output('tsv',
               filename=dataset.get_path('words'),
               ignore='all',
               prettify=False)

    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    errors = {}
    for k in wl2:
        concept = wl2[k, 'concept']
        doculect = wl2[k, 'doculect']
        chars = sinopy.gbk2big5(wl2[k, 'value'])
        cogids = wl2[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                # prefer the manual correction when we have one
                py = pinyin.get(char, py)
                if '?' in py or '{' in py:
                    if char in errors:
                        pass
                    else:
                        errors[char] = py
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    # dump unresolved readings in dict-literal form so they can be
    # pasted into the correction table above
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')