def wikibooks():
    """Parse ``wikibooks.txt`` into Karlgren/Middle-Chinese character readings.

    Writes ``karlgren.tsv`` and returns a ``gsr`` mapping of
    character -> reading -> [pinyin, reading, karlgren-id].

    Fixes:
    * ``karlgren``/``schuessler`` are now initialized before parsing, so a
      malformed ``schuessler/karlgren`` field can no longer leave them
      unbound (NameError on the first entry) or stale from the previous line.
    * the ``[ERROR:karlgren]`` format string had one placeholder but was
      given two arguments; the karlgren value was silently dropped.
    """
    with open('wikibooks.txt') as f:
        data = f.readlines()
    out = []
    gsr = defaultdict(dict)
    for i, line in enumerate(data):
        # strip_brackets: drop parenthesized asides before parsing
        line = strip_brackets(line.strip().replace('\t', ' '),
                              brackets={'(': ')'})
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                # FIX: defaults so malformed fields cannot leave these
                # unbound or carrying the previous entry's values
                karlgren, schuessler = '', ''
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = ''
                    karlgren = kgsc[0]
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''
                # readings may carry a trailing comma from the source markup
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                if len(karlgren) not in [4, 5, 6]:
                    # FIX: second format argument was previously ignored
                    print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
                elif not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    # sinopy echoes the input / yields '?' when unresolved
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                             karlgren, '', 'Karlgren1954')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '',
                                 karlgren, reading, 'Wikibooks2016a')]
                        gsr[char][reading] = [pinyin, reading, karlgren]
    with open('karlgren.tsv', 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
    return gsr
def prepare(dataset):
    """Parse the raw wikibooks dump into ``characters.tsv``.

    Variant of :func:`wikibooks` that records the Schuessler id as the
    phonetic class and reads/writes through the ``dataset`` path helpers.

    Fix: ``karlgren``/``schuessler`` are initialized before parsing, so a
    malformed ``schuessler/karlgren`` field can no longer leave them unbound
    (NameError on the first entry) or stale from the previous line.
    """
    with open(dataset.get_path('raw', 'wikibooks.txt')) as f:
        data = f.readlines()
    out = []
    for i, line in enumerate(data):
        line = line.strip().replace('\t', ' ')
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                # FIX: defaults so malformed fields cannot leave these
                # unbound or carrying the previous entry's values
                karlgren, schuessler = '', ''
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = kgsc[0]
                    karlgren = ''
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''
                # readings may carry a trailing comma from the source markup
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                #if len(karlgren) not in [4, 5, 6]:
                #    print('[ERROR:karlgren] {0}'.format(line, karlgren))
                if not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    # sinopy echoes the input / yields '?' when unresolved
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', schuessler,
                             karlgren, '', 'Schuessler2009')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '',
                                 karlgren, reading, 'Wikibooks2016b')]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\tKARLGREN_ID\tREADING\tSOURCE\n'
        )
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
def prepare(dataset):
    """Convert the raw Shijing rhyme table into ``characters.tsv``.

    Each input row describes one rhyming character in a poem section; the
    output records the poem, stanza, verse, rhyme class and the character's
    position within its section.
    """
    with UnicodeReader(dataset.get_path('raw', 'O_shijing.tsv'),
                       delimiter='\t') as reader:
        data = list(reader)
    header = [h.lower() for h in data[0]]
    # NOTE(review): 'SHJING_NUMBER' looks like a typo for 'SHIJING_NUMBER',
    # but it is the column name downstream consumers see — left unchanged.
    C = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'SHIJING_NAME',
          'SHJING_NUMBER', 'STANZA', 'VERSE', 'RHYME_CLASS', 'POSITION',
          'TEXT', 'ORDER', 'SOURCE')]
    for line in data[1:]:
        row = {key: cell.strip() for key, cell in zip(header, line)}
        poem = '·'.join((row['block'], row['chapter'], row['title']))
        poem_number = row['number']
        stanza = row['stanza']
        verse = row['verse']
        char = row['character']
        # position of the rhyme character within its section
        pos = str(row['raw_section'].index(char))
        text = row['raw_section'] + row['endchar']
        rhymeid = row['rhyme']
        pinyin = sinopy.pinyin(char)
        order = row['section_number']
        # sinopy echoes the input / yields '?' when it cannot resolve a reading
        if '?' in pinyin or sinopy.is_chinese(pinyin):
            pinyin = ''
        C += [[row['id'], char, pinyin, 'Old_Chinese', poem, poem_number,
               stanza, verse, rhymeid, pos, text, order, 'Baxter1992']]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join(line) + '\n')
def pinyin(self, chars):
    """Return space-joined pinyin readings for *chars*.

    Readings that sinopy cannot resolve (it echoes the character back or
    emits '?') are replaced with an empty string.
    """
    readings = []
    for glyph in chars:
        reading = sinopy.pinyin(glyph)
        if '?' in reading or sinopy.is_chinese(reading):
            reading = ''
        readings.append(reading)
    return ' '.join(readings)
def character_list():
    """Merge the per-source ``characters.tsv`` tables into one master list.

    Characters attested *only* in Guangyun and/or Shuowen are skipped;
    everything else is written to the global ``datasets/characters.tsv``.
    """
    sources = get_sources('characters.tsv')
    master = defaultdict(list)   # char -> list of reading tuples
    occs = defaultdict(list)     # char -> list of sources attesting it
    doculects = set()
    for source in sources:
        print('[preparing]', source)
        ds = Dataset(source)
        for char in ds.characters.rows:
            occs[char] += [source]
            tmp = ds.characters.get_dict(row=char)
            readings = []
            for t, chars in tmp.items():
                for c in chars:
                    # (doculect, crossref-id, source, then 7 data fields)
                    _data = (
                        t,
                        c,
                        source,
                    )
                    for h in [
                            'reading', 'fanqie', 'phonetic_class',
                            'semantic_class', 'rhyme_class',
                            'wordfamily_class', 'source'
                    ]:
                        if ds.characters[c, h]:
                            _data += (ds.characters[c, h], )
                        else:
                            _data += ('', )
                    readings += [_data]
            for reading in readings:
                master[char].append(reading)
                # NOTE(review): `t` leaks from the loop above, so only the
                # *last* doculect per character is added; `doculects` is
                # also never used afterwards — confirm before relying on it.
                doculects.add(t)
    table, idx = [], 1
    for i, (char, vals) in enumerate(master.items()):
        # skip characters attested only in Guangyun and/or Shuowen
        if len(occs[char]
               ) == 2 and 'Guangyun' in occs[char] and 'Shuowen' in occs[char]:
            pass
        elif len(occs[char]) == 1 and ('Guangyun' in occs[char]
                                       or 'Shuowen' in occs[char]):
            pass
        else:
            pinyin = sinopy.pinyin(char)
            # blank out unresolved readings (sinopy echoes input or marks ?/!)
            if sinopy.is_chinese(pinyin) or '?' in pinyin or '!' in pinyin:
                pinyin = ''
            for t, crossref, dataset, reading, fq, pc, sc, rc, wf, src in vals:
                table += [(idx, char, pinyin, t, reading, fq, pc, sc, rc, wf,
                           src, dataset, crossref)]
            # idx advances per retained character (enumerate's `i` would
            # also count the skipped ones)
            idx += 1
    with open(cddb_path('datasets', 'characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER\tPINYIN\tDOCULECT\tREADING\tFANQIE\tPHONETIC_CLASS\tSEMANTIC_CLASS\tRHYME_CLASS\tWORDFAMILY_CLASS\tSOURCE\tDATASET\tDATASET_ID\n'
        )
        for line in table:
            f.write('\t'.join([str(x) for x in line]) + '\n')
def aline(line, warnings):
    """Scan one annotated rhyme line character by character.

    Markup handled: ``<...>`` marks problem spans, ``[...]`` holds a
    reconstruction for the preceding rhyme word, parentheses hold comments,
    Latin letters/`?` introduce placeholder rhyme words.

    Returns ``(stripped_line, rhymewords, comment, warnings, problems)``
    where *warnings* is the (possibly incremented) input counter and
    *problems* is a comma-joined string.
    """
    comment = ''
    problem = False
    problems = ['']
    rhymewords = []
    reconstruction = ''
    previous = ''      # last Chinese character seen outside any markup
    inbrackets, incomment, outbracket = False, False, False
    stripped_line = ''
    for char in line:
        if char == ' ':
            pass
        elif char == '<':
            problem = True
        elif char == '>':
            problem = False
            problems += ['']
        elif char == '[':
            inbrackets = True
        elif inbrackets and char != ']':
            reconstruction += char
        elif char == ']':
            inbrackets = False
            if rhymewords:
                # attach reconstruction to the most recent rhyme word
                rhymewords[-1] += [reconstruction]
            else:
                # reconstruction with no rhyme word yet: record a stub
                rhymewords += [['???', previous, reconstruction]]
                warnings += 1
            reconstruction = ''
            outbracket = True
        # NOTE(review): '((' / '))' look intended to match ASCII plus a
        # full-width paren; as written each tests a single repeated char —
        # confirm against the raw data's paren characters.
        elif char in '((':
            incomment = True
        elif char in '))':
            incomment = False
        elif incomment:
            comment += char
        else:
            if problem:
                problems[-1] += char
            elif sinopy.is_chinese(char):
                if outbracket:
                    # first character after a ']' names the rhyme word head
                    rhymewords[-1][0] = char
                    outbracket = False
                previous = char
                stripped_line += char
            elif char in '?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz':
                # placeholder rhyme word marked by a Latin letter or '?'
                rhymewords += [['¿', char]]
    return stripped_line, rhymewords, comment, warnings, ','.join(problems)
def gbk_and_big5(self, chars):
    """Return ``(gbk, big5)`` renderings of *chars* via sinopy's tables.

    Each Chinese character is looked up in the parallel GBK/BIG5 tables and
    emitted in both encodings; characters found in neither table are copied
    verbatim to both outputs.  Non-Chinese characters are dropped.
    """
    gbk_out, big5_out = '', ''
    for glyph in chars:
        if sinopy.is_chinese(glyph):
            pos_gbk = sinopy._cd.GBK.find(glyph)
            pos_big5 = sinopy._cd.BIG5.find(glyph)
            if pos_gbk > -1:
                # glyph is in the GBK table; mirror it into BIG5
                gbk_out += glyph
                big5_out += sinopy._cd.BIG5[pos_gbk]
            elif pos_big5 > -1:
                # glyph is in the BIG5 table; mirror it into GBK
                gbk_out += sinopy._cd.GBK[pos_big5]
                big5_out += glyph
            else:
                gbk_out += glyph
                big5_out += glyph
    return gbk_out, big5_out
def prepare(dataset):
    """Parse the Song-era Guangyun XML dump (``sbgy.xml``) into ``characters.tsv``.

    The XML is scraped line-by-line with regexes: volume and rhyme markers
    update the running context, and each ``word_head`` element is buffered
    into ``text`` until its closing tag, then emitted as one row.
    """
    with open(dataset.get_path('raw', 'sbgy.xml')) as f:
        data = f.readlines()
    D = [('ID', 'CHARACTER_ID', 'CHARACTER', 'DOCULECT', 'PINYIN', 'READING',
          'FANQIE', 'RHYME_ID', 'RHYME_NUMBER', 'VOLUME', 'NOTE', 'SOURCE')]
    # running parse state; `text` buffers the current word_head element
    volume, rhyme_id, rhyme, ipa, fanqie, text = '', '', '', '', '', ''
    idx = 1
    for line in data:
        if '<volume id' in line:
            volume = re.findall('id="(.*?)"', line)[0]
        if 'rhyme id="' in line:
            rhyme_id = re.findall('id="(.*?)"', line)[0]
        if rhyme_id:  # ignore everything before the first rhyme section
            if 'rhyme_num' in line:
                rhyme = re.findall('>(.*?)<', line)[0]
            if 'voice_part ipa' in line:
                ipa = re.findall('ipa="(.*?)"', line)[0]
            if '/word_head' in line and text:
                text = text.replace('\n', ' ').strip()
                charid, char = re.findall('<word_head id="(.*?)">(.*?)<',
                                          text)[0]
                note = re.findall('<note>(.*?)</note>', text)
                note = note[0] if note else ''
                # NOTE(review): when an entry has no <fanqie>, the previous
                # entry's fanqie is reused — confirm this is intentional.
                if 'fanqie' in text:
                    fanqie = re.findall('<fanqie>(.*?)</fanqie>', text)[0]
                pinyin = sinopy.pinyin(char)
                # blank unresolved readings (sinopy echoes input or emits ?)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                if sinopy.is_chinese(char.strip()):
                    D += [(str(idx), charid, char.strip(), 'Middle_Chinese',
                           pinyin, ipa, fanqie.strip(), rhyme_id.strip(),
                           rhyme.strip(), volume, note.strip(), 'Zhou1938')]
                    idx += 1
                else:
                    print('[ERROR]', char, sinopy.is_chinese(char), ipa,
                          fanqie)
                text = ''
        # accumulate the current word_head element across lines
        if text:
            text += line
        if 'word_head id' in line:
            text = line
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in D:
            # strip stray tabs so the TSV stays well-formed
            f.write('\t'.join([l.replace('\t', '') for l in line]) + '\n')
def prepare(dataset):
    """Parse the Shuowen Jiezi XML dump (``swjz.xml``) into ``characters.tsv``.

    Pass 1 cuts the file into ``<shuowen>`` blocks; pass 2 scrapes each
    block's word heads, explanations and Duan notes with regexes.

    Fixes:
    * ``sinopy.is_chinese`` was called on the *list* returned by
      ``re.findall`` for the rhyme part (``bu``) instead of its first
      element (compare the ``fq[0]`` check just above).
    * the output column loop looked up the key ``'explanation'``, but
      entries store glosses under ``'explanations'`` — the TEXT column was
      always empty.
    * dropped the unused ``img_url`` local.
    """
    with open(dataset.get_path('raw', 'swjz.xml')) as f:
        data = f.readlines()
    wordheads = {}
    blocks, chapter, idx = {}, '', 1
    blocks[idx] = ''
    # pass 1: split the file into <shuowen> ... </shuowen> blocks
    for i, line in enumerate(data):
        if '<chaptertitle id' in line:
            chapter = re.findall('id="(.*?)"', line)[0]
        if blocks[idx]:
            blocks[idx]['text'] += '\n' + line.strip()
        if '<shuowen>' in line:
            blocks[idx] = dict(text=line.strip(), chapter=chapter)
        if '</shuowen>' in line:
            idx += 1
            blocks[idx] = False
    # pass 2: scrape each block's word heads and annotations
    for i, block in [(a, b) for a, b in blocks.items() if b]:
        for line in block['text'].split('\n'):
            if 'wordhead' in line:
                wid, img, char = re.findall('id="(.*?)" img="(.*?)">(.*?)<',
                                            line)[0]
                pinyin = sinopy.pinyin(char)
                # blank unresolved readings (sinopy echoes input or emits ?)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                wordheads[wid] = dict(explanations=[], char=char, notes=[],
                                      img=img, block=i + 1, pinyin=pinyin,
                                      doculect='Old_Chinese',
                                      chapter=block['chapter'])
            if 'explanation>' in line:
                wordheads[wid]['explanations'] += [
                    re.findall('>(.*?)<', line)[0]
                ]
                # "从 X。Y 聲" names the semantic radical and phonetic part
                structure = re.findall('从(.)。(.)聲', line)
                if structure:
                    wordheads[wid]['radical'] = structure[0][0]
                    wordheads[wid]['phonetic'] = structure[0][1]
            if 'duan_note>' in line:
                wordheads[wid]['notes'] += [re.findall('>(.*?)<', line)[0]]
                fq = re.findall('>(..)切。', line)
                if fq and sinopy.is_chinese(fq[0]):
                    wordheads[wid]['fanqie'] = fq[0]
                bu = re.findall('(.)部。', line)
                # FIX: check the matched character, not the findall list
                if bu and sinopy.is_chinese(bu[0]):
                    wordheads[wid]['rhyme'] = bu[0]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write(
            'ID\tCHARACTER_ID\tGROUP\tCHARACTER\tPINYIN\tDOCULECT\tSEMANTIC_CLASS\tPHONETIC_CLASS\tFANQIE\tRHYME_CLASS\tCHAPTER\tIMAGE\tTEXT\tCOMMENT\n'
        )
        idx = 1
        for k, vals in sorted(wordheads.items(), key=lambda x: x[0]):
            addons = []
            for key in [
                    'block', 'char', 'pinyin', 'doculect', 'radical',
                    'phonetic', 'fanqie', 'rhyme', 'chapter', 'img',
                    'explanations',  # FIX: was 'explanation' (always empty)
                    'notes'
            ]:
                val = vals.get(key, '')
                if isinstance(val, list):
                    val = ' / '.join(val)
                addons += [str(val).replace('\t', ' ')]
            f.write(str(idx) + '\t' + k + '\t' + '\t'.join(addons) + '\n')
            idx += 1
def prepare_old2(ds):
    """Attach corrected character forms and partial-cognate ids to the raw word list.

    Aligns each word's segments (morphemes) with its Hanzi form, assigning a
    stable partial-cognate id per character; mismatches are flagged with a
    leading ``?``/``!``/``:`` on the character string.  Writes the filtered
    wordlist (rows marked ``ignore`` removed) with new ``benzi`` and
    ``cogids`` columns.

    Fixes: removed duplicate keys from the converter dict ("一串兒葡萄" and
    "明兒個" appeared twice with identical values — no behavior change) and
    the unused ``visited`` local.
    """
    # hand-made corrections applied before the generic cleanup below
    converter = {
        '豬肉': '肉',
        '豬艤': '艤',
        '! □水': '口水',
        '! 一□水': '一口水',
        '星〔星兒〕': '星',
        "一串兒葡萄": "一串葡萄",
        "一小片兒草": "一小片草",
        "一抓兒葡萄": "一抓葡萄",
        "手套兒": "手套",
        "茄兒如": "茄如",
        "前兒日": "前日",
        "前兒個": "前個",
        "明兒個": "明個",
        "今兒個": "今個",
        "今兒日": "今日",
        "黃花兒魚": "黃花魚",
        "大前兒個": "大前個",
        "大前兒日": "大前日",
        "大後兒個": "大後個",
    }
    bad_list = []
    inv = ds.sounds
    words = Wordlist(ds.raw('words-2017-06-16.tsv'))
    weilist = []
    pids = {}    # character -> partial-cognate id
    pidx = 1     # next free partial-cognate id
    characters, partialcogs = {}, {}
    blacklist = []
    for idx, bid, segments, chars, note in iter_rows(words, 'beida_id',
                                                     'segments', 'hanzi',
                                                     'note'):
        if 'ignore' in note:
            blacklist += [idx]
        else:
            ochars = chars
            # normalize the hanzi string: corrections, bracketed variants,
            # placeholder boxes, and non-Chinese residue
            chars = converter.get(chars, chars)
            chars = re.sub('〔[^〕]+〕', '', chars)
            chars = re.sub('<[^>]+>', '', chars)
            chars = chars.replace('□', '囗')
            chars = chars.replace('?', '')
            chars = ''.join(
                [c for c in chars.split(',')[0] if sp.is_chinese(c)])
            tks = tokens2morphemes(segments)
            partials = []
            if len(tks) == len(chars):
                # clean 1:1 alignment of morphemes and characters
                for char in chars:
                    if char in pids and char != '囗':
                        partials += [str(pids[char])]
                    else:
                        pids[char] = pidx
                        pidx += 1
                        partials += [str(pids[char])]
            else:
                if chars.endswith('兒'):
                    # erhua suffix: the final 兒 carries no own morpheme
                    if len(chars) - 1 == len(tks):
                        for char in chars[:-1]:
                            if char in pids and char != '囗':
                                partials += [str(pids[char])]
                            else:
                                pids[char] = pidx
                                pidx += 1
                                partials += [str(pids[char])]
                    else:
                        for tk in tks:
                            partials += [str(pidx)]
                            pidx += 1
                        bad_list += [idx]
                        print(len(bad_list), chars, len(tks), bid)
                elif not chars:
                    # no hanzi at all: flag with '?'
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '?' + chars
                elif '囗' in chars:
                    # placeholder boxes present: flag with '!'
                    weilist += [idx]
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    chars = '!' + chars
                else:
                    # unexplained length mismatch: flag with ':'
                    for tk in tks:
                        partials += [str(pidx)]
                        pidx += 1
                    bad_list += [idx]
                    print(len(bad_list), ochars, '|', '\t|', chars, len(tks),
                          bid)
                    chars = ':' + chars
            characters[idx] = chars
            partialcogs[idx] = ' '.join(partials)
    print(len(weilist))
    words.output('tsv', filename=ds.raw('words.tmp'), subset=True,
                 rows=dict(ID='not in ' + str(blacklist)))
    words = Wordlist(ds.raw('words.tmp.tsv'))
    words.add_entries('benzi', characters, lambda x: x)
    words.add_entries('cogids', partialcogs, lambda x: x)
    ds.write_wordlist(words)
def prepare(ds):
    """Build an orthography profile from the corrected Hou (2004) character table.

    Counts initial/final/tone occurrences per doculect and writes them with
    ``ds.write_profile``.
    """
    # normalization table applied per character to the transcription.
    # NOTE(review): the two-character keys ('ʔd', 'ʔt', ...) can never match
    # a single-character lookup (`convert.get(x, x)` below iterates chars) —
    # confirm whether they were meant to be applied to whole strings.
    convert = {
        k: v
        for k, v in [('ʻ', 'ʻ'), ('‘', '‘'), ('‘', '‘'), ("'", '‘'), (
            '"', '""'), ('〜', '~'), ('A', 'ᴀ'), ('□', '口'), ('口', '口'), (
                'Ã', 'ᴀ̃'), ('E', 'ᴇ'), ('Ẽ', 'ᴇ̃'), ('ʔd', 'ʔd'), (
                    'ʔt', 'ʔt'), ('ʔp', 'ʔp'), ('ʔb', 'ʔb'), ('ʔg', 'ʔg'),
                 ('ʔk', 'ʔk'), ('I', 'ɪ'), ('Ɣ', 'ɣ')]
    }
    # map the source's doculect label to the one used in ds.languages
    translate = {'中古音韵': '中古汉语'}
    data = csv2list(ds.raw('Hou-2004-characters.corrected.tsv'))
    headers = csv2list(ds.raw('headers.txt'))
    chin2cddb = {x['hanzi']: y for y, x in ds.languages.items()}
    header = {}
    for line in headers:
        idx, chars = line[0].split(' ')
        lng, srt = ds.gbk_and_big5(chars)
        char = lng[0]
        pinyin = ds.pinyin(char)
        header[line[0][1:-1]] = [char, lng, srt, pinyin]
    # NOTE(review): D, idx, and cogid below are built but never used in this
    # function — possibly leftovers from an earlier version.
    D = {
        0: [
            'doculect', 'character', 'pinyin', 'value', 'segments',
            'structure', 'cognate_class'
        ]
    }
    idx = 1
    # prf[(segment, slot)][doculect] = occurrence count;
    # slot is 'i'nitial, 'f'inal, 't'one, or 's' for unsplittable strings
    prf = defaultdict(lambda: defaultdict(int))
    for line in data:
        if len(line) == 3:
            a, b, c = line
            d = []
        elif len(line) == 4:
            a, b, c, d = line
        else:
            a = False  # malformed row: skip below
        if a:
            doculect = chin2cddb[translate.get(a, a)]
            cogid = b[1:-1]
            # keep only the non-Chinese transcription characters, normalized
            txt = ''.join([
                convert.get(x, x) for x in c
                if x not in '\t \n\r"' and not sinopy.is_chinese(x)
            ])
            if doculect != 'Middle_Chinese':
                for r in slice_word(
                        ipa2tokens(txt, merge_vowels=False,
                                   semi_diacritics="ɕ‘'ʑsz'ʂʐʃʒf",
                                   expand_nasals=True)):
                    if '~' in r:
                        pass  # variant readings joined by ~ are skipped
                    else:
                        tks = ''.join(r)
                        i, f, t = ds.split_initial_final(tks)
                        if f and t:
                            if i:
                                prf[i, 'i'][doculect] += 1
                            prf[f, 'f'][doculect] += 1
                            prf[t, 't'][doculect] += 1
                        elif f:
                            if i:
                                prf[i, 'i'][doculect] += 1
                            prf[f, 'f'][doculect] += 1
                        else:
                            # could not split: record the whole string
                            prf[i + f + t, 's'][doculect] += 1
    ds.write_profile(ds.raw('hou-characters.prf'), prf)
def chinese(self, string):
    """Return whether *string* is Chinese, delegating to ``sinopy.is_chinese``."""
    result = sinopy.is_chinese(string)
    return result
def prepare(dataset):
    """Parse the private Schuessler (2007) dictionary dump into ``characters.tsv``.

    The dump is a line-oriented format (ENTRY / HEAD / LH / GLOSS / MC /
    OCB / OCM records); each entry yields one row per attested reading
    layer (Middle Chinese, Old Chinese Baxter, Old Chinese Minimal,
    Late Han).

    Fix: entries were flushed only when the *next* ENTRY line appeared, so
    the file's final entry was silently dropped; the parsed fields are now
    also flushed once after the loop.
    """
    with open(dataset.get_path('raw', '__private__schuessler.txt')) as f:
        data = f.readlines()
    D = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'WORDFAMILY_CLASS',
          'GLOSS', 'READING', 'VARIANT_CLASS', 'SOURCE')]
    idf = ''
    idx = 1
    # fields of the entry currently being parsed
    gloss, pinyin, anc, char, mch, ocb, ocm, lhc = ('', '', '', '', '', '',
                                                    '', '')

    def flush(idx):
        """Emit rows for the entry parsed so far; return the next free id."""
        if idf and sinopy.is_chinese(char.strip()):
            if len(char) > 1 and '-' not in pinyin:
                # multi-character head without hyphenated pinyin: treat the
                # characters as variants of the first one
                variant = char[0]
                chars = list(char)
            else:
                chars = [char]
                variant = ''
            for c in chars:
                if mch:
                    D.append((idx, c, pinyin, 'Middle_Chinese', '', gloss,
                              mch, variant, 'Schuessler2007'))
                    idx += 1
                if ocb:
                    D.append((idx, c, pinyin, 'Old_Chinese', '', gloss, ocb,
                              variant, 'Baxter1992'))
                    idx += 1
                if ocm:
                    D.append((idx, c, pinyin, 'Old_Chinese', anc, gloss, ocm,
                              variant, 'Schuessler2007'))
                    idx += 1
                if lhc:
                    D.append((idx, c, pinyin, 'Late_Han_Chinese', '', gloss,
                              lhc, variant, 'Schuessler2007'))
                    idx += 1
        return idx

    for line in data:
        if line.startswith('ENTRY'):
            # a new entry starts: emit the previous one, then reset state
            idx = flush(idx)
            idf = line[6:].strip()
            gloss, pinyin, anc, char, mch, ocb, ocm, lhc = ('', '', '', '',
                                                            '', '', '', '')
        if line.startswith('HEAD'):
            # the head may carry a word-family link (⪤ / ~ / =) before
            # "pinyin character"
            if '⪤' in line:
                anc, line = line[5:].split('⪤')
                anc = '⪤ ' + anc
            elif '~' in line:
                anc, line = line[5:].split('~')
                anc = '~ ' + anc
            elif '=' in line:
                anc, line = line[5:].split('=')
                anc = '= ' + anc
            else:
                anc, line = '', line[5:]
            line = line.strip()
            anc = anc.strip()
            if line.count(' ') == 1:
                pinyin, char = line.split(' ')
            else:
                # unparseable head: disable this entry
                idf = ''
                print('[Problem]: {0}'.format(line))
        if line.startswith('LH'):
            lhc = line[14:].strip()
        if line.startswith('GLOSS'):
            if gloss:
                gloss += '/' + line[6:].strip()
            else:
                gloss = line[6:].strip()
        if line.startswith('MC'):
            mch = line[18:].strip()
        if line.startswith('OCB'):
            ocb = line[25:].strip()
        if line.startswith('OCM'):
            ocm = line[26:].strip()
    # FIX: flush the trailing entry, previously dropped
    idx = flush(idx)
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for i, line in enumerate(D):
            f.write('\t'.join([str(x) for x in line]) + '\n')
def prepare(dataset):
    """Prepare the Wang (2006) word list: wordlist + per-character table.

    Merges the dialect wordlist with the Old Chinese character file, maps
    concepts to Concepticon ids, computes partial cognate ids per character
    string, and writes ``words`` and ``characters.tsv``.

    Fix: removed the unused ``doculect`` local in the first loop (its value
    was computed but never read).
    """
    # corrections for pinyin readings that sinopy gets wrong.
    # NOTE(review): several values carry a stray '}' ("wú}", "niǎo}",
    # "biǎo}") — looks like copy residue; confirm against the source before
    # cleaning, since these are emitted verbatim.
    pinyin = {
        "虱": "shī", "咯": "gē", "強": "qiáng", "哩": "lǐ", "喏": "nuò",
        "鳧": "fú", "伲": "nǐ", "黃": "huáng", "哋": "dì", "阿": "ā",
        "卵": "luǎn", "說": "shuō", "喙": "huì", "頸": "jǐng", "唔": "wú}",
        "雞": "jī", "黒": "hēi", "哪": "nǎ", "麼": "me", "蔃": "qiáng",
        "葷": "hūn", "鳥": "niǎo}", "舌": "huà", "吃": "chī", "膘": "biǎo}",
        "綠": "lǜ", "羽": "yǔ", "們": "men", "焦": "jiāo", "腳": "jiǎo",
        "乜": "miē", "即": "jí", "佬": "lǎo"
    }
    wl = Wordlist(dataset.get_path('raw', 'D_wang-2006.tsv'))
    concepts = dict([
        (x.english, x.concepticon_id)
        for x in Concepticon().conceptlists['Wang-2006-200'].concepts.values()
    ])
    D = {}
    och = csv2list(dataset.get_path('raw', 'D_old_chinese.csv'))
    nidx = max([k for k in wl]) + 1
    wl.add_entries('concepticon_id', 'concept', lambda x: concepts[x])
    wl.add_entries('doculect_in_source', 'doculect', lambda x: x)
    for k in wl:
        D[k] = [
            wl[k, h] for h in [
                'doculect', 'doculect_in_source', 'concept',
                'concepticon_id', 'ipa', 'partial'
            ]
        ]
    D[0] = [
        'doculect', 'doculect_in_source', 'concept', 'concepticon_id',
        'value', 'characters'
    ]
    # append the Old Chinese character entries
    for a, chars in och:
        for char in chars.split(','):
            if char != '-':
                D[nidx] = [
                    'Old_Chinese', 'Old_Chinese', a, concepts[a], char, char
                ]
                nidx += 1
    wl2 = Wordlist(D)
    renumber_partial(wl2, name='cogids', partial_cognates='characters')
    wl2.output('tsv', filename=dataset.get_path('words'), ignore='all',
               prettify=False)
    # we also write the characters
    C = [[
        'ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
        'DOCULECT', 'POSITION'
    ]]
    idx = 1
    errors = {}  # characters whose pinyin is still unresolved, for reporting
    for k in wl2:
        concept = wl2[k, 'concept']
        doculect = wl2[k, 'doculect']
        chars = sinopy.gbk2big5(wl2[k, 'value'])
        cogids = wl2[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                py = pinyin.get(char, py)
                if '?' in py or '{' in py:
                    # record only the first unresolved reading per character
                    errors.setdefault(char, py)
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    # print remaining bad readings in copy-pastable dict-entry form
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')