# Imports assumed by the functions below, inferred from the calls they make.
# Dataset, get_sources, cddb_path, segments, and renumber_partial look like
# helpers from the surrounding project, so their import paths are not listed
# here; the rest are standard or well-known libraries:
import re
from collections import defaultdict
from sys import argv

from clldutils.dsv import UnicodeReader
from clldutils.text import strip_brackets
from lingpy import Wordlist
from lingpy.read.csv import csv2list
from pyconcepticon.api import Concepticon
import sinopy


def prepare(dataset):
    with UnicodeReader(dataset.get_path('raw', 'O_shijing.tsv'),
                       delimiter='\t') as reader:
        data = list(reader)
    header = [h.lower() for h in data[0]]
    C = [('ID', 'CHARACTER', 'PINYIN', 'DOCULECT', 'SHIJING_NAME',
          'SHIJING_NUMBER', 'STANZA', 'VERSE', 'RHYME_CLASS', 'POSITION',
          'TEXT', 'ORDER', 'SOURCE')]
    for line in data[1:]:
        tmp = dict([(a, b.strip()) for a, b in zip(header, line)])
        poem = '·'.join((tmp['block'], tmp['chapter'], tmp['title']))
        poem_number = tmp['number']
        stanza = tmp['stanza']
        verse = tmp['verse']
        char = tmp['character']
        # get the position of the rhyme character in the verse
        pos = str(tmp['raw_section'].index(char))
        text = tmp['raw_section'] + tmp['endchar']
        rhymeid = tmp['rhyme']
        pinyin = sinopy.pinyin(char)
        order = tmp['section_number']
        if '?' in pinyin or sinopy.is_chinese(pinyin):
            pinyin = ''
        C += [[tmp['id'], char, pinyin, 'Old_Chinese', poem, poem_number,
               stanza, verse, rhymeid, pos, text, order, 'Baxter1992']]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join(line) + '\n')

def main():
    debug = False
    if 'debug' in argv or '--debug' in argv:
        debug = True
    if 'pinyin' in argv:
        py = sinopy.pinyin(argv[argv.index('pinyin') + 1])
        print(py)
    if 'profile' in argv:
        if '--cldf' in argv:
            wl = Wordlist.from_cldf(argv[argv.index('profile') + 1],
                                    col='language_id', row='parameter_id')
            wl.add_entries('doculect', 'language_name', lambda x: x)
        else:
            wl = Wordlist(argv[argv.index('profile') + 1])
        column = 'ipa'
        language = None
        filename = 'orthography.tsv'
        if '--column' in argv:
            column = argv[argv.index('--column') + 1]
        if '--language' in argv:
            language = argv[argv.index('--language') + 1]
        if '-l' in argv:
            language = argv[argv.index('-l') + 1]
        if '-o' in argv:
            filename = argv[argv.index('-o') + 1]
        if '--filename' in argv:
            filename = argv[argv.index('--filename') + 1]
        segments.write_structure_profile(wl, column=column, filename=filename,
                                         debug=debug, language=language)
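
# Example invocations, reconstructed from the argument handling in main()
# above (the script name is hypothetical):
#
#     python cli.py pinyin 學
#     python cli.py profile words.tsv --column ipa -o profile.tsv --debug
#     python cli.py profile cldf-metadata.json --cldf -l Old_Chinese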

def pinyin(self, chars):
    py = []
    for char in chars:
        p = sinopy.pinyin(char)
        # drop readings sinopy could not resolve (marked by '?' or by the
        # character being echoed back instead of a romanization)
        if '?' in p or sinopy.is_chinese(p):
            p = ''
        py += [p]
    return ' '.join(py)
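
# Usage sketch (assumes the method is bound to some dataset object; the
# exact output depends on sinopy's reading data):
#
#     >>> ds.pinyin('大學')
#     'dà xué'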

def character_list():
    sources = get_sources('characters.tsv')
    master = defaultdict(list)
    occs = defaultdict(list)
    doculects = set()
    for source in sources:
        print('[preparing]', source)
        ds = Dataset(source)
        for char in ds.characters.rows:
            occs[char] += [source]
            tmp = ds.characters.get_dict(row=char)
            readings = []
            for t, chars in tmp.items():
                for c in chars:
                    _data = (t, c, source, )
                    for h in ['reading', 'fanqie', 'phonetic_class',
                              'semantic_class', 'rhyme_class',
                              'wordfamily_class', 'source']:
                        if ds.characters[c, h]:
                            _data += (ds.characters[c, h], )
                        else:
                            _data += ('', )
                    readings += [_data]
            for reading in readings:
                master[char].append(reading)
            doculects.add(t)
    table, idx = [], 1
    for i, (char, vals) in enumerate(master.items()):
        if (len(occs[char]) == 2 and 'Guangyun' in occs[char]
                and 'Shuowen' in occs[char]):
            pass
        elif len(occs[char]) == 1 and ('Guangyun' in occs[char]
                                       or 'Shuowen' in occs[char]):
            pass
        else:
            pinyin = sinopy.pinyin(char)
            if sinopy.is_chinese(pinyin) or '?' in pinyin or '!' in pinyin:
                pinyin = ''
            for t, crossref, dataset, reading, fq, pc, sc, rc, wf, src in vals:
                table += [(idx, char, pinyin, t, reading, fq, pc, sc, rc, wf,
                           src, dataset, crossref)]
                idx += 1
    with open(cddb_path('datasets', 'characters.tsv'), 'w') as f:
        f.write('ID\tCHARACTER\tPINYIN\tDOCULECT\tREADING\tFANQIE\t'
                'PHONETIC_CLASS\tSEMANTIC_CLASS\tRHYME_CLASS\t'
                'WORDFAMILY_CLASS\tSOURCE\tDATASET\tDATASET_ID\n')
        for line in table:
            f.write('\t'.join([str(x) for x in line]) + '\n')

def wikibooks():
    with open('wikibooks.txt') as f:
        data = f.readlines()
    out = []
    gsr = defaultdict(dict)
    for i, line in enumerate(data):
        line = strip_brackets(line.strip().replace('\t', ' '),
                              brackets={'(': ')'})
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = ''
                    karlgren = kgsc[0]
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                if len(karlgren) not in [4, 5, 6]:
                    print('[ERROR:karlgren] {0} {1}'.format(line, karlgren))
                elif not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', karlgren[:4],
                             karlgren, '', 'Karlgren1954')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016a')]
                        gsr[char][reading] = [pinyin, reading, karlgren]
    with open('karlgren.tsv', 'w') as f:
        f.write('ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\t'
                'KARLGREN_ID\tREADING\tSOURCE\n')
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')
    return gsr
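
# The returned mapping nests readings under characters, i.e.
# gsr[character][middle_chinese_reading] -> [pinyin, reading, karlgren_id],
# so downstream code can iterate over it like this:
#
#     gsr = wikibooks()
#     for char, readings in gsr.items():
#         for reading, (pinyin, _, karlgren_id) in readings.items():
#             ...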

def prepare(dataset):
    with open(dataset.get_path('raw', 'wikibooks.txt')) as f:
        data = f.readlines()
    out = []
    for i, line in enumerate(data):
        line = line.strip().replace('\t', ' ')
        if line.startswith('*'):
            if not line[1] == ' ':
                line = line.replace('*', '* ')
            elms = line.split(' ')
            if elms and len(elms) > 1:
                kgsc = elms[1].split('/')
                if len(kgsc) == 1:
                    schuessler = kgsc[0]
                    karlgren = ''
                elif len(kgsc) == 2:
                    karlgren = kgsc[1]
                    schuessler = kgsc[0]
                else:
                    print('[ERROR:schuessler/karlgren] {0}'.format(line))
                try:
                    char = elms[2].split('|')[-1][0]
                except IndexError:
                    print('[ERROR:character] {0}'.format(line))
                    char = ''
                mch = [x[:-1] if x.endswith(',') else x for x in elms[3:]]
                #if len(karlgren) not in [4, 5, 6]:
                #    print('[ERROR:karlgren] {0}'.format(line, karlgren))
                if not sinopy.is_chinese(char):
                    print('[ERROR:char] {0}'.format(line))
                elif char:
                    pinyin = sinopy.pinyin(char)
                    if '?' in pinyin or sinopy.is_chinese(pinyin):
                        pinyin = ''
                    out += [(char, pinyin, 'Old_Chinese', schuessler,
                             karlgren, '', 'Schuessler2009')]
                    for reading in mch:
                        out += [(char, pinyin, 'Middle_Chinese', '', karlgren,
                                 reading, 'Wikibooks2016b')]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write('ID\tCHARACTER\tPINYIN\tDOCULECT\tPHONETIC_CLASS\t'
                'KARLGREN_ID\tREADING\tSOURCE\n')
        for i, line in enumerate(out):
            f.write(str(i + 1) + '\t' + '\t'.join(line) + '\n')

def prepare(dataset):
    with open(dataset.get_path('raw', 'sbgy.xml')) as f:
        data = f.readlines()
    D = [('ID', 'CHARACTER_ID', 'CHARACTER', 'DOCULECT', 'PINYIN', 'READING',
          'FANQIE', 'RHYME_ID', 'RHYME_NUMBER', 'VOLUME', 'NOTE', 'SOURCE')]
    volume, rhyme_id, rhyme, ipa, fanqie, text = '', '', '', '', '', ''
    idx = 1
    for line in data:
        if '<volume id' in line:
            volume = re.findall('id="(.*?)"', line)[0]
        if 'rhyme id="' in line:
            rhyme_id = re.findall('id="(.*?)"', line)[0]
        if rhyme_id:
            if 'rhyme_num' in line:
                rhyme = re.findall('>(.*?)<', line)[0]
            if 'voice_part ipa' in line:
                ipa = re.findall('ipa="(.*?)"', line)[0]
            if '/word_head' in line and text:
                text = text.replace('\n', ' ').strip()
                charid, char = re.findall(
                    '<word_head id="(.*?)">(.*?)<', text)[0]
                note = re.findall('<note>(.*?)</note>', text)
                note = note[0] if note else ''
                if 'fanqie' in text:
                    fanqie = re.findall('<fanqie>(.*?)</fanqie>', text)[0]
                pinyin = sinopy.pinyin(char)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                if sinopy.is_chinese(char.strip()):
                    D += [(str(idx), charid, char.strip(), 'Middle_Chinese',
                           pinyin, ipa, fanqie.strip(), rhyme_id.strip(),
                           rhyme.strip(), volume, note.strip(), 'Zhou1938')]
                    idx += 1
                else:
                    print('[ERROR]', char, sinopy.is_chinese(char), ipa,
                          fanqie)
                text = ''
            if text:
                text += line
            if 'word_head id' in line:
                text = line
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in D:
            f.write('\t'.join([l.replace('\t', '') for l in line]) + '\n')

def prepare(dataset):
    img_url = "http://kanji-database.sourceforge.net/dict/swjz/swjz-img/"
    with open(dataset.get_path('raw', 'swjz.xml')) as f:
        data = f.readlines()
    wordheads = {}
    blocks, chapter, idx = {}, '', 1
    blocks[idx] = ''
    for i, line in enumerate(data):
        if '<chaptertitle id' in line:
            chapter = re.findall('id="(.*?)"', line)[0]
        if blocks[idx]:
            blocks[idx]['text'] += '\n' + line.strip()
        if '<shuowen>' in line:
            blocks[idx] = dict(text=line.strip(), chapter=chapter)
        if '</shuowen>' in line:
            idx += 1
            blocks[idx] = False
    for i, block in [(a, b) for a, b in blocks.items() if b]:
        for line in block['text'].split('\n'):
            if 'wordhead' in line:
                wid, img, char = re.findall('id="(.*?)" img="(.*?)">(.*?)<',
                                            line)[0]
                pinyin = sinopy.pinyin(char)
                if '?' in pinyin or sinopy.is_chinese(pinyin):
                    pinyin = ''
                wordheads[wid] = dict(explanations=[], char=char, notes=[],
                                      img=img, block=i + 1, pinyin=pinyin,
                                      doculect='Old_Chinese',
                                      chapter=block['chapter'])
            if 'explanation>' in line:
                wordheads[wid]['explanations'] += [
                    re.findall('>(.*?)<', line)[0]
                ]
                # parse the structural formula "从X。Y聲" into radical
                # (semantic) and phonetic components
                structure = re.findall('从(.)。(.)聲', line)
                if structure:
                    wordheads[wid]['radical'] = structure[0][0]
                    wordheads[wid]['phonetic'] = structure[0][1]
            if 'duan_note>' in line:
                wordheads[wid]['notes'] += [re.findall('>(.*?)<', line)[0]]
                fq = re.findall('>(..)切。', line)
                if fq and sinopy.is_chinese(fq[0]):
                    wordheads[wid]['fanqie'] = fq[0]
                bu = re.findall('(.)部。', line)
                if bu and sinopy.is_chinese(bu[0]):
                    wordheads[wid]['rhyme'] = bu[0]
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        f.write('ID\tCHARACTER_ID\tGROUP\tCHARACTER\tPINYIN\tDOCULECT\t'
                'SEMANTIC_CLASS\tPHONETIC_CLASS\tFANQIE\tRHYME_CLASS\t'
                'CHAPTER\tIMAGE\tTEXT\tCOMMENT\n')
        idx = 1
        for k, vals in sorted(wordheads.items(), key=lambda x: x[0]):
            addons = []
            for key in ['block', 'char', 'pinyin', 'doculect', 'radical',
                        'phonetic', 'fanqie', 'rhyme', 'chapter', 'img',
                        'explanations', 'notes']:
                val = vals.get(key, '')
                if isinstance(val, list):
                    val = ' / '.join(val)
                addons += [str(val).replace('\t', ' ')]
            f.write(str(idx) + '\t' + k + '\t' + '\t'.join(addons) + '\n')
            idx += 1

def prepare(dataset):
    # correct wrong pinyin readings in sinopy
    pinyin = {
        "虱": "shī", "咯": "gē", "強": "qiáng", "哩": "lǐ", "喏": "nuò",
        "鳧": "fú", "伲": "nǐ", "黃": "huáng", "哋": "dì", "阿": "ā",
        "卵": "luǎn", "說": "shuō", "喙": "huì", "頸": "jǐng", "唔": "wú",
        "雞": "jī", "黒": "hēi", "哪": "nǎ", "麼": "me", "蔃": "qiáng",
        "葷": "hūn", "鳥": "niǎo", "舌": "huà", "吃": "chī", "膘": "biǎo",
        "綠": "lǜ", "羽": "yǔ", "們": "men", "焦": "jiāo", "腳": "jiǎo",
        "乜": "miē", "即": "jí", "佬": "lǎo"
    }
    wl = Wordlist(dataset.get_path('raw', 'D_wang-2006.tsv'))
    concepts = dict([
        (x.english, x.concepticon_id) for x in
        Concepticon().conceptlists['Wang-2006-200'].concepts.values()
    ])
    D = {}
    och = csv2list(dataset.get_path('raw', 'D_old_chinese.csv'))
    nidx = max([k for k in wl]) + 1
    wl.add_entries('concepticon_id', 'concept', lambda x: concepts[x])
    wl.add_entries('doculect_in_source', 'doculect', lambda x: x)
    for k in wl:
        doculect = wl[k, 'doculect'].replace('_B', '')
        D[k] = [wl[k, h] for h in ['doculect', 'doculect_in_source',
                                   'concept', 'concepticon_id', 'ipa',
                                   'partial']]
    D[0] = ['doculect', 'doculect_in_source', 'concept', 'concepticon_id',
            'value', 'characters']
    for a, chars in och:
        for char in chars.split(','):
            if char != '-':
                D[nidx] = ['Old_Chinese', 'Old_Chinese', a, concepts[a],
                           char, char]
                nidx += 1
    wl2 = Wordlist(D)
    renumber_partial(wl2, name='cogids', partial_cognates='characters')
    wl2.output('tsv', filename=dataset.get_path('words'), ignore='all',
               prettify=False)
    # we also write the characters
    C = [['ID', 'CHARACTER', 'PINYIN', 'WORDS_COGIDS', 'WORDS_ID', 'CONCEPT',
          'DOCULECT', 'POSITION']]
    idx = 1
    errors = {}
    for k in wl2:
        concept = wl2[k, 'concept']
        doculect = wl2[k, 'doculect']
        chars = sinopy.gbk2big5(wl2[k, 'value'])
        cogids = wl2[k, 'cogids'].split(' ')
        for i, (char, cogid) in enumerate(zip(chars, cogids)):
            if sinopy.is_chinese(char):
                py = sinopy.pinyin(char)
                py = pinyin.get(char, py)
                if '?' in py or '{' in py:
                    if char not in errors:
                        errors[char] = py
                C += [[idx, char, py, cogid, k, concept, doculect, i]]
                idx += 1
    # print unresolved readings in a copy-paste-ready form for the
    # correction dictionary above
    for k, v in errors.items():
        print('"' + k + '" : "' + v + '",')
    with open(dataset.get_path('characters.tsv'), 'w') as f:
        for line in C:
            f.write('\t'.join([str(x) for x in line]) + '\n')
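
# Usage sketch: each prepare() above appears to be a hook called by the
# dataset framework with a Dataset instance whose get_path() resolves files
# inside that dataset's directory (the dataset name below is hypothetical):
#
#     prepare(Dataset('Wang2006'))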