import pandas as pd
import hangul_jamo
from tqdm import tqdm

# make_wordlist, partial_strip, fold_vietnamese and logger are defined elsewhere
# in the surrounding project.


def make_topwords(lang, letters):
    # For each letter, find the most frequent word beginning with it; cache the
    # result as a CSV so it is only computed once per language.
    try:
        tw = pd.read_csv(f"../corpora/wikis/{lang}-topwords.csv", index_col=0)
    except FileNotFoundError:
        logger.info(f"Generating {lang} top words")
        pc = make_wordlist(lang)
        pc = pc / pc.sum()
        # Group word frequencies by the first letter of the normalised word:
        # Vietnamese is tone-folded, Korean is decomposed into jamo.
        gb = pc.groupby(pc.index.map(lambda s: partial_strip(
            (
                fold_vietnamese(s) if lang == "vi"
                else hangul_jamo.decompose(s) if lang == "ko"
                else s
            ),
            letters)[0:1]))
        # For each letter, keep the single most frequent word and its probability.
        d = {c: [matches.index[0], matches[0]]
             for c in tqdm(letters)
             for matches in [gb.get_group(c) if c in gb.groups else []]
             if len(matches) > 0}
        tw = pd.DataFrame.from_dict(d, orient="index", columns=["word", "pc"])
        tw.to_csv(f"../corpora/wikis/{lang}-topwords.csv")
    if lang == "ko":
        # Drop Korean entries whose top word starts with a bare jamo character.
        tw = tw[~tw.word.map(lambda s: hangul_jamo.is_jamo_character(s[0]))]
    return tw
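# Hypothetical call for illustration only (the letter set is an assumption; the real
# letter inventories and corpus paths come from the surrounding project):
#   topwords = make_topwords("en", list("abcdefghijklmnopqrstuvwxyz"))
#   print(topwords.loc["a"])   # most frequent a-word and its probability mass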
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import hangul_jamo

list = []
list2 = []
printlist = ""
# Prompt: "Enter your date of birth (e.g. 1993 10 08): "
year, month, day = map(int, input("생년월일을 입력하세요(ex 1993 10 08) : ").split())
this_year = 2018 - year
# Prompt: "Enter your name: "
name = input("이름을 입력하세요 : ")
# Prompt: "Enter the year you want to look up: "
calendar_year = int(input("알고 싶은 해를 입력하세요 : "))
name2 = name[1:]
name1 = name[0]
# Decompose the full name, the family name and the given name into jamo.
name_dec = hangul_jamo.decompose(name)
name_dec1 = hangul_jamo.decompose(name1)
name_dec2 = hangul_jamo.decompose(name2)
sum = year + month + day
jaum_sum = 0
moum_sum = 0
for i in range(0, 80):
    sumx = str(sum + i)
    birthsum = 0
    for j in range(len(sumx)):
        birthsum += int(sumx[j])
    # Reduce any digit sum above 22 by adding its two digits.
    if birthsum > 22:
        birthsum = str(birthsum)
        birthsum = int(birthsum[0]) + int(birthsum[1])
    list.append(birthsum)
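# Worked example of the digit-sum loop above (input values assumed for illustration):
# for year, month, day = 1993, 10, 8 the base sum is 1993 + 10 + 8 = 2011.
# At i = 0, sumx = "2011" and birthsum = 2 + 0 + 1 + 1 = 4, which is <= 22, so 4 is appended;
# only when the digit sum exceeds 22 is it reduced once more by adding its two digits.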
import hangul_jamo


def split_jamos(word):
    # Thin wrapper: decompose Hangul syllables in `word` into compatibility jamo.
    return hangul_jamo.decompose(word)
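# Example usage (expected output shown as a comment):
#   split_jamos("한글")  # -> 'ㅎㅏㄴㄱㅡㄹ'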
import os
import random
import re
import sys
from unicodedata import normalize

import hangul_jamo

# tokenize, junkToSpace and ucategory (presumably unicodedata.category) are defined
# elsewhere in the surrounding project.


def isGoodToken(token):
    #if (ucategory(token[0]) in {r'Lt', r'Lu'} and (lang != 'de')): return False
    # Reject a token only when neither its first nor its last character is a letter.
    if ((ucategory(token[0])[0] != r'L') and (ucategory(token[-1])[0] != r'L')):
        return False
    return token


if ((len(sys.argv) != 2) or ((not os.path.isfile(sys.argv[1]) and (sys.argv[1] != r'-')))):
    sys.stderr.write("Usage: '" + sys.argv[0] + "' TEXT_FILE|-\n")
    exit(1)
elif (sys.argv[1] == r'-'):
    # Read from stdin, normalise, decompose Hangul, tokenize, and write to stdout.
    bulk = sys.stdin.buffer.read().decode(errors='ignore').translate(junkToSpace)
    bulk = hangul_jamo.decompose(normalize('NFC', bulk.casefold()))
    bulk = [t for t in tokenize(re.sub(r'\s+', r' ', bulk)) if isGoodToken(t)]
    out = sys.__stdout__
else:
    # Read the named file, tokenize, decompose Hangul, and overwrite the file in place.
    bulk = re.sub(r'\s+', r' ', open(sys.argv[1], 'rb').read().decode(errors='ignore'))
    bulk = [
        t for t in tokenize(bulk.translate(junkToSpace).casefold()) if isGoodToken(t)
    ]
    bulk = [hangul_jamo.decompose(t) for t in bulk]
    out = open(sys.argv[1], 'w')

random.shuffle(bulk)
sys.stderr.write(str(len(bulk)) + "\n")
out.write(re.sub(r'\s+', r' ', (r' ' + r' '.join(bulk) + r' ')))
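# Invocation sketch ("shuffle_corpus.py" stands in for whatever this file is called):
#   python shuffle_corpus.py corpus.txt               # shuffle tokens, rewrite corpus.txt in place
#   python shuffle_corpus.py - < corpus.txt > out.txt  # read stdin, write the shuffled tokens to stdout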
# (Excerpt; assumes compose, decompose, compose_jamo_characters and decompose_syllable
# were imported from hangul_jamo earlier in the demo.)
print('compose_jamo_characters("ㄱ", "ㅏ", None) ==',
      compose_jamo_characters('ㄱ', 'ㅏ', None))
# compose_jamo_characters("ㄱ", "ㅏ", None) == 가
print('compose_jamo_characters("ㄱ", "ㅏ") ==',
      compose_jamo_characters('ㄱ', 'ㅏ'))
# compose_jamo_characters("ㄱ", "ㅏ") == 가
print('compose_jamo_characters("ㄱ", "ㅏ", "ㅎ") ==',
      compose_jamo_characters('ㄱ', 'ㅏ', 'ㅎ'))
# compose_jamo_characters("ㄱ", "ㅏ", "ㅎ") == 갛

# 4. DECOMPOSING HANGUL SYLLABLES
print('decompose_syllable("가") ==', decompose_syllable('가'))
# decompose_syllable("가") == ('ㄱ', 'ㅏ', None)
print('decompose_syllable("갛") ==', decompose_syllable('갛'))
# decompose_syllable("갛") == ('ㄱ', 'ㅏ', 'ㅎ')

# 4.1. USING UNPACKING ARGUMENTS OPERATOR *
print('compose_jamo_characters(*decompose_syllable("가")) ==',
      compose_jamo_characters(*decompose_syllable('가')))
# compose_jamo_characters(*decompose_syllable("가")) == 가
print('compose_jamo_characters(*decompose_syllable("갛")) ==',
      compose_jamo_characters(*decompose_syllable('갛')))
# compose_jamo_characters(*decompose_syllable("갛")) == 갛

# 5. COMPOSING TEXT
print('compose("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!") ==',
      compose('ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!'))
# compose("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!") == 안녕하세요! Hello!

# 6. DECOMPOSING TEXT
print('decompose("안녕하세요! Hello!") ==', decompose('안녕하세요! Hello!'))
# decompose("안녕하세요! Hello!") == ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!
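# Round-trip sketch built from the two examples above: decomposing text and composing
# it back should recover the original string.
assert compose(decompose('안녕하세요! Hello!')) == '안녕하세요! Hello!'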
from hangul_jamo import decompose


def test_decompose():
    assert decompose('대한민국은 민주공화국이다.') == 'ㄷㅐㅎㅏㄴㅁㅣㄴㄱㅜㄱㅇㅡㄴ ㅁㅣㄴㅈㅜㄱㅗㅇㅎㅘㄱㅜㄱㅇㅣㄷㅏ.'
    # Text without Hangul syllables passes through decompose unchanged.
    assert decompose(
        'Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof'
    ) == 'Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof'
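# A complementary composition check (an addition, not part of the original test file),
# reusing the example string from the library demo above.
def test_compose():
    from hangul_jamo import compose
    assert compose('ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!') == '안녕하세요! Hello!'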
import argparse
import re

import hangul_jamo
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--input-file', type=str, required=True)
parser.add_argument('--output-file', type=str, required=True)
args = parser.parse_args()

do_jamo = 'kor' in args.input_file.lower()

# re_diacritics = re.compile(r"[ːᵝ̆ ͈ ̟ ̠ ̥ ̊ ̃ ̞ˀ˕̹]")
# the above has too many spaces
re_diacritics = re.compile(r"[ːᵝ͈̟̠̥̞̆̊̃ˀ˕̹]")

with open(args.input_file) as inf, open(args.output_file, 'w') as ouf:
    for line in tqdm(inf):
        grapheme, phoneme = line.strip().split('\t')
        phoneme = re_diacritics.sub('', phoneme).replace('w͍', 'w')
        if do_jamo:
            grapheme = hangul_jamo.decompose(grapheme)
        ouf.write(f"{grapheme}\t{phoneme}\n")
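# Illustration of one line's transformation (the lexicon entry is hypothetical):
#   input line:   "하나\thanaː"
#   re_diacritics strips the length mark ː  -> phoneme "hana"
#   with do_jamo=True the grapheme is decomposed -> "ㅎㅏㄴㅏ"
#   output line:  "ㅎㅏㄴㅏ\thana"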