    def romanizeHangul(self, string):
        transliter = Transliter(academic)
        return transliter.translit(string.decode("utf-8")).replace("-", "")

# t = Translator()
# print t.translateIDKR("aku orang indonesia")
# print t.translateKRID("나는 인도네시아")
# print t.romanizeHangul("aa 나는 인도네시아 어이다.")
# print t.preprocessKR("나는 인도네시아")
def kor2en(str):
    ad = AlphabetDetector()
    inputTitle = str
    outputTitle = ""
    # set invalid chars except . for extension
    invalidChars = set(string.punctuation.replace(".", ""))
    # drop invalid chars
    for ch in inputTitle:
        if ch not in invalidChars:
            outputTitle += ch
    # romanize only if the remaining text is not already Latin-only
    if not ad.only_alphabet_chars(outputTitle, "LATIN"):
        transliter = Transliter(academic)
        outputTitle = transliter.translit(outputTitle)
    return outputTitle
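# A hypothetical usage sketch for kor2en(), assuming alphabet_detector,
# hangul-romanize, and the standard string module are imported as the function
# expects; the file name and romanized output below are illustrative only.
print(kor2en("나의 노래.mp3"))  # punctuation (except ".") stripped, then romanized, roughly "naui norae.mp3"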
def encode_strings(string: list, v) -> dict:
    # here we encode the string as binary strings
    # and then back into numbers
    d = {}
    for i in range(len(string)):
        vector = list(v.vectorize(u"{x}".format(x=string[i])))
        try:
            r = Romanizer(string[i]).romanize()
        except Exception:
            # fall back to hangul_romanize if Romanizer fails
            transliter = Transliter(academic)
            r = transliter.translit(string[i])
        normalizer(vector, 7)
        res = list(format(ord(ch), '08b') for ch in r)
        number = list(map(lambda w: int(w, 2), res))
        normalizer(number, 7)
        double = "{hangul}/{roman}".format(hangul=string[i], roman=r)
        d[double] = (vector, number)
    return d
def transliter(text, code):
    if code == 'ja':
        textt = text
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")        # Hiragana to ascii, default: no conversion
        kakasi.setMode("K", "a")        # Katakana to ascii, default: no conversion
        kakasi.setMode("J", "a")        # Japanese to ascii, default: no conversion
        kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
        kakasi.setMode("s", True)       # add space, default: no separator
        kakasi.setMode("C", True)       # capitalize, default: no capitalize
        conv = kakasi.getConverter()
        result = conv.do(textt)
        return result
    elif code == 'ko':
        transliter = Transliter(academic)
        return transliter.translit(text)
    else:
        return translit(text, code, reversed=True)
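# A minimal usage sketch for the transliter() dispatcher above, assuming pykakasi,
# hangul-romanize, and transliterate are installed; the outputs noted are illustrative.
print(transliter("こんにちは", 'ja'))  # Hepburn romaji via pykakasi
print(transliter("안녕하세요", 'ko'))  # academic-rule romanization, roughly "annyeonghaseyo"
print(transliter("привет", 'ru'))     # transliterate package, reversed (Cyrillic -> Latin)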
def main():
    global INPUT
    global OUTPUT
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help='Input file', default=INPUT)
    parser.add_argument('-o', '--output', help='Output file', default=OUTPUT)
    args = parser.parse_args()
    INPUT = args.input
    OUTPUT = args.output
    transliter = Transliter(academic)
    convertRoman(transliter)
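# Hypothetical invocation sketch based on the argparse flags above; the script name
# and file names are placeholders, and convertRoman() plus the INPUT/OUTPUT defaults
# are assumed to be defined elsewhere in the same file:
#
#     python romanize.py --input lyrics_ko.txt --output lyrics_romanized.txt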
from discord.ext import commands
import requests
import json
from bs4 import BeautifulSoup as bSoup
from secret_keys import LF_API_KEY, GENIUS_CLIENT_KEY
from db import users
import os
import calendar
import time
from lyricsgenius import Genius
from hangul_romanize import Transliter
from hangul_romanize.rule import academic

transliter = Transliter(academic)
genius = Genius(GENIUS_CLIENT_KEY)


class LastFmCog(commands.Cog):
    """ LastFM stuff """

    def __init__(self, bot):
        self.bot = bot
        self.__cog_name__ = "LastFM"

    @commands.command(aliases=['lf'])
    async def lastfm(self, ctx, *args):
        mentioned_users = ctx.message.mentions
        for mentioned_user in mentioned_users:
        return reader(self.term, term, previous)
        return self.read_as(from_, from_, previous_terms, word_id, translation, table)


@dataclasses.dataclass(frozen=True)
class WesternTerm(Term):
    loan: str
    locale: Locale

    def romanize(self, locale: Locale) -> Markup:
        r = super().romanize(locale)
        return Markup(r.capitalize()) if self.loan[0].isupper() else r


hangul_romanize_transliter = Transliter(academic)

romanizers: Mapping[Locale, Callable[[str], Markup]] = {
    Locale.parse('ja'): lambda t: Markup(to_roma(t.replace(' ', ''))),
    Locale.parse('ko'): lambda t: Markup(
        hangul_romanize_transliter.translit(t.replace(' ', ''))),
    Locale.parse('zh_CN'): lambda t: Markup(to_pinyin(t).replace(' ', '')),
    Locale.parse('zh_HK'): lambda t: Markup(
        re.sub(
            r'(\d) ?',
            r'<sup>\1</sup>',
            t if re.match(r'^[A-Za-z0-9 ]+$', t)
            else pinyin_jyutping_sentence.jyutping(t, True, True))),
    Locale.parse('zh_TW'): lambda t: Markup(zhuyin_to_pinyin(to_zhuyin(t)).replace(' ', '')),
class KoInterlinear:
    def __init__(self, content, con):
        self.content = content
        self.content = [x.strip() for x in self.content.splitlines()]
        self.con = con
        self.pointer_char = "⎆"  #➾ #➸ #❭
        self.wrapper = textwrap.TextWrapper(placeholder="…")
        self.output = []
        self.tipnumber = 0
        self.mecab = mecablight.MeCabLight()
        self.lemmatizer = Lemmatizer()
        self.transliterator = Transliter(academic)
        self.kogrammarlinks = kogrammarlinks.KoGrammarLinks()
        #self.missing_words = []
        self.particle_classes = ['E', 'J', 'S', 'X']

        ######################
        # connect to database and check tables are there
        self.cur = self.con.cursor()

    def get_sejongtagset_name(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][0]
        else:
            return ""

    def get_sejongtagset_superclass(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][1]
        else:
            print("Note: can't find " + tag + " in tagset!")
            return ""

    def get_sejongtagset_abbrev(self, tag):
        tag = tag.split("+")[0]  # go with first tag if multiple
        if tag in sejongtagset.SejongTagset:
            return sejongtagset.SejongTagset[tag][2]
        else:
            print("Note: can't find " + tag + " in tagset!")
            return ""

    def get_trans_fetch(self, word, original_word):
        self.cur.execute(
            "SELECT word, def, extradata FROM korean_english WHERE word = %s ORDER BY extradata, wordid;",
            (word, ))
        rows = self.cur.fetchall()
        if type(rows) is list and rows:
            return [(row[1] if word == original_word else
                     "[" + row[0] + "] " + row[1]) for row in rows
                    if row is not None and row[1] is not None and row[1] != ""]
        else:
            return []

    def fetch_phrase_translations(self, wordstr, nextwordstr=None, nextnextwordstr=None):
        if wordstr is not None and wordstr != "" and nextwordstr != "" and nextnextwordstr != "":
            if nextwordstr is None or nextwordstr == "":
                Exception("fetch_phrase_translations sent None for word2.")
            if nextnextwordstr is not None and nextnextwordstr != "":
                self.cur.execute(
                    "SELECT phrase, def AS trans FROM korean_english_phrase2word WHERE word1 = %s AND word2 = %s UNION SELECT phrase, def AS trans FROM korean_english_phrase3word WHERE word1 = %s AND word2 = %s AND word3 = %s;",
                    (wordstr, nextwordstr, wordstr, nextwordstr, nextnextwordstr))
            else:
                self.cur.execute(
                    "SELECT phrase, def AS trans FROM korean_english_phrase2word WHERE word1 = %s AND word2 = %s;",
                    (wordstr, nextwordstr))
        else:
            return []
        rows = self.cur.fetchall()
        if type(rows) is list and rows:
            return_rows = [
                " ".join([row[0], row[1]]) for row in rows
                if row is not None and row[1] is not None and row[1] != ""
            ]
            return return_rows
        return []

    def try_lemmatization_methods(self, branch, original_word, block=True):
        plain_word = self.get_plain_word(branch)
        is_verb = any(morph[1][0] == "V" for morph in branch
                      if len(morph) > 1 and len(morph[1]) > 0)
        if plain_word == "":
            return []
        #print("plain word " + plain_word)
        # first look up the given word in dictionary
        translations = self.get_trans_fetch(plain_word, original_word)
        if translations:
            return translations
        # then try lemmatizing with soylemma (returns multiple), see if in dic
        lemmatized_words = self.lemmatizer.lemmatize(plain_word)
        if lemmatized_words:
            #grab just the words, removing classes
            lemmatized_words = set(lemmatized[0] for lemmatized in lemmatized_words
                                   if lemmatized)
            #print("Soylemma Lemmatization of " + str(plain_word))
            #print(lemmatized_words)
            for lemmatized_word in lemmatized_words:
                # check if mecab's input was classed as verb if we're adding da,
                # soylemma's classification sometimes wrongly ascribes as verb and adds da therefore
                # so, block lemmatization that added 다 in cases where mecab didn't call it a verb
                # but soylemma added 다 anyway to a word that didn't have it to begin with
                if block and (not is_verb) and lemmatized_word[-1] == "다" and plain_word[-1] != "다":
                    #print("blocked soylemma's addition of 다 for word "+plain_word+" "+lemmatized_word)
                    continue
                translations.extend(
                    self.get_trans_fetch(lemmatized_word, original_word))
            if translations:
                #print("Found trans with soylemma:")
                #print(translations)
                return translations
        return []

    def list_unique(self, inlist):
        unique = []
        for item in inlist:
            if item not in unique:
                unique.append(item)
        return unique

    def get_translations(self, original_branch, nextbranch=None, nextnextbranch=None):
        original_branch = self.remove_lead_trail_symbols(original_branch)
        if len(original_branch) == 0:
            return []
        original_word = self.get_plain_word(original_branch)

        #try cutting off trailing particle morphs one by one
        branch = original_branch
        cutbranch = branch
        while True:
            translations = self.try_lemmatization_methods(
                cutbranch, original_word)
            if translations:
                #if len(cutbranch) < len(original_branch):
                #print("Found with cutparticle***\n" + str(original_branch) + ": " + str(cutbranch) + ": " + str(translations))
                #else:
                #print("Found with original*\n" + str(original_branch) + ": " + str(cutbranch) + ": " + str(translations))
                return translations
            cutbranch = self.remove_single_trailing_particle(branch)
            if len(cutbranch) < len(branch):
                branch = cutbranch
            else:
                break

        # at this point we should have either all trailing particles off or
        # if only started with all particles, a single particle chunk left
        translations = self.try_lemmatization_methods(branch, original_word)
        if translations:
            #print("Found**\n" + str(original_branch) + ": " + str(branch) + ": " + str(translations))
            return translations

        #this might be handy if there are particles in the middle that are exposed after cuts
        branch_wo_end_particles = self.remove_all_trailing_particles(branch)
        translations = self.try_lemmatization_methods(branch_wo_end_particles,
                                                      original_word)
        if translations:
            #print("Found with branchwoendparticles*********\n" + str(original_branch) + ": " + str(branch_wo_end_particles) + ": " + str(translations))
            return translations

        main_chunks = self.get_main_chunks(branch_wo_end_particles, ['N', 'V', 'M'])
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles, ['M']))
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles, ['V']))
        main_chunks.extend(self.get_main_chunks(branch_wo_end_particles, ['N']))
        main_chunks_unique = self.list_unique(main_chunks)
        main_chunks_NVM_largest_sorted = sorted(main_chunks_unique,
                                                key=self.get_plain_word_len)
        main_chunks_NVM_largest_sorted.reverse()
        if main_chunks_NVM_largest_sorted:
            for main_chunk in main_chunks_NVM_largest_sorted:
                #print("-"+str(main_chunk))
                chunktrans = self.try_lemmatization_methods(
                    main_chunk, original_word)
                if chunktrans:
                    #print("Found with parsed chunks********************************\n" + str(original_branch) + ": " + str(main_chunk) + ": " + str(translations))
                    translations.extend(chunktrans)
                    continue
                #try adding da if it's a verb
                if main_chunk and len(main_chunk[0]) > 1 and len(
                        main_chunk[0][1]) > 0 and main_chunk[0][1][0] == 'V':
                    main_chunk_da = main_chunk
                    main_chunk_da.append(("다", "EC"))
                    translations = self.try_lemmatization_methods(
                        main_chunk_da, original_word)
                    if translations:
                        #print("Found with 다 addition*************************************\n" +
str(original_branch) + ": " + str(main_chunk_da) + ": " + str(translations)) continue #take a morph off if len(main_chunk) > 1: shortchunk = main_chunk[0:-1] translations = self.try_lemmatization_methods( shortchunk, original_word) if translations: #print("Found with morph off end*****************************************\n" + str(original_branch) + ": " + str(shortchunk) + ": " + str(translations)) continue # take two off if len(main_chunk) > 2: shortchunk = main_chunk[0:-2] translations = self.try_lemmatization_methods( shortchunk, original_word) if translations: #print("Found with 2 morph off end*****************************************\n" + str(original_branch) + ": " + str(shortchunk) + ": " + str(translations)) continue # take char off end of total chunk flattened if main_chunk and len(main_chunk[0]) > 1: firstclass = main_chunk[0][1] word = self.get_plain_word(main_chunk) word = word[0:-1] translations = self.try_lemmatization_methods( [(word, firstclass)], original_word) if translations: #print("Found with char off end*****************************************\n" + str(original_branch) + ": " + str([(word, firstclass)]) + ": " + str(translations)) continue translations = self.list_unique(translations) if translations: return translations ##one last-ditch try at unblocking soylemma's addition of da on end of verbs ##that it disagrees with mecab on #translations = self.try_lemmatization_methods(branch, original_word, False) #if translations: #print("Found with ublocked soylemma*********************************************\n" + str(original_branch) + ": " + str(branch) + ": " + str(translations)) #return translations #translations = self.try_lemmatization_methods(branch_wo_end_particles, original_word, False) #if translations: #print("Found with ublocked soylemma*********************************************\n" + str(original_branch) + ": " + str(branch_wo_end_particles) + ": " + str(translations)) #return translations return [] def get_main_chunks(self, branch, classes=['M', 'N', 'I', 'V']): inside = False main_chunks = [] main_chunk = [] for i in range(0, len(branch)): morph = branch[i] if len(morph) > 1 and len(morph[1]) > 0 and morph[1][0] in classes: inside = True main_chunk.append(morph) else: if inside: main_chunks.append(main_chunk) main_chunk = [] inside = False if inside: main_chunks.append(main_chunk) return main_chunks def get_plain_branch(self, branch): if branch == None: return None without_symbols = [ x for x in branch if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S" ] return without_symbols def remove_lead_trail_symbols(self, branch): out = branch while out and len(out[-1]) > 1 and out[-1][1][0] == 'S': out = out[0:-1] while out and len(out[0]) > 1 and out[0][1][0] == 'S': del out[0] return out def remove_all_trailing_particles(self, branch): out = branch while len(out) > 1 and len( out[-1]) > 1 and out[-1][1][0] in self.particle_classes: out = out[0:-1] return out def remove_single_trailing_particle(self, branch): if len(branch) > 1 and len( branch[-1]) > 1 and branch[-1][1][0] in self.particle_classes: return branch[0:-1] return branch def get_plain_word(self, branch): if branch == None: return None without_symbols = [ (x[0] if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S" else "") for x in branch ] return "".join(without_symbols) def get_plain_word_len(self, branch): if branch == None: Exception("NoneType branch") return sum([(len(x[0]) if len(x) > 1 and len(x[1]) > 0 and x[1][0] != "S" else 0) for x in branch]) def end_of_sentence_punc(self, word): for morph in 
            if len(morph) > 1 and len(
                    morph[1]) > 0 and morph[1][0] == 'S' and (
                        "." in morph[0] or "?" in morph[0] or "!" in morph[0]):
                return True
        return False

    def format_passage(self, passage):
        #print("call on: ")
        #print_tree(passage)
        #print(type(passage))
        if type(passage) is list:
            #then presumably we have a passage here, but check it's not empty or weird
            if passage:
                if type(passage[0]) is list:
                    # we have a list of lists, assume passage
                    self.xprint('<div class=wrapper>')
                    self.xprint(' <ol class=sentence>')
                    lenpassage = len(passage)
                    #use this to track where found end of sentence punctuations
                    laststartingpoint = 0
                    for i in range(0, lenpassage):
                        # grab the word we're working on and also the following words
                        # for various matching purposes
                        word = passage[i]
                        nextword = passage[i + 1] if i < lenpassage - 1 else None
                        nextnextword = passage[
                            i + 2] if i < lenpassage - 1 - 1 else None
                        # print the passage translation if we're at
                        # end of a sentence or end of the passage
                        if i == lenpassage - 1 or (
                                i > 0 and self.end_of_sentence_punc(word)):
                            trans_link_passage = passage[laststartingpoint:i + 1]
                            laststartingpoint = i + 1
                        else:
                            trans_link_passage = None
                        self.format_word(word, nextword, nextnextword,
                                         trans_link_passage)
                    self.xprint(' </ol>')
                    self.xprint('</div>')
                else:
                    print("error, unexpected passage type" + str(type(passage)))
        elif type(passage) is str:
            self.xprint('<div class=wrapper>')
            if passage != "":
                self.xprint(' <ol class=sentence>')
                self.xprint(' <li>')
                self.xprint(' <ol class=comment>')
                self.xprint(
                    ' <li lang=en_MORPH style="color: var(--page-Noun);">'
                )
                self.xprint(html.escape(passage))
                self.xprint(' </li>')
                self.xprint(' </ol>')
                self.xprint(' </li>')
                self.xprint(' </ol>')
            self.xprint('</div>')
        else:
            print("error, expected list or str, got something else")

    def format_word(self, branch, nextbranch=None, nextnextbranch=None,
                    trans_link_passage=None):
        # example: branch = [('“', 'SSO'), ('톱질', 'NNG'), ('하', 'XSV'), ('세', 'EC')]
        non_symbol_branch = self.get_plain_branch(branch)
        plain_word = self.get_plain_word(branch)
        full_word = "".join([("<span style='color: var(--page-" +
                              self.get_sejongtagset_superclass(x[1]) + ");'>" +
                              html.escape(x[0]) + "</span>") for x in branch])
        full_word_length = sum(len(x[0]) for x in branch)
        transliteration = '<span style="color: var(--page-InterInfo)">' + html.escape(
            self.transliterator.translit(plain_word)) + "</span>"
        krdictlink = (
            '<a title="KRDict" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://krdict.korean.go.kr/eng/smallDic/searchResult?nation=eng&nationCode=6&ParaWordNo=&mainSearchWord=" +
            urllib.parse.quote(plain_word) +
            f'">{self.pointer_char} KRDict</a>')
        naverlink = (
            ' <a title="Naver" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://en.dict.naver.com/#/search?query=" +
            urllib.parse.quote(plain_word) +
            f'">{self.pointer_char} Naver</a><br>')
        daumlink1 = ('<a title="Daum" class=diclink target="_blank" ' +
                     'rel="noopener noreferrer" href="' +
                     "https://small.dic.daum.net/search.do?q=" +
                     urllib.parse.quote(plain_word) + "&dic=eng" +
                     f'">{self.pointer_char} Daum</a>')
        daumlink2 = (
            ' <a title="Phrases" class=diclink target="_blank" ' +
            'rel="noopener noreferrer" href="' +
            "https://small.dic.daum.net/search.do?q=" +
            urllib.parse.quote(plain_word) + "&dic=ee" +
            f'">{self.pointer_char} Phrases</a>')
        pos_info = "-".join(
            [self.get_sejongtagset_abbrev(x[1]) for x in non_symbol_branch])
        pos_info_long = "\n".join([
            ("<span style='color: var(--page-" +
             self.get_sejongtagset_superclass(x[1]) + ");'>" +
html.escape(x[0]) + " " + self.get_sejongtagset_name(x[1]) + "</span>") for x in non_symbol_branch ]) grammarlink_matches = self.kogrammarlinks.search(branch, nextbranch) grammarlink_matches_html = '' if grammarlink_matches: grammarlink_matches_html = '<hr><p>' #pos_info = pos_info + "﹡" for glkey, gldef, glsource, gllink in grammarlink_matches: if glsource == "KRDict search" or glsource == "KRDict" or glsource == "YUF": grammarlink_matches_html = ( grammarlink_matches_html + '<a class=diclink onclick="particlesearch(\'' + glkey.lstrip("-~") + '\'); return false;" href="#">' + html.escape(glkey + ' ' + gldef) + ' ' + self.pointer_char + ' ' + glsource + '</a><br>') else: grammarlink_matches_html = ( grammarlink_matches_html + '<a class=diclink target="_blank" rel="noopener noreferrer" href="' + gllink + '">' + html.escape(glkey + ' ' + gldef) + ' ' + self.pointer_char + ' ' + glsource + '</a><br>') grammarlink_matches_html = grammarlink_matches_html + '</p>' translations = self.get_translations(branch, nextbranch, nextnextbranch) if translations: translations_html = ('<hr><p>' + html.escape(";\n".join(translations)) + '</p>') else: translations_html = '' phrase_translations = self.fetch_phrase_translations( plain_word, self.get_plain_word(nextbranch), self.get_plain_word(nextnextbranch)) if phrase_translations: phrase_translations_html = ( '<hr><p style="font-weight:bold;color: var(--page-Noun);">' + html.escape(";\n".join(phrase_translations)) + '</p>') else: phrase_translations_html = '' if trans_link_passage and trans_link_passage[0] and type( trans_link_passage[0][0]) is tuple: trans_link_passage_text_encoded = urllib.parse.quote(" ".join( "".join(tup[0] for tup in twig) for twig in trans_link_passage)) trans_link_passage_html = f'<a class=diclink target="_blank" rel="noopener noreferrer" href="https://papago.naver.com/?sk=ko&tk=en&st={trans_link_passage_text_encoded}">{self.pointer_char} passage Papago</a><br><a class=diclink target="_blank" rel="noopener noreferrer" href="https://translate.google.com/?sl=ko&tl=en&text={trans_link_passage_text_encoded}&op=translate">{self.pointer_char} passage GTrans</a><hr>' full_word = full_word + ' <span style="color: var(--page-Particle); font-weight: normal;">' + self.pointer_char + "</span>" else: trans_link_passage_html = "" total_lines_remain = 4 self.wrapper.width = round(full_word_length * 3.5) if full_word_length > 1 else 2 * 3 if phrase_translations: tshort_t = ";\n".join(phrase_translations) self.wrapper.max_lines = 2 phrase_translations_html_short = ( '<p style="font-weight:bold;color: var(--page-Noun)">' + html.escape(self.wrapper.fill(tshort_t)).replace( "\n", "<BR>") + '</p>') total_lines_remain = total_lines_remain - self.wrapper.max_lines else: phrase_translations_html_short = "" if translations: self.wrapper.max_lines = total_lines_remain translations_html_short = html.escape( self.wrapper.fill("; ".join(translations))).replace( "\n", "<BR>") #.replace(": ", ": ") else: translations_html_short = "—" description = (html.escape(pos_info) + "<p>" + translations_html_short + phrase_translations_html_short + "</p>") tooltip = (trans_link_passage_html + transliteration + '<hr>' + krdictlink + naverlink + daumlink1 + daumlink2 + '<hr><p>' + pos_info_long + '</p>' + translations_html + phrase_translations_html + grammarlink_matches_html).replace("\n", "<br>") self.xprint(' <li>') self.xprint( f' <ol class=word onclick="showtooltip(event, \'tipnumber{self.tipnumber}\')">' ) self.xprint(f' <li lang=es>{full_word}</li>') self.xprint( f' <li 
        self.xprint(
            f' <li lang=en_MORPH class=tooltip>{description}<span class=tooltiptext id="tipnumber{self.tipnumber}">{tooltip}</span></li><br>'
        )
        self.xprint(' </ol>')
        self.xprint(' </li>')
        self.tipnumber = self.tipnumber + 1
        #sys.stdout.flush()

    def xprint(self, message):
        self.output.append(message)

    def generate(self):
        structured_content = []
        for j in range(len(self.content)):
            line = self.content[j]
            if line == "":
                structured_content.append(line)
            elif line[0] == "#":
                structured_content.append(line[1:])
            else:
                # see https://pypi.org/project/python-mecab-ko/
                structured_content.append(self.mecab.pos(line))
        self.tipnumber = 0
        for passage in structured_content:
            self.format_passage(passage)
        self.cur.close()
        self.con.close()
        return "\n".join(self.output)
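# A hypothetical usage sketch for KoInterlinear, assuming a PostgreSQL database that
# already contains the korean_english* tables, plus mecablight, soylemma, sejongtagset,
# and kogrammarlinks importable in the same module; connection details are placeholders.
import psycopg2

con = psycopg2.connect(dbname="korean", user="reader")
interlinear = KoInterlinear("나는 학교에 갑니다.\n#A comment line passes through as-is.", con)
print(interlinear.generate())  # HTML: one <div class=wrapper> block per passage; note generate() closes con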
def hangul_romanize(text):
    transliter = Transliter(academic)
    return transliter.translit(text)
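# A minimal usage sketch, assuming the hangul-romanize package is installed and imported
# as in the snippets above (from hangul_romanize import Transliter,
# from hangul_romanize.rule import academic); the expected output is illustrative.
print(hangul_romanize("안녕하세요"))  # roughly "annyeonghaseyo" under the academic rule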