def compute_blocks(txt, method='syllable'):
    """Split ``txt`` into blocks with a regular expression.

    Every match of the selected pattern is wrapped in a ``Block`` and
    counted.  The counted blocks are then handed to
    ``DbUtil().compute_block_meanings`` and only the keys of what that
    call returns (the blocks themselves) are given back to the caller.

    ``method`` selects the pattern:

    * ``'syllable'`` (default) -- alternation of the module-level patterns
      ``r8, r7, r3, r6, r4, r5``, tried in that order;
    * ``'word'`` -- a fixed alternation of four words.

    Any other value raises ``ValueError``.
    """
    if method == 'syllable':
        # The order of the alternatives decides which pattern wins.
        alternatives = (r8, r7, r3, r6, r4, r5)
        regexp = _u('(' + '|'.join(alternatives) + ')')
    elif method == 'word':
        regexp = _u('บัตร|ประ|จำ|ตัว')
    else:
        raise ValueError('Unsupported method: {}'.format(method))

    occurrences = defaultdict(int)
    for match in re.compile(regexp).finditer(_u(txt)):
        occurrences[Block(_u(match.group()))] += 1

    # compute block meanings
    return DbUtil().compute_block_meanings(occurrences).keys()
def compute_block_meanings(self, blocks):
    """Attach the meaning found in ``self.lines`` to each block.

    Every line is read as comma-separated text: the first field is a
    block string, the second field its meaning.  A block whose
    ``_u(block.string)`` equals a line's first field receives that line's
    second field, stripped of surrounding whitespace, as
    ``block.meaning``.  When several lines share the same first field the
    last one wins.  Lines without a comma carry no meaning and are
    ignored.

    :param blocks: iterable of objects exposing a ``string`` attribute.
    :returns: the same ``blocks`` object that was passed in.
    """
    # Index the lines once instead of re-splitting every line for every
    # block; later lines overwrite earlier ones, as the nested loops did.
    meaning_by_string = {}
    for line in self.lines:
        fields = line.split(',')
        if len(fields) > 1:
            meaning_by_string[fields[0]] = fields[1].strip()

    for block in blocks:
        key = _u(block.string)
        if key in meaning_by_string:
            block.meaning = meaning_by_string[key]
    return blocks
def compute_blocks_re(txt, method='syllable'):
    """Compute blocks for a word based on regular expressions.

    Builds a dictionary of (block, number) pairs, where block is a
    ``Block`` wrapping the matched text and number is how many times that
    block occurs in ``txt``.  The dictionary is passed through
    ``compute_block_meanings`` and only the blocks (its keys) are
    returned.

    :param txt: the text to split.
    :param method: ``'syllable'`` (default, the behaviour this function
        always had) or ``'word'`` (a fixed alternation of four words).
    :raises ValueError: if ``method`` is neither of the two.
    """
    import re  # kept function-local, as in the original block

    if method == 'word':
        regexp = _u('บัตร|ประ|จำ|ตัว')
    elif method == 'syllable':
        # consonants incl. initial silent consonants
        c1 = '({cc}|ห{lc}|อย|{c})'.format(lc=LC, c=C, cc=CC)
        # initial consonant + vowel (upper/lower/trailing/inherent)
        # + final consonant
        r3 = _u('({c1}{c}?({c}{{0}}|({l}|{u})?{t}?|{t}?{r}|{t}?อ){z}({s})?)'.format(
            c=C, c1=c1, u=U, l=L, r=R1, t=T, z=Z, s=S))
        # initial consonant + vowel (upper/lower/trailing)
        r4 = _u('({c}({t}?อ|({l}|{u}){t}?|{t}?{r})์?)'.format(
            c=c1, u=U1, l=L, r=R, t=T))
        # leading vowel
        r5 = _u('({f}{c}(็?{t}?{z}|{t}?{r}?|{u}?{t}?{z}))'.format(
            f=F, c=c1, t=T, r=R, u=U, z=Z))
        # sara ia and sara uea
        r6 = _u('เ{c}(ี{t}?ย|ื{t}?อ)({z}|ว)?'.format(c=c1, t=T, z=Z))
        # er, o, ao
        r7 = _u('({}{}(อ|า)ะ?)'.format(F, c1))
        # double r
        r8 = _u('({c}รร{z}?)'.format(c=c1, z=Z))
        # The order of the alternatives decides which pattern wins.
        regexp = _u('(' + r8 + '|' + r7 + '|' + r3 + '|' + r6 + '|' + r4
                    + '|' + r5 + ')')
    else:
        # Previously the method was a hard-coded constant, so an unknown
        # value could only surface as an unbound ``regexp``.
        raise ValueError('Unsupported method: {}'.format(method))

    blocks = {}
    for match in re.compile(regexp).finditer(_u(txt)):
        block = Block(_u(match.group()))
        blocks[block] = blocks.get(block, 0) + 1

    # compute block meanings
    blocks = compute_block_meanings(blocks)
    return blocks.keys()
def compute_block_meanings(blocks):
    """Attach the meaning found in the dictionary file to each block.

    ``FILE`` is read as UTF-8 text.  Every line is treated as
    comma-separated: the first field is a block string, the second field
    its meaning.  A block whose ``_u(block.string)`` equals a line's first
    field receives that line's second field, stripped of surrounding
    whitespace, as ``block.meaning``.  When several lines share the same
    first field the last one wins.  Lines without a comma are ignored.

    :param blocks: iterable of objects exposing a ``string`` attribute.
    :returns: the same ``blocks`` object that was passed in.
    """
    # The context manager closes the file even if reading fails.
    with codecs.open(FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Index the lines once instead of re-splitting every line for every
    # block; later lines overwrite earlier ones, as the nested loops did.
    meaning_by_string = {}
    for line in lines:
        fields = line.split(',')
        if len(fields) > 1:
            meaning_by_string[fields[0]] = fields[1].strip()

    for block in blocks:
        key = _u(block.string)
        if key in meaning_by_string:
            block.meaning = meaning_by_string[key]
    return blocks
def get_words_with_block(block, exclude=None):
    """Return the Thai words from the dictionary file that contain ``block``.

    ``FILE`` is read as UTF-8 text, one comma-separated entry per line
    (first field: word, second field: meaning).  A line is selected when
    ``_u(block.string)`` occurs inside its first field.

    :param block: object exposing a ``string`` attribute.
    :param exclude: a word to leave out of the result (compared with the
        stripped first field); ``None`` excludes nothing.
    :returns: list of ``ThaiWord`` built from the word and its stripped
        meaning, in file order.  Matching lines that have no meaning
        field are skipped.
    """
    # The context manager closes the file even if reading fails.
    with codecs.open(FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    needle = _u(block.string)  # loop-invariant, compute once
    words = []
    for line in lines:
        fields = line.split(',')
        if needle not in fields[0]:
            continue
        if len(fields) < 2:
            # Malformed entry: there is no meaning to attach.
            continue
        word = fields[0].strip()
        if word != exclude:
            words.append(ThaiWord(string=word, meaning=fields[1].strip()))
    return words
def __init__(self, string='', ethym=None, meaning=None,
             compute_ethym=False):
    """Build a word either from explicit data or by analysing ``string``.

    When both ``ethym`` and ``meaning`` are given, the word is taken as
    fully defined: one ``Block`` is created per character of ``string``,
    paired with the item of ``ethym`` at the same position, and
    ``meaning`` becomes the only meaning.

    Otherwise the suffix is computed, the blocks come from
    ``self.compute_blocks(compute_ethym)`` and the meanings from
    ``self.db_util.compute_meanings`` applied to the string without its
    suffix.

    In both cases the first meaning is the selected one.
    """
    self.string = _u(string)  # e.g. user input string
    self.language = 'Korean'
    self.db_util = DbUtil()

    fully_defined = ethym and meaning
    if not fully_defined:
        self.compute_suffix()
        self.blocks = self.compute_blocks(compute_ethym)
        # Different meanings in English
        self.meanings = self.db_util.compute_meanings(
            self.string_without_suffix)
    else:
        # To the best of the original author's knowledge, a Korean word
        # and its hanja representation (when it exists) have the same
        # length.
        assert(len(string) == len(ethym))
        self.blocks = [[
            Block(char, ethym=ethym[position])
            for position, char in enumerate(string)
        ]]
        self.meanings = [meaning]  # the word is clearly defined

    self.selected_meaning = 0  # index of the selected meaning
def __init__(self, string='', etymology=None, meaning=None,
             compute_etymology=False):
    """Build a word either from explicit data or by analysing ``string``.

    ``self.check_init_parameters`` is called first with the raw
    ``string``, ``etymology`` and ``meaning`` arguments.

    When both ``etymology`` and ``meaning`` are given, the word is taken
    as fully defined: one ``Block`` is created per character of
    ``string``, paired with the item of ``etymology`` at the same
    position, and ``meaning`` becomes the only meaning.

    Otherwise the suffix is computed, the blocks come from
    ``self.compute_blocks(compute_etymology)`` and the meanings from
    ``self.db_util.compute_meanings`` applied to the string without its
    suffix.

    In both cases the first meaning is the selected one.
    """
    self.check_init_parameters(string, etymology, meaning)
    self.string = _u(string)  # e.g. user input string
    self.db_util = DbUtil()

    fully_defined = etymology and meaning
    if not fully_defined:
        self.compute_suffix()
        self.blocks = self.compute_blocks(compute_etymology)
        # Different meanings in English
        self.meanings = self.db_util.compute_meanings(
            self.string_without_suffix)
    else:
        self.blocks = [[
            Block(char, etymology=etymology[position])
            for position, char in enumerate(string)
        ]]
        self.meanings = [meaning]  # the word is clearly defined

    self.selected_meaning = 0  # index of the selected meaning
# Currently disabled:
# cgitb.enable()
# UI.render_empty()

# Entree
UI.render_top()

# Main
form = cgi.FieldStorage()
if 'word' in form:
    input_str = form["word"].value
    try:
        language = detect_language(_u(input_str))
    except ValueError:
        language = None
        UI.render_error('Language not supported')

    # The language-specific modules are imported only for the language
    # that was actually detected.
    if language == 'korean':
        from asian_word_analyzer.korean.db import DbUtil
        from asian_word_analyzer.korean.word import KoreanWord as Word
    elif language == 'thai':
        from asian_word_analyzer.thai.word import get_words_with_block
        from asian_word_analyzer.thai.word import ThaiWord as Word

    # `Word` is bound by exactly the two branches above, so test the
    # language explicitly instead of probing locals() for the name (which
    # would also succeed if `Word` had been bound anywhere earlier).
    if language in ('korean', 'thai'):
        word = Word(input_str, compute_ethym=True)
        UI.render_main(word)
F = '[เ-ไ]' # 'เ,แ,ไ,ใ,โ' # trailing vowels R = '[ะาำๅ]' # ะ,า,◌ำ, R1 = '[าำๅ]' # า,◌ำ,ๅ # upper vowels U = '[ัิีึื]' # ◌ั, ◌ิ, ◌ี, ◌ึ, ◌ื,◌ U1 = '[ิีึื]' # ◌ิ, ◌ี, ◌ึ, ◌ื,◌ # lower vowels L = '[ุู]' # ◌ุ,◌ู,◌ # tonal marks T = '[่-๋]' # final consonants (not followed by tonal mark or vowel) Z = '[กขฃคฅฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมรฤลฦศษสฬวย](?!{t}|{u}|{l}|{r})'.format(t=T, u=U, l=L,r=R) # silent syllables S = '{c}({u}|{l})?{t}?์{{1}}'.format(c=C, u=U, l=L, t=T) # consonants incl. initial silent consonants c1 = '({cc}|ห{lc}|อย|{c})'.format(lc=LC, c=C, cc=CC) # initial consonant + vowel (upper/lower/trailing/inherent) + final consonant r3 = _u('({c1}{c}?({c}{{0}}|({l}|{u})?{t}?|{t}?{r}|{t}?อ){z}({s})?)'.format(c=C, c1=c1, u=U, l=L, r=R1, t=T, z=Z, s=S)) # initial consonant + vowel (upper/lower/trailing) r4 = _u('({c}({t}?อ|({l}|{u}){t}?|{t}?{r})์?)'.format(c=c1, u=U1, l=L, r=R, t=T)) # leading vowel r5 = _u('({f}{c}(็?{t}?{z}|{t}?{r}?|{u}?{t}?{z}))'.format(f=F, c=c1, t=T, r=R, u=U, z=Z)) # sara ia and sara uea r6 = _u('เ{c}(ี{t}?ย|ื{t}?อ)({z}|ว)?'.format(c=c1, t=T, z=Z)) # er, o, ao r7 = _u('({}{}(อ|า)ะ?)'.format(F, c1)) # double r r8 = _u('({c}รร{z}?)'.format(c=c1, z=Z))
def test__u(input_string):
    """``_u`` must return a ``str`` for the supplied ``input_string``.

    ``input_string`` is provided from outside this function (how it is
    supplied is not visible here).
    """
    converted = _u(input_string)
    assert isinstance(converted, str)
def get_word_and_db_util_classes(language):
    """Return the ``(DbUtil, Word)`` class pair for ``language``.

    :param language: ``'korean'`` or ``'thai'``.
    :returns: ``(KoreanDbUtil, KoreanWord)`` or ``(ThaiDbUtil, ThaiWord)``;
        ``None`` for any other value.
    """
    if language == 'korean':
        return KoreanDbUtil, KoreanWord
    elif language == 'thai':
        return ThaiDbUtil, ThaiWord
    return None


# Entree
UI.render_top()

# Main
form = cgi.FieldStorage()
if 'word' in form:
    input_str = form["word"].value
    try:
        language = detect_language(_u(input_str))
    except ValueError:
        UI.render_error('Language not supported')
    else:
        # Runs only when detection succeeded.  This replaces probing
        # locals() for `language`, which would also have passed if the
        # name had been bound anywhere earlier.
        DbUtil, Word = get_word_and_db_util_classes(language)
        word = Word(input_str, compute_etymology=True)
        UI.render_main(word)

        # For every block of the selected meaning, list the other words
        # that share the block.
        blocks = word.get_blocks_for_selected_meaning()
        for block in blocks:
            word_tuples = DbUtil().get_words_with_block(block, exclude=word)
            words = [
                Word(string=r[0], etymology=r[1], meaning=r[2])
                for r in word_tuples
            ]
            UI.render_block(block, words)