Example #1
0
    def compute_blocks(txt, method='syllable'):
        """
        Split txt into blocks with a regular expression and return the blocks.

        Every regex match found in txt is wrapped in a Block and tallied in a
        mapping of (block, number of occurrences).  The mapping is then passed
        to DbUtil().compute_block_meanings and only its keys are returned.

        :param txt: text to split; passed through _u before matching.
        :param method: 'syllable' (default) matches the alternation of the
            r8, r7, r3, r6, r4 and r5 patterns, in that order; 'word' matches
            a fixed alternation of literal strings.
        :returns: the keys (blocks) of the mapping returned by
            compute_block_meanings.
        :raises ValueError: if method is neither 'word' nor 'syllable'.
        """
        if method == 'syllable':
            # Order matters: earlier alternatives win over later ones.
            pattern = _u('(' + '|'.join((r8, r7, r3, r6, r4, r5)) + ')')
        elif method == 'word':
            pattern = _u('บัตร|ประ|จำ|ตัว')
        else:
            raise ValueError('Unsupported method: {}'.format(method))

        occurrences = defaultdict(int)
        for match in re.compile(pattern).finditer(_u(txt)):
            occurrences[Block(_u(match.group()))] += 1

        # compute block meanings
        annotated = DbUtil().compute_block_meanings(occurrences)
        return annotated.keys()
Example #2
0
 def compute_block_meanings(self, blocks):
     """Attach a meaning to every block that has an entry in self.lines.

     Each line is treated as comma-separated text: the first field is
     compared with _u(block.string) and the second field, stripped of
     surrounding whitespace, is stored as block.meaning.  When several lines
     match the same block, the last matching line wins.

     :param blocks: iterable of objects exposing a ``string`` attribute; the
         objects are modified in place.
     :returns: the same ``blocks`` object that was passed in.
     """
     for line in self.lines:
         # Split once per line rather than up to twice per (line, block) pair.
         fields = line.split(',')
         if len(fields) < 2:
             # No meaning column: nothing to assign, and indexing it would
             # raise IndexError.
             continue
         key = fields[0]
         meaning = fields[1].strip()
         for block in blocks:
             if _u(block.string) == key:
                 block.meaning = meaning
     return blocks
Example #3
0
def compute_blocks_re(txt, method='syllable'):
    """Compute the blocks of txt based on regular expressions.

    Builds a dictionary of (block, number) pairs, where block is a Block
    wrapping one regex match and number is the number of times this block
    occurs in txt.  The dictionary is passed to compute_block_meanings and
    only the blocks (its keys) are returned.

    :param txt: text to split; passed through _u before matching.
    :param method: 'syllable' (default, the behaviour that used to be
        hard-coded) builds the syllable patterns from the module-level
        character classes; 'word' matches a fixed alternation of literal
        strings.
    :returns: the keys of the mapping returned by compute_block_meanings.
    :raises ValueError: if method is neither 'word' nor 'syllable'.
    """
    import re

    if method == 'word':
        regexp = _u('บัตร|ประ|จำ|ตัว')
    elif method == 'syllable':
        # consonants incl. initial silent consonants
        c1 = '({cc}|ห{lc}|อย|{c})'.format(lc=LC, c=C, cc=CC)
        # initial consonant + vowel (upper/lower/trailing/inherent) + final consonant
        r3 = _u('({c1}{c}?({c}{{0}}|({l}|{u})?{t}?|{t}?{r}|{t}?อ){z}({s})?)'.format(c=C, c1=c1,u=U,l=L,r=R1,t=T,z=Z,s=S))
        # initial consonant + vowel (upper/lower/trailing)
        r4 = _u('({c}({t}?อ|({l}|{u}){t}?|{t}?{r})์?)'.format(c=c1, u=U1, l=L, r=R, t=T))
        # leading vowel
        r5 = _u('({f}{c}(็?{t}?{z}|{t}?{r}?|{u}?{t}?{z}))'.format(f=F,c=c1,t=T,r=R,u=U,z=Z))
        # sara ia and sara uea
        r6 = _u('เ{c}(ี{t}?ย|ื{t}?อ)({z}|ว)?'.format(c=c1,t=T,z=Z))
        # er, o, ao
        r7 = _u('({}{}(อ|า)ะ?)'.format(F,c1))
        # double r
        r8 = _u('({c}รร{z}?)'.format(c=c1,z=Z))
        # Order matters: earlier alternatives win over later ones.
        regexp = _u('('+ r8 + '|' + r7 + '|' + r3 + '|' + r6 + '|' + r4 + '|' + r5 + ')')
    else:
        # Same message as compute_blocks uses for an unknown method.
        raise ValueError('Unsupported method: {}'.format(method))

    blocks = {}
    for match in re.compile(regexp).finditer(_u(txt)):
        block = Block(_u(match.group()))
        blocks[block] = blocks.get(block, 0) + 1

    # compute block meanings
    blocks = compute_block_meanings(blocks)

    return blocks.keys()
Example #4
0
def compute_block_meanings(blocks):
    """Attach a meaning to every block that has an entry in FILE.

    FILE is read as UTF-8 text and each line is treated as comma-separated:
    the first field is compared with _u(block.string) and the second field,
    stripped of surrounding whitespace, is stored as block.meaning.  When
    several lines match the same block, the last matching line wins.

    :param blocks: iterable of objects exposing a ``string`` attribute; the
        objects are modified in place.
    :returns: the same ``blocks`` object that was passed in.
    """
    # The context manager closes the file even if reading raises.
    with codecs.open(FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        # Split once per line rather than up to twice per (line, block) pair.
        fields = line.split(',')
        if len(fields) < 2:
            # No meaning column: nothing to assign, and indexing it would
            # raise IndexError.
            continue
        key = fields[0]
        meaning = fields[1].strip()
        for block in blocks:
            if _u(block.string) == key:
                block.meaning = meaning
    return blocks
Example #5
0
def get_words_with_block(block, exclude=None):
    """
    Return a list of Thai words that rely on the same block.

    FILE is read as UTF-8 text and each line is treated as comma-separated.
    A line is selected when _u(block.string) occurs anywhere inside its first
    field; the stripped first field is the word and the stripped second field
    is its meaning.

    :param block: object exposing a ``string`` attribute.
    :param exclude: word string to leave out of the result (compared with the
        stripped first field); None excludes nothing.
    :returns: list of ThaiWord objects, in file order.
    """
    # The context manager closes the file even if reading raises.
    with codecs.open(FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    words = []
    for line in lines:
        # Split once per line instead of three times.
        fields = line.split(',')
        if len(fields) < 2:
            # No meaning column: skip instead of raising IndexError below.
            continue
        if _u(block.string) in fields[0]:
            word = fields[0].strip()
            if word != exclude:
                words.append(ThaiWord(string=word,
                                      meaning=fields[1].strip()))

    return words
Example #6
0
    def __init__(self, string='', ethym=None, meaning=None, compute_ethym=False):
        """
        Build a word either from fully specified data or from a raw string.

        :param string: the word itself (e.g. user input); stored as _u(string).
        :param ethym: optional sequence aligned with string, one entry per
            character; it must have the same length as string.
        :param meaning: optional meaning of the word.
        :param compute_ethym: forwarded to self.compute_blocks when the blocks
            have to be computed.

        When both ethym and meaning are truthy the word is taken as fully
        defined: one Block per character is created and meaning is the only
        entry of self.meanings.  Otherwise the suffix, the blocks and the
        meanings are computed through compute_suffix, compute_blocks and
        self.db_util.compute_meanings.
        """
        self.string = _u(string)  # e.g. user input string
        self.language = 'Korean'
        self.db_util = DbUtil()

        if not (ethym and meaning):
            self.compute_suffix()
            self.blocks = self.compute_blocks(compute_ethym)
            # Different meanings in English
            self.meanings = self.db_util.compute_meanings(self.string_without_suffix)
            self.selected_meaning = 0  # index of the selected meaning
            return

        # To the best of the author's knowledge a Korean word and its hanja
        # representation (when it exists) have the same length.
        assert len(string) == len(ethym)
        per_character = [
            Block(character, ethym=ethym[position])
            for position, character in enumerate(string)
        ]
        self.blocks = [per_character]
        self.meanings = [meaning]
        self.selected_meaning = 0  # the word is clearly defined
Example #7
0
    def __init__(self,
                 string='',
                 etymology=None,
                 meaning=None,
                 compute_etymology=False):
        """
        Build a word either from fully specified data or from a raw string.

        The arguments are first handed to self.check_init_parameters.

        :param string: the word itself (e.g. user input); stored as _u(string).
        :param etymology: optional sequence indexed in step with string, one
            entry per character.
        :param meaning: optional meaning of the word.
        :param compute_etymology: forwarded to self.compute_blocks when the
            blocks have to be computed.

        When both etymology and meaning are truthy, one Block per character is
        created and meaning becomes the only entry of self.meanings.
        Otherwise the suffix, the blocks and the meanings are computed through
        compute_suffix, compute_blocks and self.db_util.compute_meanings.
        """
        self.check_init_parameters(string, etymology, meaning)
        self.string = _u(string)  # e.g. user input string
        self.db_util = DbUtil()

        fully_defined = etymology and meaning
        if fully_defined:
            single_reading = []
            for index, character in enumerate(string):
                single_reading.append(
                    Block(character, etymology=etymology[index]))
            self.blocks = [single_reading]
            self.meanings = [meaning]
        else:
            self.compute_suffix()
            self.blocks = self.compute_blocks(compute_etymology)
            # Different meanings in English
            self.meanings = self.db_util.compute_meanings(
                self.string_without_suffix)

        # Index of the selected meaning; with a single given meaning the word
        # is clearly defined, so 0 is the only choice.
        self.selected_meaning = 0
Example #8
0

#cgitb.enable()

#UI.render_empty()

# Page header
UI.render_top()


# Main: read the submitted word, detect its language and render the analysis.
form = cgi.FieldStorage()
if 'word' in form:
    input_str = form["word"].value
    try:
        language = detect_language(_u(input_str))
    except ValueError:
        language = None
        UI.render_error('Language not supported')

    # Explicit sentinel instead of probing locals() for the name: Word stays
    # None unless one of the language branches below imports a word class.
    Word = None
    if language == 'korean':
        from asian_word_analyzer.korean.db import DbUtil
        from asian_word_analyzer.korean.word import KoreanWord as Word
    elif language == 'thai':
        from asian_word_analyzer.thai.word import get_words_with_block
        from asian_word_analyzer.thai.word import ThaiWord as Word

    if Word is not None:
        word = Word(input_str, compute_ethym=True)
        UI.render_main(word)
Example #9
0
# Character classes and syllable patterns used to build the syllable regex.
# NOTE(review): C, CC and LC are referenced below but defined outside this
# excerpt; presumably consonant classes -- confirm against their definitions.

# leading vowels (matched before the initial consonant in r5 and r7)
F = '[เ-ไ]'  # 'เ,แ,ไ,ใ,โ'
# trailing vowels
R = '[ะาำๅ]'  # ะ,า,◌ำ,
R1 = '[าำๅ]'  # า,◌ำ,ๅ  (R without ะ; used by r3)
# upper vowels
U = '[ัิีึื]'  # ◌ั, ◌ิ, ◌ี, ◌ึ, ◌ื,◌
U1 = '[ิีึื]'  # ◌ิ, ◌ี, ◌ึ, ◌ื,◌  (U without ◌ั; used by r4)
# lower vowels
L = '[ุู]'  # ◌ุ,◌ู,◌
# tonal marks (character range from ่ to ๋)
T = '[่-๋]'
# final consonants (not followed by tonal mark or vowel): the negative
# lookahead rejects a consonant that is followed by T, U, L or R.
Z = '[กขฃคฅฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมรฤลฦศษสฬวย](?!{t}|{u}|{l}|{r})'.format(t=T, u=U, l=L,r=R)
# silent syllables: consonant, optional upper/lower vowel, optional tonal
# mark, then ์.  The doubled braces survive str.format as the literal regex
# quantifier {1} applied to ์.
S = '{c}({u}|{l})?{t}?์{{1}}'.format(c=C, u=U, l=L, t=T)
# consonants incl. initial silent consonants
c1 = '({cc}|ห{lc}|อย|{c})'.format(lc=LC, c=C, cc=CC)
# initial consonant + vowel (upper/lower/trailing/inherent) + final consonant
# The doubled braces in {c}{{0}} survive str.format as the quantifier {0}, so
# that alternative matches the empty string (presumably the "inherent" vowel
# case -- confirm).
r3 = _u('({c1}{c}?({c}{{0}}|({l}|{u})?{t}?|{t}?{r}|{t}?อ){z}({s})?)'.format(c=C, c1=c1, u=U, l=L,
                                                                            r=R1, t=T, z=Z, s=S))
# initial consonant + vowel (upper/lower/trailing), optionally followed by ์
r4 = _u('({c}({t}?อ|({l}|{u}){t}?|{t}?{r})์?)'.format(c=c1, u=U1, l=L, r=R, t=T))
# leading vowel
r5 = _u('({f}{c}(็?{t}?{z}|{t}?{r}?|{u}?{t}?{z}))'.format(f=F, c=c1, t=T, r=R, u=U, z=Z))
# sara ia and sara uea
r6 = _u('เ{c}(ี{t}?ย|ื{t}?อ)({z}|ว)?'.format(c=c1, t=T, z=Z))
# er, o, ao
r7 = _u('({}{}(อ|า)ะ?)'.format(F, c1))
# double r
r8 = _u('({c}รร{z}?)'.format(c=c1, z=Z))
Example #10
0
def test__u(input_string):
    """Check that _u returns a str for the supplied input_string."""
    converted = _u(input_string)
    assert isinstance(converted, str)
Example #11
0
def get_word_and_db_util_classes(language):
    """Return the (DbUtil class, Word class) pair for the given language.

    :param language: 'korean' or 'thai'.
    :returns: (KoreanDbUtil, KoreanWord) for 'korean', (ThaiDbUtil, ThaiWord)
        for 'thai', and None for any other value.
    """
    if language == 'thai':
        return ThaiDbUtil, ThaiWord
    if language == 'korean':
        return KoreanDbUtil, KoreanWord
    # Unknown language: no classes to offer.
    return None


# Page header
UI.render_top()

# Main: read the submitted word, detect its language, render the word and,
# for each of its blocks, the other words sharing that block.
form = cgi.FieldStorage()
if 'word' in form:
    input_str = form["word"].value
    try:
        language = detect_language(_u(input_str))
    except ValueError:
        UI.render_error('Language not supported')
    else:
        # try/else instead of probing locals(): this branch runs only when
        # detection succeeded, so a stale module-level `language` can never
        # be picked up by mistake.
        DbUtil, Word = get_word_and_db_util_classes(language)
        word = Word(input_str, compute_etymology=True)
        UI.render_main(word)
        blocks = word.get_blocks_for_selected_meaning()
        for block in blocks:
            word_tuples = DbUtil().get_words_with_block(block, exclude=word)
            # Each tuple is read as (string, etymology, meaning).
            words = [
                Word(string=r[0], etymology=r[1], meaning=r[2])
                for r in word_tuples
            ]
            UI.render_block(block, words)
Example #12
0
def test__u(input_string):
    """_u must hand back a str instance for input_string."""
    result = _u(input_string)
    assert isinstance(result, str)