def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:  # Korean: Kkma POS tagger (KoNLPy)
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:  # Japanese: MeCab
        words = mecab.parse(sent).split()
    elif lcode in ['th']:  # Thai: pythai
        words = pythai.split(sent)
    elif lcode in ['vi']:  # Vietnamese: ViTokenizer (pyvi)
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:  # Chinese: jieba in accurate (non-full) mode
        words = list(jieba.cut(sent, cut_all=False))
#     elif lcode in ['ar']:
#         words = segmenter.segment(sent).split()
    else:  # Mostly European languages: whitespace tokenization
        words = sent.split()

    return words
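A minimal usage sketch, assuming the module-level setup the function implies: lcode and the tokenizer objects (kkma, mecab, jieba, ViTokenizer, pythai) are initialized elsewhere in the script; for the Thai branch only pythai is needed.

import pythai

lcode = 'th'  # hypothetical: the surrounding script sets this per language
print(word_segment(u'ฉันกินข้าว'))  # illustrative output: a list like [u'ฉัน', u'กิน', u'ข้าว']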
Example #3
def analyze_sentence(sentence, is_thai, dictionary):
    se = etree.Element(u'se')
    if is_thai:
        tokens = pythai.split(sentence)
        for token in tokens:
            if token:
                word = etree.Element(u'w')
                if token in dictionary:
                    for analysis_number in dictionary[token]:
                        analysis = dictionary[token][analysis_number]
                        ana = etree.Element(u'ana')
                        ana.attrib[u'lex'] = token
                        ana.attrib[u'pos'] = u','.join(analysis[1])
                        ana.attrib[u'trans'] = analysis[0]
                        ana.attrib[u'translit'] = analysis[2]
                        word.append(ana)
                word.text = token
                se.append(word)
    else:
        # str.split() already discards tabs and newlines, so a single
        # split/join collapses all whitespace runs to single spaces.
        sentence = u' '.join(sentence.split())
        if sentence:
            word = etree.Element(u'w')
            ana = etree.Element(u'ana')
            ana.attrib[u'lex'] = u''
            ana.attrib[u'pos'] = u''
            ana.attrib[u'trans'] = u''
            ana.attrib[u'translit'] = u''
            word.append(ana)
            word.text = sentence
            se.append(word)
    return se
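A short driver sketch for analyze_sentence. The dictionary shape is inferred from the indexing above (token mapped to numbered analyses of the form (translation, [pos tags], transliteration)); the toy entry is hypothetical, not from the original project.

from lxml import etree
import pythai

# Hypothetical entry: token -> {n: (translation, [pos tags], transliteration)}
dictionary = {u'ข้าว': {0: (u'rice', [u'N'], u'khao')}}

se = analyze_sentence(u'ฉันกินข้าว', True, dictionary)
print(etree.tostring(se, encoding='unicode'))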
Example #4
def tag_text(text, dictionary): ##hmmm...
    #print u"###", text
    result = [u"<body>"]
    sents = text.split()
    for i in sents:
        result.append(u"<se>")
        for j in pythai.split(i):
            result.append(tag_word(j, dictionary))
        result.append(u"</se>")
    result.append(u"</body>")
    return create_xml(result)
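tag_word and create_xml are defined elsewhere in the original project; hypothetical stand-ins along these lines make the snippet self-contained, though they only approximate what the real helpers do.

def tag_word(token, dictionary):
    # Hypothetical stand-in: wrap the token in a <w> element, skipping
    # whatever dictionary lookup the real helper performs.
    return u'<w>%s</w>' % token

def create_xml(parts):
    # Hypothetical stand-in: join the accumulated tags into one string.
    return u'\n'.join(parts)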
Example #6
def test_split(self):
    for sentence in self.test_sentences:
        six.print_(sentence.split,
                   ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
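The fixture format is implied by the attribute access above: each test item carries the raw text and its expected space-joined segmentation. A hypothetical setup:

import collections

TestSentence = collections.namedtuple('TestSentence', ['sentence', 'split'])

# Hypothetical fixture; the expected segmentation is illustrative only.
test_sentences = [TestSentence(sentence=u'ฉันกินข้าว', split=u'ฉัน กิน ข้าว')]

The split attribute is also why the fixture items must be objects rather than plain strings, which would shadow str.split.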
Example #7
def test_split(self):
    for sentence in self.test_sentences:
        print(sentence.split, ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
Example #8
def pos(string):
    # Despite the name, this returns word tokens: pythai.split
    # segments text, it does not POS-tag.
    tokens = pythai.split(string)
    return tokens
Example #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
print(pythai.split(u"ฉันกินข้าว"))
print(pythai.split(u"ฉันwantกินseeข้าว"))
Example #10
import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
Example #11
def textToSegmentedList(sentence):
    # change later
    #return sentence.split(u' ')
    return pythai.split(sentence)
Example #12
def pythai_split(u, limit=1):
    """
    Use PyThai to split Thai words; 'limit' is accepted but unused here.
    """
    return pythai.split(u)
Example #13
def token_iterator(sentence):
    tokens = pythai.split(sentence)
    for token in tokens:
        yield token
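Since pythai.split already returns a full list, the generator adds laziness only over iteration, not over the tokenization itself. Usage:

for token in token_iterator(u'ฉันกินข้าว'):
    print(token)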