def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:  # Korean: Kkma POS tagger (KoNLPy)
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:  # Japanese: MeCab
        words = mecab.parse(sent).split()
    elif lcode in ['th']:  # Thai: pythai
        words = pythai.split(sent)
    elif lcode in ['vi']:  # Vietnamese: ViTokenizer (pyvi)
        words = ViTokenizer.tokenize(sent).split()
    elif lcode in ['zh']:  # Chinese: jieba in accurate (non-full) mode
        words = list(jieba.cut(sent, cut_all=False))
#     elif lcode in ['ar']:
#         words = segmenter.segment(sent).split()
    else:  # Mostly European languages: whitespace tokenization
        words = sent.split()

    return words
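A minimal usage sketch, assuming the module-level setup the function implies: lcode and the tokenizer objects (kkma, mecab, jieba, ViTokenizer, pythai) are initialized elsewhere in the script; for the Thai branch only pythai is needed.

import pythai

lcode = 'th'  # hypothetical: the surrounding script sets this per language
print(word_segment(u'ฉันกินข้าว'))  # illustrative output: a list like [u'ฉัน', u'กิน', u'ข้าว']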
Example #3
def analyze_sentence(sentence, is_thai, dictionary):
    se = etree.Element(u'se')
    if is_thai:
        tokens = pythai.split(sentence)
        for token in tokens:
            if token:
                word = etree.Element(u'w')
                if token in dictionary:
                    for analysis_number in dictionary[token]:
                        analysis = dictionary[token][analysis_number]
                        ana = etree.Element(u'ana')
                        ana.attrib[u'lex'] = token
                        ana.attrib[u'pos'] = u','.join(analysis[1])
                        ana.attrib[u'trans'] = analysis[0]
                        ana.attrib[u'translit'] = analysis[2]
                        word.append(ana)
                word.text = token
                se.append(word)
    else:
        # str.split() already discards tabs and newlines, so a single
        # split/join collapses all whitespace runs to single spaces.
        sentence = u' '.join(sentence.split())
        if sentence:
            word = etree.Element(u'w')
            ana = etree.Element(u'ana')
            ana.attrib[u'lex'] = u''
            ana.attrib[u'pos'] = u''
            ana.attrib[u'trans'] = u''
            ana.attrib[u'translit'] = u''
            word.append(ana)
            word.text = sentence
            se.append(word)
    return se
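A short driver sketch for analyze_sentence. The dictionary shape is inferred from the indexing above (token mapped to numbered analyses of the form (translation, [pos tags], transliteration)); the toy entry is hypothetical, not from the original project.

from lxml import etree
import pythai

# Hypothetical entry: token -> {n: (translation, [pos tags], transliteration)}
dictionary = {u'ข้าว': {0: (u'rice', [u'N'], u'khao')}}

se = analyze_sentence(u'ฉันกินข้าว', True, dictionary)
print(etree.tostring(se, encoding='unicode'))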
Example #4
def tag_text(text, dictionary): ##hmmm...
    #print u"###", text
    result = [u"<body>"]
    sents = text.split()
    for i in sents:
        result.append(u"<se>")
        for j in pythai.split(i):
            result.append(tag_word(j, dictionary))
        result.append(u"</se>")
    result.append(u"</body>")
    return create_xml(result)
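tag_word and create_xml are defined elsewhere in the original project; hypothetical stand-ins along these lines make the snippet self-contained, though they only approximate what the real helpers do.

def tag_word(token, dictionary):
    # Hypothetical stand-in: wrap the token in a <w> element, skipping
    # whatever dictionary lookup the real helper performs.
    return u'<w>%s</w>' % token

def create_xml(parts):
    # Hypothetical stand-in: join the accumulated tags into one string.
    return u'\n'.join(parts)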
Example #6
def test_split(self):
    for sentence in self.test_sentences:
        six.print_(sentence.split,
                   ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
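The fixture format is implied by the attribute access above: each test item carries the raw text and its expected space-joined segmentation. A hypothetical setup:

import collections

TestSentence = collections.namedtuple('TestSentence', ['sentence', 'split'])

# Hypothetical fixture; the expected segmentation is illustrative only.
test_sentences = [TestSentence(sentence=u'ฉันกินข้าว', split=u'ฉัน กิน ข้าว')]

The split attribute is also why the fixture items must be objects rather than plain strings, which would shadow str.split.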
Example #7
def test_split(self):
    for sentence in self.test_sentences:
        print(sentence.split, ' '.join(pythai.split(sentence.sentence)))
        self.assertEqual(' '.join(pythai.split(sentence.sentence)),
                         sentence.split)
Example #8
def pos(string):
    # Despite the name, this returns word tokens: pythai.split
    # segments text, it does not POS-tag.
    tokens = pythai.split(string)
    return tokens
Example #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
print(pythai.split(u"ฉันกินข้าว"))
print(pythai.split(u"ฉันwantกินseeข้าว"))
Example #10
import pythai

print(pythai.split(u"การที่ได้ต้องแสดงว่างานดี"))
Example #11
def textToSegmentedList(sentence):
    # change later
    #return sentence.split(u' ')
    return pythai.split(sentence)
Example #12
def pythai_split(u, limit=1):
    """
    Use PyThai to split Thai words; 'limit' is accepted but unused here.
    """
    return pythai.split(u)
Example #13
def token_iterator(sentence):
    tokens = pythai.split(sentence)
    for token in tokens:
        yield token
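Since pythai.split already returns a full list, the generator adds laziness only over iteration, not over the tokenization itself. Usage:

for token in token_iterator(u'ฉันกินข้าว'):
    print(token)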