Example #1
from wordcut import Wordcut

def segment(text, data=None):
    # Build the tokenizer from a custom word list if one is given,
    # otherwise fall back to the bundled bigthai dictionary.
    if not data:
        wordcut = Wordcut.bigthai()
    else:
        word_list = list(set(data))
        wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)
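A minimal usage sketch (the Thai sample strings and the custom word list are illustrative, not from the original project):

segment('ตัดคำภาษาไทย')                            # bundled bigthai dictionary
segment('แมวกินปลา', data=['แมว', 'กิน', 'ปลา'])   # custom word list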
Example #2
def segment(text,data=""):
    if data=="":
        wordcut = Wordcut.bigthai()
    else:
        word_list = list(set(data))
        wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)
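Unlike Example #1, this variant uses an empty string rather than None as the sentinel default. Note that data is still expected to be an iterable of words: passing a non-empty plain string would make list(set(data)) split it into individual characters.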
Example #3
from wordcut import Wordcut

def segment(text):
    # Tokenize with the bundled bigthai dictionary.
    wordcut = Wordcut.bigthai()
    return wordcut.tokenize(text)
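A quick check of this no-argument variant (the sample string is illustrative):

print(segment('สวัสดีครับ'))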
Example #4
from pythainlp.tokenize import word_tokenize
from wordcut import Wordcut

data = getdata()
# Earlier runs with other engines (saved as p1-p9) are kept below but disabled:
'''cut="\n".join(["|".join(word_tokenize(i))+"|" for i in data])
save(cut,"p1")
from testcut import cutok as cut1
from testcut2 import cutok as cut2
from testcut3 import cutok as cut3
from testcut4 import cutok as cut4
cut="\n".join([cut1(i)+"|" for i in data])
save(cut,"p2")
cut="\n".join([cut2(i)+"|" for i in data])
save(cut,"p3")
cut="\n".join([cut3(i)+"|" for i in data])
save(cut,"p4")
cut="\n".join([cut4(i)+"|" for i in data])
save(cut,"p5")
cut="\n".join(["|".join(word_tokenize(i,engine="ulmfit"))+"|" for i in data])
save(cut,"p6")
cut="\n".join(["|".join(word_tokenize(i,engine="longest"))+"|" for i in data])
save(cut,"p7")
cut="\n".join(["|".join(word_tokenize(i,engine="mm"))+"|" for i in data])
save(cut,"p8")
cut="\n".join(["|".join(word_tokenize(i,engine="icu"))+"|" for i in data])
save(cut,"p9")'''
# Tokenize with wordcut's bigthai dictionary and save as run "p11".
wordcut = Wordcut.bigthai()
cut = "\n".join(["|".join(wordcut.tokenize(i)) + "|" for i in data])
save(cut, "p11")
cut = "\n".join(
    ["|".join(word_tokenize(i, engine="deepcut")) + "|" for i in data])
save(cut, "p10")
Example #5
@classmethod
def default_segment(cls, inp):
    '''Segment an input with the default model (bigthai).'''
    # wordcut and clean are bindings from the surrounding project.
    tokens = wordcut.bigthai().tokenize(inp)
    tokens = clean(tokens)
    return ' '.join(tokens)
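Note that this rebuilds the bigthai dictionary on every call; Example #6 below exposes the model itself, so callers can construct it once and reuse it.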
Example #6
@classmethod
def get_default_model(cls):
    '''Get the default tokeniser model (bigthai).'''
    return wordcut.bigthai()
Example #7
def setUp(self):
    # Build the bigthai tokenizer once per test.
    self.wordcut = Wordcut.bigthai()
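A sketch of a test method that could follow this fixture (the method name and sample text are illustrative assumptions, not from the original suite):

def test_tokenize_returns_list(self):
    tokens = self.wordcut.tokenize('กากกา')  # illustrative Thai sample
    self.assertIsInstance(tokens, list)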