from wordcut import Wordcut


def segment(text, data=None):
    '''Tokenize `text`, using the default bigthai dictionary unless a
    custom word list is passed via `data`.'''
    if not data:
        wordcut = Wordcut.bigthai()
    else:
        word_list = list(set(data))  # deduplicate the custom word list
        wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)
def segment(text, data=""):
    '''Variant that signals "no custom dictionary" with an empty string
    instead of None; otherwise the same as the version above.'''
    if data == "":
        wordcut = Wordcut.bigthai()
    else:
        word_list = list(set(data))
        wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)
def segment(text):
    '''Minimal variant: always tokenize with the default bigthai dictionary.'''
    wordcut = Wordcut.bigthai()
    return wordcut.tokenize(text)
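# A minimal usage sketch for the `segment` variants above, assuming the
# wordcutpy package is installed; the Thai sample text and the custom
# word list are illustrative values, not taken from the original code.
print(segment("ตัดคำภาษาไทย"))  # default bigthai dictionary
print(segment("ตัดคำภาษาไทย", data=["ตัดคำ", "ภาษาไทย"]))  # custom dictionary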
from pythainlp.tokenize import word_tokenize
from wordcut import Wordcut

data = getdata()  # corpus loader defined elsewhere in the project

# Disabled runs for the other tokenizer engines (default newmm, four
# custom cutters, ulmfit, longest, mm, icu); kept for reference.
'''
cut = "\n".join(["|".join(word_tokenize(i)) + "|" for i in data])
save(cut, "p1")

from testcut import cutok as cut1
from testcut2 import cutok as cut2
from testcut3 import cutok as cut3
from testcut4 import cutok as cut4

cut = "\n".join([cut1(i) + "|" for i in data])
save(cut, "p2")
cut = "\n".join([cut2(i) + "|" for i in data])
save(cut, "p3")
cut = "\n".join([cut3(i) + "|" for i in data])
save(cut, "p4")
cut = "\n".join([cut4(i) + "|" for i in data])
save(cut, "p5")
cut = "\n".join(["|".join(word_tokenize(i, engine="ulmfit")) + "|" for i in data])
save(cut, "p6")
cut = "\n".join(["|".join(word_tokenize(i, engine="longest")) + "|" for i in data])
save(cut, "p7")
cut = "\n".join(["|".join(word_tokenize(i, engine="mm")) + "|" for i in data])
save(cut, "p8")
cut = "\n".join(["|".join(word_tokenize(i, engine="icu")) + "|" for i in data])
save(cut, "p9")
'''

# Active runs: wordcutpy's bigthai model and pythainlp's deepcut engine,
# writing one "|"-delimited line per input sentence.
wordcut = Wordcut.bigthai()
cut = "\n".join(["|".join(wordcut.tokenize(i)) + "|" for i in data])
save(cut, "p11")  # result writer defined elsewhere in the project

cut = "\n".join(["|".join(word_tokenize(i, engine="deepcut")) + "|" for i in data])
save(cut, "p10")
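# For reference, each file written by save() holds one line per input
# sentence, with tokens joined by "|" and a trailing "|", e.g.
# (illustrative): ตัด|คำ|ภาษา|ไทย|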
@classmethod
def default_segment(cls, inp):
    '''Segment an input with default model (bigthai)'''
    # assumes a `wordcut` handle exposing bigthai() is in scope
    tokens = wordcut.bigthai().tokenize(inp)
    tokens = clean(tokens)  # `clean` is a helper defined elsewhere
    return ' '.join(tokens)
@classmethod
def get_default_model(cls):
    '''Get default tokeniser model (bigthai)'''
    return wordcut.bigthai()
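# A self-contained sketch of the class context the two classmethods above
# appear to come from. The class name `Tokenizer`, the import alias, and
# the stand-in `clean` helper are assumptions, not from the original code.
from wordcut import Wordcut as wordcut  # gives `wordcut.bigthai()` as used above


def clean(tokens):
    # assumed stand-in: drop empty/whitespace-only tokens
    return [t for t in tokens if t.strip()]


class Tokenizer:
    @classmethod
    def get_default_model(cls):
        '''Get default tokeniser model (bigthai)'''
        return wordcut.bigthai()

    @classmethod
    def default_segment(cls, inp):
        '''Segment an input with default model (bigthai)'''
        tokens = wordcut.bigthai().tokenize(inp)
        tokens = clean(tokens)
        return ' '.join(tokens)


# usage (illustrative): Tokenizer.default_segment("ตัดคำภาษาไทย")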
def setUp(self):
    self.wordcut = Wordcut.bigthai()
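# A test method that might accompany the setUp above; the input string
# and the assertions are illustrative assumptions, not from the original suite.
def test_tokenize_returns_word_list(self):
    tokens = self.wordcut.tokenize("ตัดคำภาษาไทย")
    self.assertIsInstance(tokens, list)
    # dictionary segmentation covers the whole input, so the pieces
    # should rejoin to the original string
    self.assertEqual("".join(tokens), "ตัดคำภาษาไทย")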