def test_corpus(self):
    """Corpus data loads for every module and the new word list is larger.

    Uses the dedicated unittest helpers instead of
    ``assertEqual(x != None, True)``: ``is not None`` semantics plus far
    better failure messages.
    """
    self.assertIsNotNone(alphabet.get_data())
    self.assertIsNotNone(country.get_data())
    self.assertIsNotNone(tone.get_data())
    self.assertIsNotNone(provinces.get_data())
    # The "new" word list is expected to strictly extend the old one.
    self.assertTrue(len(newthaiword.get_data()) > len(thaiword.get_data()))
def file_trie(data):
    """Build (or load from a cached file) a marisa trie for a word list.

    data -- ``"newmm"`` for the old word list preprocessed into TCC-marked
            form, ``"old"`` for the old word list as-is; any other value
            selects the new word list (sharing the ``"old"`` cache file,
            as in the original logic).
    Returns the ``marisa_trie.Trie`` unpickled from the cache file.
    """
    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
    if not os.path.exists(path):
        os.makedirs(path)
    if data == "newmm":
        path = os.path.join(path, 'pythainlp_trie-tcc1.data')
    else:
        # "old" and every other value use the same cache file name.
        path = os.path.join(path, 'pythainlp_trie2.data')
    if not os.path.exists(path):  # cache file missing -> build it once
        if data == "newmm":
            from pythainlp.corpus.thaiword import get_data  # old word list
            words = get_data()
            for i, word in enumerate(words):
                marked = tcc.tcc(word, sep='#')
                # BUG FIX: the original tested
                # data2[len(data2[i]) - 1] != "#", i.e. it indexed the word
                # *list* by the word's *length*; the intent is clearly to
                # check the last character of the marked word itself.
                if not marked.endswith("#"):
                    marked += "#"
                words[i] = marked
            data = words
        elif data == 'old':
            from pythainlp.corpus.thaiword import get_data  # old word list
            data = get_data()
        else:
            from pythainlp.corpus.newthaiword import get_data  # new word list
            data = get_data()
        # `with` closes the file automatically; the explicit close() calls
        # of the original were redundant.
        with open(path, 'wb') as dill_file:
            dill.dump(marisa_trie.Trie(data), dill_file)
    with open(path, 'rb') as dill_file:
        data = dill.load(dill_file)
    return data
def file_trie(data):
    """Build (or load from a cached file) a marisa trie for a word list.

    data -- ``"newmm"`` for the old word list preprocessed into TCC-marked
            form, ``"old"`` for the old word list as-is; any other value
            selects the new word list (sharing the ``"old"`` cache file,
            as in the original logic).
    Returns the ``marisa_trie.Trie`` unpickled from the cache file.
    """
    path = get_path_pythainlp_data()
    if not os.path.exists(path):
        os.makedirs(path)
    if data == "newmm":
        path = os.path.join(path, 'pythainlp_trie-tcc1.data')
    else:
        # "old" and every other value use the same cache file name.
        path = os.path.join(path, 'pythainlp_trie2.data')
    if not os.path.exists(path):  # cache file missing -> build it once
        if data == "newmm":
            from pythainlp.corpus.thaiword import get_data  # old word list
            words = get_data()
            for i, word in enumerate(words):
                marked = tcc.tcc(word, sep='#')
                # BUG FIX: the original tested
                # data2[len(data2[i])-1] != "#", indexing the word *list*
                # by the word's *length*; the intent is the last character
                # of the marked word itself.
                if not marked.endswith("#"):
                    marked += "#"
                words[i] = marked
            data = words
        elif data == 'old':
            from pythainlp.corpus.thaiword import get_data  # old word list
            data = get_data()
        else:
            from pythainlp.corpus.newthaiword import get_data  # new word list
            data = get_data()
        # `with` closes the file automatically; the explicit close() calls
        # of the original were redundant.
        with open(path, 'wb') as dill_file:
            dill.dump(marisa_trie.Trie(data), dill_file)
    with open(path, 'rb') as dill_file:
        data = dill.load(dill_file)
    return data
def tcut(text):
    """Generator: segment `text`, yielding one LatticeString per chunk.

    Uses maximum-matching over a trie of dictionary words.  Each yielded
    LatticeString carries every way the chunk can be tokenized (via the
    nested `serialize` generator); out-of-dictionary spans are yielded
    with in_dict=False.
    """
    #global last_p, i, q, ww # for debug
    trie = Trie(get_data())
    words_at = defaultdict(list)  # main data structure: start position -> dictionary words found there
    def serialize(p, p2):  # helper function: enumerate '/'-joined word paths spanning [p, p2)
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + '/' + path
    q = {0}  # set of candidate word-boundary positions still to expand
    last_p = 0  # last position for yield
    while min(q) < len(text):
        p = min(q)
        q -= {p}  # q.pop, but for set
        # Every dictionary word starting at p extends the boundary set.
        for w in trie.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))
        if len(q) == 1:
            # All paths converge on a single boundary: flush one chunk.
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0
        # len(q) == 0 means the text at p is not in the dictionary
        if len(q) == 0:
            # skip ahead as little as possible: find the next position
            # where some dictionary word starts
            for i in range(p, len(text)):
                ww = trie.prefixes(text[i:])
                if ww:
                    break
            else:
                i = len(text)  # no dictionary word anywhere ahead
            w = text[p:i]
            w = w.replace(' ', '')  # drop spaces from the unknown span
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
def tagger(sent):
    """Classify a sentence with the trained Bayes model.

    Tokenizes with mmcut using the base dictionary extended by the words
    from the loaded pattern list, then feeds the space-joined tokens to
    the classifier.
    """
    extra_words = [entry[0] for entry in patterns]
    tokens = mmcut(sent, extra_words + get_data())
    return bayes.classify(' '.join(tokens))
cัtวะ c[ัื]tc[ุิะ]? c[ิุู]์ c[ะ-ู]t c็ ct[ะาำ]? แc็c แcc์ แctะ แcc็c แccc์ โctะ [เ-ไ]ct """.replace('c', '[ก-ฮ]').replace('t', '[่-๋]?').split()
# NOTE(review): the opening of the pattern-string assignment above lies
# outside this view. Per the .replace calls, 'c' expands to a Thai
# consonant class and 't' to an optional tone mark.
# Trie of dictionary words; presumably used elsewhere in this file — confirm.
THAI_WORDS = Trie(get_data())


def tcc(w):
    """Yield successive Thai Character Cluster (TCC) chunks of `w`.

    Positions matching no TCC pattern are emitted one character at a time.
    """
    p = 0
    # One alternation over all TCC patterns; the first match wins.
    pat = re.compile("|".join(pat_tcc))
    while p < len(w):
        m = pat.match(w[p:])
        if m:
            n = m.span()[1]  # length of the matched cluster
        else:
            n = 1  # no cluster pattern matched: single character
        yield w[p:p + n]
        p += n
# -*- coding: utf-8 -*-
"""Interactive demo: read a Thai sentence, tag it, and print the tag score.

Cleanups vs. the original: removed the duplicate `import nltk.tag`, the
commented-out duplicate corpus import, and the redundant `close()` calls
inside each `with` block (the context manager already closes the file).
"""
from pythainlp.tokenize.newmm import mmcut
from pythainlp.corpus.thaiword import get_data
import simplebayes
import nltk.tag
import dill

sentences = input("text : ")
# NOTE: dill.load (like pickle) executes arbitrary code while unpickling;
# only load these data files from a trusted source.
with open('patterns-classify-word-thai.data', 'rb') as in_strm:
    patterns = dill.load(in_strm)
with open('bayes-classify-word-thai.data', 'rb') as in_strm:
    bayes = dill.load(in_strm)
with open('classify-word-thai.data', 'rb') as in_strm:
    tagger = dill.load(in_strm)
r = tagger(sentences)
print(r)
# Print the probability score of the tag.
print(bayes.score(' '.join(mmcut(sentences, [i[0] for i in patterns] + get_data()))))
# -*- coding: utf-8 -*- from __future__ import absolute_import,division,unicode_literals,print_function from builtins import * # Longest matching # โค้ดจาก https://stackoverflow.com/a/11642687 from pythainlp.corpus.thaiword import get_data # ข้อมูลเก่า from math import log words=get_data() import re wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words)) maxword = max(len(x) for x in words) def segment(s): """ตัดคำภาษาไทยด้วย Longest matching""" # Find the best match for the i first characters, assuming cost has # been built for the i-1 first characters. # Returns a pair (match_cost, match_length). data = re.split(r'\n|\s+',s) # แยกช่องว่างและขึ้นประโยคใหม่ outall='' def best_match(i): candidates = enumerate(reversed(cost[max(0, i-maxword):i])) return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates) # Build the cost array. countlist=0 while countlist<len(data): s=data[countlist] cost = [0] for i in range(1,len(s)+1): c,k = best_match(i) cost.append(c) # Backtrack to recover the minimal-cost string.
# -*- coding: utf-8 -*- from __future__ import absolute_import, division, unicode_literals, print_function from builtins import * ''' ตัดคำภาษาไทยโดยใช้ Maximum Matching algorithm เดติดโค้ดต้นฉบับ คุณ Korakot Chaovavanich จาก https://www.facebook.com/groups/408004796247683/permalink/431283740586455/ และ https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716 ''' import re from marisa_trie import Trie from collections import defaultdict from pythainlp.corpus.thaiword import get_data trie = Trie(get_data()) class LatticeString(str): ''' String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี ''' def __new__(cls, value, multi=None, in_dict=True): return str.__new__(cls, value) def __init__(self, value, multi=None, in_dict=True): self.unique = True if multi: self.multi = list(multi) if len(self.multi) > 1: self.unique = False else: self.multi = [value] self.in_dict = in_dict # บอกว่าเป็นคำมีในดิกหรือเปล่า
# -*- coding: utf-8 -*- """ Fork from Peter Norvig's Python codes at http://norvig.com/spell-correct.html """ from __future__ import absolute_import,print_function,unicode_literals from builtins import * from pythainlp.corpus.thaiword import get_data from collections import Counter WORDS = Counter(get_data()) def P(word, N=sum(WORDS.values())): 'Probability of `word`.' return WORDS[word] / N def correction(word): 'แสดงคำที่เป็นไปได้มากที่สุด' return max(spell(word), key=P) def known(words): return list(w for w in words if w in WORDS) def edits1(word): letters = ['ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '\u0e3b', '\u0e3c', '\u0e3d', '\u0e3e', '฿', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์'] splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] replaces = [L + c + R[1:] for L, R in splits if R for c in letters] inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) def edits2(word): return (e2 for e1 in edits1(word) for e2 in edits1(e1)) def spell(word): if word=='': return '' else:
def test_corpus(self):
    """Every corpus module yields data; the new word list exceeds the old."""
    for corpus_module in (alphabet, country, tone, provinces):
        self.assertIsNotNone(corpus_module.get_data())
    self.assertTrue(len(newthaiword.get_data()) > len(thaiword.get_data()))
# -*- coding: utf-8 -*- from __future__ import absolute_import,division,unicode_literals,print_function from builtins import * ''' โปรแกรม multi-cut ตัดคำภาษาไทยโดยใช้ Maximum Matching algorithm เดติดโค้ดต้นฉบับ คุณ Korakot Chaovavanich จาก https://www.facebook.com/groups/408004796247683/permalink/431283740586455/ และ https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716 ''' import re from marisa_trie import Trie from collections import defaultdict from pythainlp.corpus.thaiword import get_data DEFAULT_DICT_TRIE = Trie(get_data()) class LatticeString(str): ''' String subclass เพื่อเก็บวิธีตัดหลายๆ วิธี ''' def __new__(cls, value, multi=None, in_dict=True): return str.__new__(cls, value) def __init__(self, value, multi=None, in_dict=True): self.unique = True if multi: self.multi = list(multi) if len(self.multi) > 1: self.unique = False else: self.multi = [value] self.in_dict = in_dict # บอกว่าเป็นคำมีในดิกหรือเปล่า
def test_corpus(self):
    """Corpus data loads for every module and the new word list is larger.

    Uses the dedicated unittest helpers instead of
    ``assertEqual(x != None, True)``: ``is not None`` semantics plus far
    better failure messages.
    """
    self.assertIsNotNone(alphabet.get_data())
    self.assertIsNotNone(country.get_data())
    self.assertIsNotNone(tone.get_data())
    self.assertIsNotNone(provinces.get_data())
    # The "new" word list is expected to strictly extend the old one.
    self.assertTrue(len(newthaiword.get_data()) > len(thaiword.get_data()))