Beispiel #1
0
 def test_corpus(self):
     """Check that every bundled corpus loads and that the new word list is larger.

     Uses the dedicated unittest assertions so that a failure reports the
     offending value instead of just ``False != True``, and so that the
     ``None`` check is an identity test rather than a ``!=`` comparison.
     """
     self.assertIsNotNone(alphabet.get_data())
     self.assertIsNotNone(country.get_data())
     self.assertIsNotNone(tone.get_data())
     self.assertIsNotNone(provinces.get_data())
     # The newer Thai word list is expected to contain more entries than the old one.
     self.assertGreater(
         len(newthaiword.get_data()), len(thaiword.get_data()))
Beispiel #2
0
def file_trie(data):
    """Return the word trie for a tokenizer, building and caching it on first use.

    The trie is stored with ``dill`` in the ``pythainlp-data`` directory under
    the user's home directory. When the cache file for the requested engine
    does not exist yet, it is built from the matching corpus and written to
    disk before being loaded back.

    :param data: engine selector. ``"newmm"`` builds from the ``thaiword``
        corpus with every word passed through ``tcc.tcc(word, sep='#')``;
        ``"old"`` builds from the ``thaiword`` corpus; any other value builds
        from the ``newthaiword`` corpus.
    :return: the object loaded with ``dill`` from the cache file (a
        ``marisa_trie.Trie`` when the file was written by this function).
    """
    path = os.path.join(os.path.expanduser("~"), 'pythainlp-data')
    if not os.path.exists(path):
        os.makedirs(path)

    if data == "newmm":
        path = os.path.join(path, 'pythainlp_trie-tcc1.data')
    else:
        # NOTE(review): "old" and the default engine share one cache file even
        # though they are built from different corpora, so whichever is built
        # first is reused by the other. Kept as in the original - confirm intent.
        path = os.path.join(path, 'pythainlp_trie2.data')

    if not os.path.exists(path):
        # No cache file yet: build the word list for the requested engine.
        if data == "newmm":
            from pythainlp.corpus.thaiword import get_data  # old word list
            words = []
            for word in get_data():
                segmented = tcc.tcc(word, sep='#')
                # Make sure every entry is terminated by exactly one separator.
                # The previous check indexed the word *list* instead of the
                # last character of the word, so '#' was always appended.
                if not segmented.endswith('#'):
                    segmented += '#'
                words.append(segmented)
        elif data == 'old':
            from pythainlp.corpus.thaiword import get_data  # old word list
            words = get_data()
        else:
            from pythainlp.corpus.newthaiword import get_data  # new word list
            words = get_data()
        with open(path, 'wb') as dill_file:
            dill.dump(marisa_trie.Trie(words), dill_file)

    # NOTE: dill.load can execute arbitrary code; the cache file must only ever
    # be one written by this function.
    with open(path, 'rb') as dill_file:
        return dill.load(dill_file)
Beispiel #3
0
def file_trie(data):
	"""Return the word trie for a tokenizer, building and caching it on first use.

	The trie is stored with ``dill`` in the directory returned by
	``get_path_pythainlp_data()``. When the cache file for the requested
	engine does not exist yet, it is built from the matching corpus and
	written to disk before being loaded back.

	:param data: engine selector. ``"newmm"`` builds from the ``thaiword``
		corpus with every word passed through ``tcc.tcc(word, sep='#')``;
		``"old"`` builds from the ``thaiword`` corpus; any other value builds
		from the ``newthaiword`` corpus.
	:return: the object loaded with ``dill`` from the cache file (a
		``marisa_trie.Trie`` when the file was written by this function).
	"""
	path = get_path_pythainlp_data()
	if not os.path.exists(path):
		os.makedirs(path)

	if data == "newmm":
		path = os.path.join(path, 'pythainlp_trie-tcc1.data')
	else:
		# NOTE(review): "old" and the default engine share one cache file even
		# though they are built from different corpora, so whichever is built
		# first is reused by the other. Kept as in the original - confirm intent.
		path = os.path.join(path, 'pythainlp_trie2.data')

	if not os.path.exists(path):
		# No cache file yet: build the word list for the requested engine.
		if data == "newmm":
			from pythainlp.corpus.thaiword import get_data  # old word list
			words = []
			for word in get_data():
				segmented = tcc.tcc(word, sep='#')
				# Make sure every entry is terminated by exactly one separator.
				# The previous check indexed the word *list* instead of the
				# last character of the word, so '#' was always appended.
				if not segmented.endswith('#'):
					segmented += '#'
				words.append(segmented)
		elif data == 'old':
			from pythainlp.corpus.thaiword import get_data  # old word list
			words = get_data()
		else:
			from pythainlp.corpus.newthaiword import get_data  # new word list
			words = get_data()
		with open(path, 'wb') as dill_file:
			dill.dump(marisa_trie.Trie(words), dill_file)

	# NOTE: dill.load can execute arbitrary code; the cache file must only ever
	# be one written by this function.
	with open(path, 'rb') as dill_file:
		return dill.load(dill_file)
Beispiel #4
0
def tcut(text):
    """Yield ``LatticeString`` chunks of ``text`` cut at unambiguous boundaries.

    A trie is built from ``get_data()`` on every call. The text is scanned
    left to right while tracking the set of positions reachable by dictionary
    words. Whenever that set collapses to a single position, the span since
    the previous cut is yielded together with all of its segmentations.
    Spans with no dictionary prefix are yielded with ``in_dict=False``.
    """
    dictionary = Trie(get_data())
    words_at = defaultdict(list)  # start position -> words beginning there

    def serialize(start, stop):
        # Enumerate every '/'-joined word path that spans exactly start..stop.
        for word in words_at[start]:
            nxt = start + len(word)
            if nxt > stop:
                continue
            if nxt == stop:
                yield word
                continue
            for tail in serialize(nxt, stop):
                yield word + '/' + tail

    text_len = len(text)
    frontier = {0}      # positions still to be expanded
    emitted_upto = 0    # end of the last chunk that was yielded
    while min(frontier) < text_len:
        pos = min(frontier)
        frontier.remove(pos)

        for word in dictionary.prefixes(text[pos:]):
            words_at[pos].append(word)
            frontier.add(pos + len(word))

        if len(frontier) == 1:
            # Every path meets at one position: safe place to cut.
            (boundary,) = frontier
            yield LatticeString(text[emitted_upto:boundary],
                                serialize(emitted_upto, boundary))
            emitted_upto = boundary
        elif not frontier:
            # Nothing in the dictionary starts here: skip the shortest
            # possible stretch until some dictionary word can start again.
            resume = text_len
            for idx in range(pos, text_len):
                if dictionary.prefixes(text[idx:]):
                    resume = idx
                    break
            chunk = text[pos:resume].replace(' ', '')  # drop spaces
            words_at[pos].append(chunk)
            yield LatticeString(chunk, in_dict=False)
            emitted_upto = resume
            frontier.add(resume)
Beispiel #5
0
def tagger(sent):
    """Classify ``sent`` with the module-level ``bayes`` classifier.

    The sentence is tokenized with ``mmcut`` using a word list made of the
    first element of every entry in ``patterns`` followed by ``get_data()``;
    the tokens are joined with single spaces before classification.
    """
    vocabulary = [entry[0] for entry in patterns] + get_data()
    tokens = mmcut(sent, vocabulary)
    return bayes.classify(' '.join(tokens))
Beispiel #6
0
cัtวะ
c[ัื]tc[ุิะ]?
c[ิุู]์
c[ะ-ู]t
c็
ct[ะาำ]?
แc็c
แcc์
แctะ
แcc็c
แccc์
โctะ
[เ-ไ]ct
""".replace('c', '[ก-ฮ]').replace('t', '[่-๋]?').split()

# Trie built once, at import time, from the word list returned by get_data().
THAI_WORDS = Trie(get_data())


def tcc(w):
    """Yield consecutive pieces of ``w`` matched by the ``pat_tcc`` patterns.

    At each position the alternation of all ``pat_tcc`` patterns is tried.
    The matched span is yielded, or a single character when nothing matches.
    """
    match_at_start = re.compile("|".join(pat_tcc)).match
    pos = 0
    total = len(w)
    while pos < total:
        found = match_at_start(w[pos:])
        # Fall back to one character so the scan always advances on a miss.
        step = found.end() if found else 1
        yield w[pos:pos + step]
        pos += step

# -*- coding: utf-8 -*-
from pythainlp.tokenize.newmm import mmcut
from pythainlp.corpus.thaiword import get_data
import simplebayes
import nltk.tag
import dill
#from pythainlp.corpus.thaiword import get_data
import nltk.tag
# Read a sentence, load the pickled pattern list, Bayes classifier and tagger,
# then print the predicted tag followed by the classifier's scores.
sentences = input("text : ")

# NOTE: dill.load can execute arbitrary code - only load data files that come
# from a trusted source. The `with` blocks close the files, so no explicit
# close() calls are needed afterwards.
with open('patterns-classify-word-thai.data', 'rb') as in_strm:
    patterns = dill.load(in_strm)
with open('bayes-classify-word-thai.data', 'rb') as in_strm:
    bayes = dill.load(in_strm)
with open('classify-word-thai.data', 'rb') as in_strm:
    tagger = dill.load(in_strm)

r = tagger(sentences)
print(r)
# Show the score of each tag for the same tokenized sentence.
print(bayes.score(' '.join(mmcut(sentences, [i[0] for i in patterns] + get_data()))))
Beispiel #8
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import,division,unicode_literals,print_function
from builtins import *
# Longest matching
# โค้ดจาก https://stackoverflow.com/a/11642687
from pythainlp.corpus.thaiword import get_data # ข้อมูลเก่า
from math import log
# Word list loaded at import time from pythainlp.corpus.thaiword.get_data().
words=get_data()
import re
# Cost of each word, derived only from its position in `words`:
# log((rank + 1) * log(len(words))), so earlier entries are cheaper.
wordcost = dict(
    (word, log((rank + 1) * log(len(words))))
    for rank, word in enumerate(words)
)
# Length of the longest word in the list.
maxword = max(map(len, words))
def segment(s):
    """Segment Thai text into words using longest matching.

    The input is split on newlines and whitespace, and a cost array is built
    for each resulting piece from the module-level ``wordcost`` and
    ``maxword`` tables.

    NOTE(review): this excerpt ends before the backtracking step, so the
    return value and the loop increment are not visible here.
    """

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    data = re.split(r'\n|\s+',s) # split on whitespace and line breaks
    outall=''
    def best_match(i):
        # Reads `cost` and `s` from the enclosing scope; substrings missing
        # from `wordcost` get the cost 9e999 (float infinity).
        candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
        return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)
    # Build the cost array.
    countlist=0
    while countlist<len(data):
        s=data[countlist]
        cost = [0]
        for i in range(1,len(s)+1):
            c,k = best_match(i)
            cost.append(c)
        # Backtrack to recover the minimal-cost string.
Beispiel #9
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, unicode_literals, print_function
from builtins import *
'''
ตัดคำภาษาไทยโดยใช้ Maximum Matching algorithm
เดติดโค้ดต้นฉบับ คุณ Korakot Chaovavanich
จาก https://www.facebook.com/groups/408004796247683/permalink/431283740586455/
และ https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716
'''
import re
from marisa_trie import Trie
from collections import defaultdict
from pythainlp.corpus.thaiword import get_data
# Module-level trie built at import time from get_data()
# (imported from pythainlp.corpus.thaiword).
trie = Trie(get_data())


class LatticeString(str):
    """String subclass that also stores the alternative ways to cut it.

    Attributes:
        multi: list of alternative segmentations; ``[value]`` when none given.
        unique: ``False`` only when ``multi`` holds more than one entry.
        in_dict: whether the word was found in the dictionary.
    """

    def __new__(cls, value, multi=None, in_dict=True):
        # str is immutable, so the text itself has to be fixed here; the
        # extra arguments are handled by __init__.
        return str.__new__(cls, value)

    def __init__(self, value, multi=None, in_dict=True):
        # A falsy `multi` (None, empty list) falls back to the plain value.
        alternatives = list(multi) if multi else [value]
        self.multi = alternatives
        self.unique = len(alternatives) <= 1
        self.in_dict = in_dict  # tells whether the word is in the dictionary
Beispiel #10
0
# -*- coding: utf-8 -*-
"""
Fork from Peter Norvig's Python codes at http://norvig.com/spell-correct.html
"""
from __future__ import absolute_import,print_function,unicode_literals
from builtins import *
from pythainlp.corpus.thaiword import get_data
from collections import Counter
# Occurrence count of every entry in the word list returned by get_data().
WORDS = Counter(get_data())
def P(word, N=sum(WORDS.values())):
    """Return the count of ``word`` in ``WORDS`` divided by ``N``.

    ``N`` defaults to the total of all counts, computed once when the
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N
def correction(word):
    """Return the most probable candidate for ``word``.

    Candidates come from ``spell(word)`` and are ranked with ``P``.
    """
    candidates = spell(word)
    return max(candidates, key=P)
def known(words):
    """Return, as a list, the entries of ``words`` that are present in ``WORDS``."""
    return [candidate for candidate in words if candidate in WORDS]
def edits1(word):
    """Return the set of strings that are one edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by one of ``letters``, or an insertion of one
    of ``letters`` at any position.
    """
    letters = ['ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '\u0e3b', '\u0e3c', '\u0e3d', '\u0e3e', '฿', 'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์']
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the very end.
        for letter in letters:
            candidates.add(head + letter + tail)
        if not tail:
            continue
        rest = tail[1:]
        candidates.add(head + rest)  # deletion
        for letter in letters:
            candidates.add(head + letter + rest)  # replacement
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])  # transposition
    return candidates
def edits2(word):
    """Return a generator over all strings two edits away from ``word``.

    The first round of edits is computed immediately; the second round is
    produced lazily as the generator is consumed. Duplicates are not removed.
    """
    first_round = edits1(word)
    return (
        twice_edited
        for once_edited in first_round
        for twice_edited in edits1(once_edited)
    )
def spell(word):
    if word=='':
        return ''
    else:
Beispiel #11
0
 def test_corpus(self):
     """Every corpus must return data, and the new word list must be the larger one."""
     for corpus in (alphabet, country, tone, provinces):
         self.assertIsNotNone(corpus.get_data())
     new_count = len(newthaiword.get_data())
     old_count = len(thaiword.get_data())
     self.assertTrue(new_count > old_count)
# -*- coding: utf-8 -*-
from __future__ import absolute_import,division,unicode_literals,print_function
from builtins import *
'''
โปรแกรม multi-cut
ตัดคำภาษาไทยโดยใช้ Maximum Matching algorithm
เดติดโค้ดต้นฉบับ คุณ Korakot Chaovavanich
จาก https://www.facebook.com/groups/408004796247683/permalink/431283740586455/
และ https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716
'''
import re
from marisa_trie import Trie
from collections import defaultdict
from pythainlp.corpus.thaiword import get_data
# Default dictionary trie, built at import time from get_data()
# (imported from pythainlp.corpus.thaiword).
DEFAULT_DICT_TRIE = Trie(get_data())
class LatticeString(str):
    """String subclass that keeps the different ways the text can be cut.

    Attributes:
        multi: list of alternative segmentations; ``[value]`` when none given.
        unique: ``True`` unless ``multi`` holds more than one entry.
        in_dict: whether the word was found in the dictionary.
    """

    def __new__(cls, value, multi=None, in_dict=True):
        # Only the text goes into the immutable str part of the object.
        return str.__new__(cls, value)

    def __init__(self, value, multi=None, in_dict=True):
        self.in_dict = in_dict  # tells whether the word is in the dictionary
        if not multi:
            # No alternatives supplied: the value is its own only reading.
            self.multi = [value]
            self.unique = True
            return
        self.multi = list(multi)
        self.unique = not len(self.multi) > 1
Beispiel #13
0
	def test_corpus(self):
		"""Check that every bundled corpus loads and that the new word list is larger.

		Uses the dedicated unittest assertions so that a failure reports the
		offending value instead of just ``False != True``, and so that the
		``None`` check is an identity test rather than a ``!=`` comparison.
		"""
		self.assertIsNotNone(alphabet.get_data())
		self.assertIsNotNone(country.get_data())
		self.assertIsNotNone(tone.get_data())
		self.assertIsNotNone(provinces.get_data())
		# The newer Thai word list is expected to contain more entries than the old one.
		self.assertGreater(len(newthaiword.get_data()), len(thaiword.get_data()))