def test_word_tokenize_longest(self):
    self.assertEqual(longest.segment(None), [])
    self.assertEqual(longest.segment(""), [])
    self.assertIsInstance(
        longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"), list
    )
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )

    longest_tokenizer = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
    self.assertEqual(
        longest_tokenizer.word_tokenize("ปวดเฉียบพลัน"),
        ["ปวด", "เฉียบพลัน"],
    )
    self.assertEqual(
        longest_tokenizer.word_tokenize("เฉียบพลัน"),
        ["เฉียบพลัน"],
    )
def tokenize(request):
    import csv

    # Load the Kammuang (Northern Thai dialect) word database from CSV
    KammuangDB = list()
    with open('./KammuangDB.csv', 'rt') as f:
        data = csv.reader(f)
        for row in data:
            KammuangDB.append(row)
    # return KammuangDB

    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import Tokenizer

    # Tokenize a Northern Thai sample sentence with a custom dictionary
    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
    _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY)
    text_af = _tokenizer.word_tokenize(text)
    # return HttpResponse("E %s" % _tokenizer.word_tokenize(text))

    # def index(request):
    #     testvar = 'value'
    #     return render(request, 'template.html', {'testvar': testvar})

    return render(
        request,
        "rrddisplay/tokenize.html",
        {'text': text, 'text_af': text_af, 'KammuangDB': KammuangDB},
    )
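
# --- Illustrative sketch (not part of the view above) ---
# PyThaiNLP's Tokenizer also accepts a path to a plain-text word list with
# one word per line, as used via PATH_TO_CUSTOM_DICTIONARY above. The dialect
# entries below are assumptions chosen to match the sample sentence, not the
# project's real custom_dictionary.txt.
from pythainlp.tokenize import Tokenizer

with open("custom_dictionary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(["อู้", "บะดาย", "บ่าดาย", "ก๋า", "บึงกาฬ"]))

demo_tokenizer = Tokenizer(custom_dict="custom_dictionary.txt")
print(demo_tokenizer.word_tokenize("อู้บ่าดาย"))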
"ห้า": 5, "หก": 6, "เจ็ด": 7, "แปด": 8, "เก้า": 9, } _powers_of_10 = { "สิบ": 10, "ร้อย": 100, "พัน": 1000, "หมื่น": 10000, "แสน": 100000, # "ล้าน" was excluded as a special case } _valid_tokens = set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน"} _tokenizer = Tokenizer(custom_dict=_valid_tokens) def thaiword_to_num(word: str) -> int: """ Converts the spelled-out numerals in Thai scripts into an actual integer. :param str word: Spelled-out numerals in Thai scripts :return: Corresponding integer value of the input :rtype: int :Example: :: from pythainlp.util import thaiword_to_num
    'ท๊อปส์', 'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place',
    'big c', 'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม',
    'ซื้อประจำ', 'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้',
    'นมแพะ', 'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท',
    'นำ้หนัก', 'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม', 'ท้องเสีย',
    'ขี้แตก', 'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ', 'คนท้อง',
    'มวลกระดูก', 'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย', 'ลูกสุนัข',
    'ลูกหมา', 'คายทิ้ง', 'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์', 'สตอรี่',
    'อยากสูง', 'ส่วนสูง', 'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน', 'ไม่มี',
    'ไม่ชอบ', 'ไม่ได้', 'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น', 'อเมซอน',
    'นมเมจิสีฟ้า', 'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า', 'ไขมันทรานส์',
    'ดาราเดลี่', 'แดรี่ฟาร์ม', 'แดรี่ควีน'
]

words = set(thai_words()).union(set(custom_list))
_trie = dict_trie(dict_source=words)
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:  # check whether the word is a stopword (filler word)
    return word in thai_stopwords()


def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word
    features = {
        "word.word": word,
"เอ็ด": 1, # กำหนดค่าของหน่วยเวลา "โมงเช้า": 6, # เริ่มนับ 7:00 "โมงเย็น": 13, "บ่าย": 13, "บ่ายโมง": 13, "ตี": 0, "เที่ยงวัน": 12, "เที่ยงคืน": 0, "เที่ยง": 12, "ทุ่ม": 18, "นาฬิกา": 0, "ครึ่ง": 30, } _THAI_TIME_CUT = Tokenizer( custom_dict=list(_DICT_THAI_TIME.keys()), engine="newmm" ) def _format_6h(h: int) -> str: """Thai time (6-hour clock).""" text = "" if h == 0: text += "เที่ยงคืน" elif h < 7: text += "ตี" + num_to_thaiword(h) elif h < 12: text += num_to_thaiword(h - 6) + "โมงเช้า" elif h == 12: text += "เที่ยง"
    for line in f:
        pos.append(line.rstrip())

with open("neg.txt", 'r') as f:
    for line in f:
        neg.append(line.rstrip())

url = '35213250'
opinions = []
with open(url + ".txt", 'r') as f:
    for line in f:
        opinions.append(line.rstrip())

mydict = pos + neg
tokenizer = Tokenizer(custom_dict=mydict, engine='newmm')

for opinion in opinions:
    neg_count = 0
    pos_count = 0
    print(opinion)
    text = tokenizer.word_tokenize(opinion)
    for word in text:
        if word in pos:
            pos_count = pos_count + 1
        if word in neg:
            neg_count = neg_count + 1
    if pos_count > neg_count:
        print('Positive')
    elif neg_count > pos_count:
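
# --- Illustrative refactor (not part of the original script) ---
# The same lexicon-count idea wrapped in a function; sets make the
# membership tests O(1) and keep the polarity decision in one place.
from typing import Set
from pythainlp.tokenize import Tokenizer


def classify(opinion: str, pos_words: Set[str], neg_words: Set[str],
             tokenizer: Tokenizer) -> str:
    tokens = tokenizer.word_tokenize(opinion)
    pos_count = sum(1 for w in tokens if w in pos_words)
    neg_count = sum(1 for w in tokens if w in neg_words)
    if pos_count > neg_count:
        return "Positive"
    if neg_count > pos_count:
        return "Negative"
    return "Neutral"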
"merge_wgts", "pre_rules_th", "post_rules_th", "pre_rules_th_sparse", "post_rules_th_sparse", "process_thai", "_THWIKI_LSTM", ] device = torch.device("cuda" if torch.cuda.is_available() else "cpu") _MODEL_NAME_LSTM = "wiki_lm_lstm" _ITOS_NAME_LSTM = "wiki_itos_lstm" _THAI2FIT_WORDS = get_corpus("words_th_thai2fit_201810.txt") _pythainlp_tokenizer = Tokenizer(custom_dict=_THAI2FIT_WORDS, engine="newmm") # Download pretrained models def _get_path(fname: str) -> str: """ :meth: download get path of file from pythainlp-corpus :param str fname: file name :return: path to downloaded file """ path = get_corpus_path(fname) if not path: download(fname) path = get_corpus_path(fname) return path
:See Also:
    Inrut, Jeeragone, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat,
    and Para Limmaneepraserth. "Thai word segmentation using combination of
    forward and backward longest matching techniques." In International
    Symposium on Communications and Information Technology (ISCIT),
    pp. 37-40. 2001.
"""
import re
from typing import List

from pythainlp import thai_follow_vowels
from pythainlp.corpus import get_corpus
from pythainlp.tokenize import Tokenizer

_cut_etcc = Tokenizer(get_corpus("etcc.txt"), engine="longest")
_PAT_ENDING_CHAR = f"[{thai_follow_vowels}ๆฯ]"
_RE_ENDING_CHAR = re.compile(_PAT_ENDING_CHAR)


def _cut_subword(tokens: List[str]) -> List[str]:
    len_tokens = len(tokens)
    i = 0
    while True:
        if i == len_tokens:
            break
        if _RE_ENDING_CHAR.search(tokens[i]) and i > 0 and len(tokens[i]) == 1:
            tokens[i - 1] += tokens[i]
            del tokens[i]
            len_tokens -= 1
        i += 1
# -*- coding: utf-8 -*-
from typing import List

from pythainlp.tokenize import Tokenizer
from laonlp.corpus import lao_words

_word = Tokenizer(lao_words(), engine="mm")


def word_tokenize(sent: str) -> List[str]:
    """
    Lao word tokenizer

    :param str sent: Lao text
    :return: a list of Lao words
    :rtype: list
    """
    return _word.word_tokenize(sent)


def sent_tokenize(txt: str) -> List[str]:
    """
    Sentence tokenizer. Splits Lao text into sentences.

    :param str txt: Lao text
    :return: a list of Lao sentences
    :rtype: list
    """
    return txt.split(".")
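
# --- Illustrative usage of the functions above ---
# The Lao sample strings are my own (roughly "hello" and "thank you"),
# not taken from the LaoNLP test suite.
print(word_tokenize("ສະບາຍດີ"))
print(sent_tokenize("ສະບາຍດີ. ຂອບໃຈ."))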
def __init__(self, lang='th'):
    self.lang = lang
    self.pyengine = PyThaiTokenizer(
        os.path.join(github_path, 'words_modified.txt'))
def test_Tokenizer(self):
    t_test = Tokenizer()
    self.assertEqual(t_test.word_tokenize(""), [])
"หก": 6, "เจ็ด": 7, "แปด": 8, "เก้า": 9, } _powers_of_10 = { "สิบ": 10, "ร้อย": 100, "พัน": 1000, "หมื่น": 10000, "แสน": 100000, # "ล้าน" was excluded as a special case } _valid_tokens = (set(_digits.keys()) | set(_powers_of_10.keys()) | {"ล้าน", "ลบ"}) _tokenizer = Tokenizer(custom_dict=_valid_tokens) def _check_is_thainum(word: str): for j in list(_digits.keys()): if j in word: return (True, 'num') for j in ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด", "ลบ"]: if j in word: return (True, 'unit') return (False, None) _dict_words = [i for i in list(thai_words()) if not _check_is_thainum(i)[0]] _dict_words += list(_digits.keys()) _dict_words += ["สิบ", "ร้อย", "พัน", "หมื่น", "แสน", "ล้าน", "จุด"]
def test_Tokenizer(self):
    t_test = Tokenizer(FROZEN_DICT_TRIE)
    self.assertEqual(t_test.word_tokenize(""), [])

    t_test.set_tokenize_engine("longest")
    self.assertEqual(t_test.word_tokenize(None), [])
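
# --- Illustrative sketch of the engine-switching API exercised above ---
# A Tokenizer keeps its custom dictionary when the engine is changed with
# set_tokenize_engine; the sample words mirror the ones used in the other
# Tokenizer test in this collection.
from pythainlp.tokenize import Tokenizer

t = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"], engine="newmm")
print(t.word_tokenize("ปวดเฉียบพลัน"))
t.set_tokenize_engine("longest")
print(t.word_tokenize("ปวดเฉียบพลัน"))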