def test_Tokenizer(self):
    _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)
    self.assertEqual(_tokenizer.word_tokenize(""), [])
    _tokenizer.set_tokenize_engine("longest")
    self.assertEqual(_tokenizer.word_tokenize(None), [])

    _tokenizer = Tokenizer()
    self.assertEqual(_tokenizer.word_tokenize("ก"), ["ก"])
def test_Tokenizer(self):
    t_test = Tokenizer(FROZEN_DICT_TRIE)
    self.assertEqual(t_test.word_tokenize(""), [])
    t_test.set_tokenize_engine("longest")
    self.assertEqual(t_test.word_tokenize(None), [])

    t_test = Tokenizer()
    self.assertEqual(t_test.word_tokenize("ก"), ["ก"])
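# A minimal usage sketch of the API the tests above exercise, assuming
# pythainlp >= 2.x (where dict_trie lives in pythainlp.util). The added word
# "ทดสอบคำ" and the sample sentence are illustrative only, not from the source.
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util import dict_trie

custom_words = set(thai_words())        # start from the default word list
custom_words.add("ทดสอบคำ")             # add an illustrative custom word
trie = dict_trie(dict_source=custom_words)

tokenizer = Tokenizer(custom_dict=trie, engine="newmm")
print(tokenizer.word_tokenize("ทดสอบคำภาษาไทย"))

tokenizer.set_tokenize_engine("longest")  # switch engine on the same instance
print(tokenizer.word_tokenize("ทดสอบคำภาษาไทย"))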
class ThaiTokenizer(BaseTokenizer):
    def __init__(self, lang='th'):
        self.lang = lang
        self.pyengine = PyThaiTokenizer(
            os.path.join(github_path, 'words_modified.txt'))

    def tokenizer(self, t):
        return self.pyengine.word_tokenize(t)
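# A hypothetical usage sketch for the wrapper above. BaseTokenizer,
# PyThaiTokenizer, and github_path are assumed to be provided by the
# surrounding project, and 'words_modified.txt' is assumed to be a
# one-word-per-line dictionary file.
thai_tok = ThaiTokenizer(lang='th')
print(thai_tok.tokenizer("ฉันรักภาษาไทย"))  # expected: a list of Thai tokens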
def test_longest(self):
    self.assertEqual(longest.segment(None), [])
    self.assertEqual(longest.segment(""), [])
    self.assertIsInstance(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"), list)
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )

    longest_tokenizer = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
    self.assertEqual(
        longest_tokenizer.word_tokenize("ปวดเฉียบพลัน"),
        ["ปวด", "เฉียบพลัน"],
    )
    self.assertEqual(
        longest_tokenizer.word_tokenize("เฉียบพลัน"),
        ["เฉียบพลัน"],
    )
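# A small sketch, assuming pythainlp >= 2.x, of passing a custom dictionary
# directly to word_tokenize() with the "longest" engine; the word list and the
# sample text mirror the test above and are illustrative only.
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie

trie = dict_trie(dict_source=["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
print(word_tokenize("ปวดเฉียบพลัน", custom_dict=trie, engine="longest"))
# expected: ['ปวด', 'เฉียบพลัน']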
import codecs

# Imports assumed for pythainlp >= 2.x; wordListToFreqDict() and sortFreqDict()
# are helper functions defined elsewhere in this script.
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util import dict_trie


def main():
    engineOption = ["newmm", "longest-matching", "dict", "ulmfit"]
    f = codecs.open('input.txt', encoding='utf-8')
    fsort = open("output-sort.csv", "w", encoding="utf-8")

    text = ""
    for line in f:
        # print(line)
        text = text + line

    # Extend the default Thai word list with domain-specific words.
    custom_words_list = set(thai_words())
    custom_words_list.add('รีเทนเนอร์')
    custom_words_list.add('จัดฟัน')
    custom_words_list.add('ฟันชิด')
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    # words = word_tokenize(text, engine=engineOption[0])
    words = _tokenizer.word_tokenize(text)

    i = 0
    wordsNew = ""
    for word in words:
        # Skip whitespace, separators, and pure numbers.
        if word and (not word.isspace()) and word != '-' and word != '/' and not word.isnumeric():
            i = i + 1
            # print(i, ': ', word.strip())
            wordsNew = wordsNew + word.strip() + " "
    f.close()

    print('------ Starting to count words: ------')
    wordlist = wordsNew.split()
    wordfreq = []
    for w in wordlist:
        wordfreq.append(wordlist.count(w.strip()))
        i = i + 1
        if (i % 150 == 0):
            print(".")
        else:
            print(".", end='')

    # Build and sort the frequency dictionary once, after counting.
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)

    print('------ Starting to sort words and write to file ------')
    for s in sorteddict:
        print(s[1], "|", s[0])
        fsort.write(s[1] + "|" + str(s[0]))
        fsort.write('\n')
    fsort.close()
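# An alternative sketch for the counting step above: collections.Counter avoids
# the O(n^2) wordlist.count() loop. Purely illustrative; the variable names
# follow the snippet above and the (count, word) ordering mimics sortFreqDict().
from collections import Counter


def count_words(wordsNew):
    wordlist = wordsNew.split()
    freq = Counter(w.strip() for w in wordlist)
    # Sort by frequency, highest first.
    return sorted(((count, word) for word, count in freq.items()), reverse=True)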
def tokenize(request):
    import csv

    KammuangDB = list()
    with open('./KammuangDB.csv', 'rt') as f:
        data = csv.reader(f)
        for row in data:
            KammuangDB.append(row)
    # return KammuangDB

    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import Tokenizer

    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
    _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY)
    text_af = _tokenizer.word_tokenize(text)
    # return HttpResponse("E %s" % _tokenizer.word_tokenize(text))

    # def index(request):
    #     testvar = 'value'
    #     return render(request, 'template.html', {'testvar': testvar})

    return render(request, "rrddisplay/tokenize.html",
                  {'text': text, 'text_af': text_af, 'KammuangDB': KammuangDB})
with open("neg.txt", 'r') as f: for line in f: neg.append(line.rstrip()) url = '35213250' opinions = [] with open(url + ".txt", 'r') as f: for line in f: opinions.append(line.rstrip()) mydict = pos + neg tokenizer = Tokenizer(custom_dict=mydict, engine='newmm') for opinion in opinions: neg_count = 0 pos_count = 0 print(opinion) text = tokenizer.word_tokenize(opinion) for word in text: if word in pos: pos_count = pos_count + 1 if word in neg: neg_count = neg_count + 1 if pos_count > neg_count: print('Positive') elif neg_count > pos_count: print('Negative') else: print('Neutral')
def test_Tokenizer(self):
    t_test = Tokenizer()
    self.assertEqual(t_test.word_tokenize(""), [])
def test_Tokenizer(self):
    t_test = Tokenizer(FROZEN_DICT_TRIE)
    self.assertEqual(t_test.word_tokenize(""), [])
    t_test.set_tokenize_engine("longest")
    self.assertEqual(t_test.word_tokenize(None), [])