def test_Tokenizer(self):
        _tokenizer = Tokenizer(DEFAULT_WORD_DICT_TRIE)
        self.assertEqual(_tokenizer.word_tokenize(""), [])
        _tokenizer.set_tokenize_engine("longest")
        self.assertEqual(_tokenizer.word_tokenize(None), [])

        _tokenizer = Tokenizer()
        self.assertEqual(_tokenizer.word_tokenize("ก"), ["ก"])
Example #2
    def test_Tokenizer(self):
        t_test = Tokenizer(FROZEN_DICT_TRIE)
        self.assertEqual(t_test.word_tokenize(""), [])
        t_test.set_tokenize_engine("longest")
        self.assertEqual(t_test.word_tokenize(None), [])

        t_test = Tokenizer()
        self.assertEqual(t_test.word_tokenize("ก"), ["ก"])
Example #3
class ThaiTokenizer(BaseTokenizer):
    def __init__(self, lang='th'):
        self.lang = lang
        self.pyengine = PyThaiTokenizer(
            os.path.join(github_path, 'words_modified.txt'))

    def tokenizer(self, t):
        return self.pyengine.word_tokenize(t)
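The wrapper above depends on a PyThaiTokenizer class and a github_path that are defined elsewhere in its project. Below is a minimal sketch of an equivalent wrapper built directly on pythainlp's Tokenizer, assuming the dictionary is a plain-text file with one word per line (the class name and default path are illustrative, not part of the original project):

from pythainlp.tokenize import Tokenizer


class SimpleThaiTokenizer:
    """Hypothetical wrapper around pythainlp's Tokenizer using a word-per-line file."""

    def __init__(self, word_file='words_modified.txt', lang='th'):
        self.lang = lang
        # Read the custom dictionary; blank lines are skipped.
        with open(word_file, encoding='utf-8') as f:
            words = [w.strip() for w in f if w.strip()]
        self._engine = Tokenizer(custom_dict=words, engine='newmm')

    def tokenizer(self, text):
        return self._engine.word_tokenize(text)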
Example #4
 def test_longest(self):
     self.assertEqual(longest.segment(None), [])
     self.assertEqual(longest.segment(""), [])
     self.assertIsInstance(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"),
                           list)
     self.assertEqual(
         word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
         ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
     )
     longest_tokenizer = Tokenizer(["ปวด", "เฉียบ", "พลัน", "เฉียบพลัน"])
     self.assertEqual(
         longest_tokenizer.word_tokenize("ปวดเฉียบพลัน"),
         ["ปวด", "เฉียบพลัน"],
     )
     self.assertEqual(
         longest_tokenizer.word_tokenize("เฉียบพลัน"),
         ["เฉียบพลัน"],
     )
Example #5
import codecs

from pythainlp.corpus import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util import dict_trie  # dict_trie's module may differ across PyThaiNLP versions


def main():
    engineOption = ["newmm", "longest-matching", "dict", "ulmfit"]
    f = codecs.open('input.txt', encoding='utf-8')
    fsort = open("output-sort.csv", "w", encoding="utf-8")

    text = ""
    for line in f:
        # print (line)
        text = text + line

    custom_words_list = set(thai_words())
    custom_words_list.add('รีเทนเนอร์')
    custom_words_list.add('จัดฟัน')
    custom_words_list.add('ฟันชิด')
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    # words = word_tokenize(text, engine=engineOption[0])
    words = _tokenizer.word_tokenize(text)
    i = 0
    wordsNew = ""
    for word in words:
        # Skip blanks, separators, and pure numbers.
        if word and not word.isspace() and word not in ('-', '/') and not word.isnumeric():
            i = i + 1
            # print(i, ':', word.strip())
            wordsNew = wordsNew + word.strip() + " "
    f.close()

    print('------ Starting to count words: ------')
    wordlist = wordsNew.split()
    # Build the frequency dictionary once; wordListToFreqDict and sortFreqDict
    # are helper functions defined elsewhere in the original script.
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)
    # Print a progress dot per counted word, breaking the line every 150 dots.
    for count, _word in enumerate(wordlist, start=1):
        if count % 150 == 0:
            print(".")
        else:
            print(".", end='')

    print('------ Starting to sort words and write to file ------')
    for s in sorteddict:
        print(s[1], "|", s[0])
        fsort.write(s[1] + "|" + str(s[0]))
        fsort.write('\n')
    fsort.close()
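The counting step in the example above relies on the wordListToFreqDict and sortFreqDict helpers defined elsewhere in that script. A minimal sketch of the same count-and-sort step using only collections.Counter from the standard library (the function name is illustrative):

from collections import Counter


def count_and_sort(wordlist):
    # Count each token, then return (frequency, word) pairs, highest first.
    freq = Counter(wordlist)
    return sorted(((n, w) for w, n in freq.items()), reverse=True)

# count_and_sort(["ไทย", "รัก", "ไทย"]) -> [(2, 'ไทย'), (1, 'รัก')]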
Example #6
from django.shortcuts import render  # assumed: imported at the top of the Django views module


def tokenize(request):
    import csv

    # Load each row of KammuangDB.csv into a list.
    KammuangDB = list()
    with open('./KammuangDB.csv', 'rt') as f:
        data = csv.reader(f)
        for row in data:
            KammuangDB.append(row)
    # return KammuangDB
    
    from pythainlp.corpus.common import thai_words
    from pythainlp.tokenize import Tokenizer

    text = "ขอน้ำบะดาย อู้บ่าดาย อู้เล่นบะได้ก๋า จะไปบึงกาฬ"
    PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
    _tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY)
    text_af = _tokenizer.word_tokenize(text)
    # return HttpResponse("E %s" %_tokenizer.word_tokenize(text))
    # def index(request):
    # testvar = 'value'
    # return render(request, 'template.html', {'testvar': testvar})
    
    return render(request, "rrddisplay/tokenize.html", {'text':text,'text_af':text_af,'KammuangDB':KammuangDB})
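Example #6 passes a file path as custom_dict; PyThaiNLP reads such a dictionary file as plain text with one word per line. A minimal sketch of creating that file (the word entries here are placeholders, not the project's actual list):

# Write a word-per-line custom dictionary usable as Tokenizer(custom_dict=<path>).
custom_words = ["บะดาย", "บ่าดาย", "ก๋า"]  # placeholder entries
with open("custom_dictionary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(custom_words) + "\n")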
Example #7
with open("neg.txt", 'r') as f:
    for line in f:
        neg.append(line.rstrip())

url = '35213250'
opinions = []
with open(url + ".txt", 'r') as f:
    for line in f:
        opinions.append(line.rstrip())

mydict = pos + neg

tokenizer = Tokenizer(custom_dict=mydict, engine='newmm')

for opinion in opinions:
    neg_count = 0
    pos_count = 0
    print(opinion)
    text = tokenizer.word_tokenize(opinion)
    for word in text:
        if word in pos:
            pos_count = pos_count + 1
        if word in neg:
            neg_count = neg_count + 1

    if pos_count > neg_count:
        print('Positive')
    elif neg_count > pos_count:
        print('Negative')
    else:
        print('Neutral')
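The per-opinion loop above can be folded into a small helper; a minimal sketch of the same lexicon-count scoring, assuming the pos, neg, and tokenizer objects from the example:

def score_opinion(opinion, tokenizer, pos, neg):
    # Tokenize with the custom-dictionary tokenizer, then count lexicon hits.
    tokens = tokenizer.word_tokenize(opinion)
    pos_count = sum(1 for w in tokens if w in pos)
    neg_count = sum(1 for w in tokens if w in neg)
    if pos_count > neg_count:
        return 'Positive'
    if neg_count > pos_count:
        return 'Negative'
    return 'Neutral'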
Example #8
 def test_Tokenizer(self):
     t_test = Tokenizer()
     self.assertEqual(t_test.word_tokenize(""), [])
Example #9
 def test_Tokenizer(self):
     t_test = Tokenizer(FROZEN_DICT_TRIE)
     self.assertEqual(t_test.word_tokenize(""), [])
     t_test.set_tokenize_engine("longest")
     self.assertEqual(t_test.word_tokenize(None), [])