Example #1
# -*- coding: utf-8 -*-
from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie

# Build a custom-dictionary trie from a newline-delimited word list file
data = create_custom_dict_trie("wordlist.txt")
while True:
    text = input("text : ")
    # Tokenize against the custom dictionary with the newmm engine
    print(dict_word_tokenize(text, custom_dict_trie=data, engine="newmm"))
    print("\r\n")
Example #2
# -*- coding: utf-8 -*-
import sqlite3

# Load the custom word list from an SQLite database
connection = sqlite3.connect('db.sqlite3')
cursor = connection.execute('select word from word')
wordlist = [i[0] for i in cursor.fetchall()]
# print('\n'.join(wordlist))
print("จำนวนคำ : " + str(len(wordlist)))  # "จำนวนคำ" = word count
connection.close()

from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie, word_tokenize

dictthai = create_custom_dict_trie(wordlist)
while True:
    text = input("text : ")
    if text == "exit":
        break
    # Compare the custom-dictionary result with the default tokenizer
    print("ผลจาก dict : \t" + '|'.join(dict_word_tokenize(text, dictthai)))  # "ผลจาก dict" = result from custom dict
    print("ผลจาก PyThaiNLP : \t" + '|'.join(word_tokenize(text)))  # "ผลจาก PyThaiNLP" = result from default PyThaiNLP
Example #3
# (snippet truncated above: these lines close a triple-quoted string of Thai
# connectives; imports such as codecs, normalize and punctuation live in the
# missing part of the file)
อนึ่งคือว่า
อีกประการหนึ่ง
อีก
อย่างหนึ่ง""".split("\n")  # page 64, http://www.arts.chula.ac.th/~ling/thesis/2556MA-LING-Nalinee.pdf
with codecs.open("corpus.txt", 'r', encoding='utf8') as f:
    lines1 = list(set(normalize(f.read()).splitlines()))
test = True  # False  # toggle the test on/off
with codecs.open("thai.txt", "r", encoding="utf8") as f:
    lines2 = f.read().splitlines()
# Alternative: load the bundled Thai word list instead of thai.txt
# from pythainlp.corpus.thaiword import get_data
# lines2 = get_data()
data_all = []
# Trie over the union of the word lists (ccc, stopwords and conjunctions are
# defined earlier in the truncated source)
thaiword = create_custom_dict_trie(list(set(ccc + lines2 + stopwords + conjunctions)))
print("จำนวนประโยค : " + str(len(lines1)))  # "จำนวนประโยค" = sentence count
for lines in lines1:
    text = dict_word_tokenize(lines, thaiword)
    # text = word_tokenize(lines, thai_tokenize)
    data_all.append(text)
sents = data_all
tokens = []
boundaries = set()
offset = 0

def check_punctuation(text):
    # True if text contains any punctuation character
    for i in text:
        if i in punctuation:
            return True
    return False
def num_there(s):
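Example #3 breaks off at num_there. Judging by the name and the neighbouring check_punctuation, it presumably reports whether a string contains a digit; a common implementation with that behavior (an assumption, not the original body) is:

def num_there(s):
    # Assumed behavior: True if any character of s is a digit
    return any(ch.isdigit() for ch in s)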
Example #4
[email protected]
"""
import re
import logging
from pythainlp.tag import pos_tag
from pythainlp.ner import thainer
from pythainlp.tokenize import word_tokenize, dict_word_tokenize, create_custom_dict_trie
from .tokenizer import Tokens, Tokenizer
import os
import copy
from . import DEFAULTS

# Switch to this file's directory so relative resource paths resolve
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
# Trie built from the Thai word corpus configured in DEFAULTS
data_dict = create_custom_dict_trie(DEFAULTS['word_corpus_th'])

logger = logging.getLogger(__name__)


class NewmmTokenizer(Tokenizer):
    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """

        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
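Example #4 is cut off inside __init__. As a hedged sketch, not the original code: a tokenize method in this class would typically segment the input with the module-level data_dict trie and wrap the result in the Tokens container imported above. Only the dict_word_tokenize call mirrors the imports actually shown; everything else here is an assumption.

    # Sketch under the assumptions above; not the original method.
    def tokenize(self, text):
        # Segment with the custom dictionary trie using the newmm engine
        words = dict_word_tokenize(text, custom_dict_trie=data_dict, engine="newmm")
        # The real class presumably builds a Tokens object from `words`;
        # that construction is omitted because its signature is not shown.
        return words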