import gc
import json

import matplotlib.pyplot as plt
from flask import request, send_file
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS, WordCloud


def test():
    # Read the user's input.
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]

    # These are the words that must not show up in the wordcloud.
    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    # (The original called a bare map() here, which is lazy and discarded
    #  its result; the lower-casing is actually applied now.)
    stop_words = [w.lower() for w in stop_words]

    # Add words that are missing from the Thai/English dictionary so they are
    # kept whole: e.g. the input "ลุงตู่" tokenizes as "ลุง", "ตู่", but after
    # adding it to the dictionary the output is "ลุงตู่".
    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict

    # Set up the tokenizer and tokenize the text.
    tok = Tokenizer(dictionary)
    text = ' '.join(tok.word_tokenize(text)).lower()

    # Generate the wordcloud.
    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)

    # Lay out the wordcloud.
    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")

    # Save the image on the server and free memory.
    wordcloud.to_file('wordcloud.png')
    gc.collect()
    return send_file('wordcloud.png')
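# For reference, a client call for the endpoint above might look like this
# sketch. Only the JSON keys come from the snippet; the host, port and route
# are hypothetical, since the @app.route decorator is not shown.
import requests

resp = requests.post(
    "http://localhost:5000/test",  # assumed URL
    json={
        "text": "ลุงตู่ประกาศมาตรการใหม่วันนี้",  # raw text to visualize
        "custom_stopwords": ["วันนี้"],           # extra words to hide
        "custom_dict": ["ลุงตู่"],                # extra dictionary entries
    },
)
with open("wordcloud.png", "wb") as f:
    f.write(resp.content)  # the endpoint responds with the rendered PNG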
def gen_worldcloud():
    # Get text, custom_stopwords and custom_dict.
    body = json.loads(request.get_data())
    text = body['text']
    custom_stopwords = body['custom_stopwords'] if 'custom_stopwords' in body else []
    custom_dict = body['custom_dict'] if 'custom_dict' in body else []

    # Add words that are missing from the Thai/English dictionary so they are
    # kept whole: e.g. the input "ลุงตู่" tokenizes as "ลุง", "ตู่", but after
    # adding it to the dictionary the output is "ลุงตู่".
    stop_words = DEFAULT_STOPWORLS + custom_stopwords
    dictionary = DEFAULT_DICT + custom_dict

    # Word preparing.
    stop_words = word_preparing(stop_words)
    dictionary = word_preparing(dictionary)

    # Tokenize / segment the text into words.
    tok = Tokenizer(dictionary)
    tokens = tok.word_tokenize(text)
    text = ' '.join(tokens)  # convert tokens back to a string
    text = text.lower()      # lower-case

    # Generate the wordcloud.
    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)

    # Save the image and free memory.
    wordcloud.to_file(IMAGE_FILE)
    gc.collect()

    # Respond with the image.
    return send_file(IMAGE_FILE)
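# The names referenced above (DEFAULT_STOPWORLS, DEFAULT_DICT, IMAGE_FILE,
# word_preparing) are defined elsewhere in the source. A minimal sketch,
# assuming they mirror the behaviour of the first version of this endpoint
# (the exact definitions are not shown):
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS

IMAGE_FILE = 'wordcloud.png'
DEFAULT_STOPWORLS = list(thai_stopwords()) + list(STOPWORDS)  # spelling kept from the source
DEFAULT_DICT = list(thai_words())

def word_preparing(words):
    # Lower-case every entry and drop empty strings.
    return [w.lower() for w in words if w]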
def ProcessText(self, text):
    # Build a custom dictionary from the database's street, address and
    # district names, plus the default Thai words and this object's
    # special-word list.
    dataBase = database()
    streetDf, addressDf = dataBase.ReadStreetName()
    streetList = dataBase.DataframeToList(streetDf)
    addressList = dataBase.DataframeToList(addressDf)
    districtList = dataBase.districtName_
    wordList = districtList + streetList + addressList

    custom_words_list = set(thai_words())
    custom_words_list.update(wordList)
    custom_words_list.update(self.specWord)

    trie = dict_trie(dict_source=custom_words_list)
    custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)
    proc = custom_tokenizer.word_tokenize(text)

    # Strip punctuation and digits from each token. (The original built the
    # cleaned list but then filtered the raw tokens in `proc`, discarding the
    # cleaning; the cleaned list is used here.)
    strip_table = str.maketrans('', '', string.punctuation + '1234567890')
    cleanList = [i.translate(strip_table) for i in proc]

    # Drop whitespace-only and empty tokens.
    procText = [x for x in cleanList if x not in (" ", "")]
    # procText = [x for x in procText if len(x) > 2]

    joinText = ' '.join(procText)
    # print(joinText)
    return joinText
from pythainlp import word_tokenize, Tokenizer

text = "กฎหมายแรงงานฉบับปรับปรุงใหม่ประกาศใช้แล้ว"

'''
Options for engine
    newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
    longest - dictionary-based, Longest Matching
    deepcut - wrapper for deepcut, language-model-based
    icu - wrapper for ICU (International Components for Unicode, using PyICU), dictionary-based
    ulmfit - for thai2fit
see more: https://thainlp.org/pythainlp/docs/2.0/api/tokenize.html
'''

print("newmm:", word_tokenize(text))  # default engine is "newmm"
print("longest:", word_tokenize(text, engine="longest"))

words = ["กฎ", "งาน"]
custom_tokenizer = Tokenizer(words)
print("custom:", custom_tokenizer.word_tokenize(text))
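# Passing a plain word list to Tokenizer, as above, replaces the default
# dictionary entirely. To extend the default dictionary instead, merge custom
# words into thai_words() and build a trie — the pattern several snippets
# below use. A minimal sketch; the example words are illustrative:
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words
from pythainlp.util import dict_trie

custom_words = set(thai_words())         # start from the default dictionary
custom_words.update(["ลุงตู่", "โควิด"])   # add out-of-vocabulary words
trie = dict_trie(dict_source=custom_words)
extended_tokenizer = Tokenizer(custom_dict=trie, engine="newmm")
print("extended:", extended_tokenizer.word_tokenize("ลุงตู่แถลงเรื่องโควิด"))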
from flask import Flask, request, send_file, after_this_request, render_template, redirect
import numpy as np
import random
import matplotlib
from pythainlp import Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
from wordcloud import STOPWORDS

stop_words = list(thai_stopwords()) + list(STOPWORDS) + \
    ["฿", "ly", "pic", "co", "th", "https", "com", "youtu",
     "http", "www", "twitter", "html", "bit"]
# (The original called a bare map() here, which is lazy and discarded its
#  result; the lower-casing is actually applied now.)
stop_words = [w.lower() for w in stop_words]

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)
tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)

    @app.route('/', methods=['GET'])
    def upload_file():
        return render_template('upload_text_redirect.html')

    @app.route("/test", methods=['POST'])
    def test():
        texts = request.form['texts']
        texts = tok.word_tokenize(texts)
        text2 = ' '.join(texts)
        text2 = text2.lower()
        # (the rest of the handler is truncated in the original snippet)
"ฟอกฟัน", "ฟอกสีฟัน", "ขูดหินปูน", "แอร์โฟลว์", "แอร์โฟล", "แอร์โฟ", "แอร์โฟร์", "โรคเหงือก", "เลเซอร์เหงือก", "เลเซอร์", "เลเซอเหงือก", "เลเสอเหงือก", "เลเสอร์เเหงือก", "เลเซอร์เหงือกชมพู", "เลเซอเหงือกชมพู", "เลเสอเหงือกชมพูู", "เลเสอร์เเหงือกชมพู", "ศัลยกรรมเหงือก", "ตกแต่งเหงือก", "ศัลย์เหงือก", "ผ่าเหงือก", "ตัดเหงือก", "ตกแต่ง", "ศัลยกรรม", "รักษารากฟัน", "อุดฟัน", "ถอนฟัน", "ผ่าฟันคุด", "ฟันคุด", "ถอนฟันคุด" ] custom_words_list = set(thai_words()) ## add multiple words custom_words_list.update(words) ## add word trie = dict_trie(dict_source=custom_words_list) custom_tokenizer = Tokenizer(custom_dict=trie, engine='newmm') # loop through each sentence in our intents patterns for intent in intents['intents']: tag = intent['tag'] # add to tag list tags.append(tag) for pattern in intent['patterns']: # tokenize each word in the sentence # w = tokenize(pattern) w = custom_tokenizer.word_tokenize(pattern) # add to our words list all_words.extend(w)
# Assumed imports for this snippet (inferred from usage):
from numpy import array
from pandas import read_csv
from pythainlp import Tokenizer
from pythainlp.corpus import thai_words


class Keyword():  # (class header truncated in the original; name inferred from its use below)
    def __init__(self, word, weight):
        self.word = word      # Type: String
        self.weight = weight  # Type: Integer


class TreeNode():
    def __init__(self, data):
        self.data = data        # Type: <Dynamic>
        self.children = list()  # Type: List<Dynamic>


# Extend the default Thai dictionary with custom words from a file.
THAI_WORDS = set(thai_words())
for i in open('requirement/data/custom_tokenizer.txt', encoding="utf-8"):
    THAI_WORDS.add(i.replace('\n', '').strip())
TOKENIZER = Tokenizer(THAI_WORDS)

# Each CSV is read positionally: column 0 is the word, column 1 its weight.
KEYWORDS_HIGH_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-high.csv')).tolist()
]
KEYWORDS_MEDIUM_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-medium.csv')).tolist()
]
KEYWORDS_LOW_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-low.csv')).tolist()
]
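# Hypothetical usage sketch (this score() helper is not in the source): sum
# the weights of known keywords that appear in a tokenized sentence.
def score(text, keywords):
    tokens = set(TOKENIZER.word_tokenize(text))
    return sum(k.weight for k in keywords if k.word in tokens)

# e.g. score(some_text, KEYWORDS_HIGH_PRIORITY)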
# Assumed imports for this snippet (aliases inferred from usage):
from nltk.corpus import stopwords as engsw
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize as eng_tokens
from pythainlp import Tokenizer
from pythainlp.corpus import stopwords as thaisw
from pythainlp.corpus import thai_words


def Processing(E1):
    p_stemmer = PorterStemmer()

    # Thai and English stopwords, plus project-specific extras.
    ThaiWord = list(thaisw.words('thai'))
    EngWord = list(set(engsw.words('english')))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan',
        u' ', u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    All_Stop_Word = ThaiWord + EngWord + Morewords

    # The first element of each row is the text entry.
    EntryList = [n[0] for n in E1]

    # Build the Thai tokenizer once, with a few extra dictionary words.
    # (The original rebuilt it for every entry inside the loop.)
    words = set(thai_words())
    words.update([u'ไทยเบฟ', u'ผสานพลัง', u'โอกาส', u'ถังไม้โอ๊ค'])
    custom_tokenizer = Tokenizer(words)

    # Tokenize: English word boundaries first, then Thai dictionary words.
    Outcome = []
    for r in EntryList:
        lowered = " ".join(t.lower() for t in eng_tokens(r))
        Outcome.append(list(custom_tokenizer.word_tokenize(lowered)))

    # Remove stopwords.
    NoStop = [[word for word in n if word not in All_Stop_Word]
              for n in Outcome]
    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    # Stem the tokens (affects English tokens only).
    Lemma = [[p_stemmer.stem(word) for word in n] for n in NoStop]
    print(' Lemma : ', Lemma, ' len: ', len(Lemma))

    '''
    # Alternative kept from the original: lemmatize with WordNet instead
    # of stemming.
    wordnet_lemmatizer = WordNetLemmatizer()
    Lemma = [[wordnet_lemmatizer.lemmatize(t) for t in n] for n in NoStop]
    '''

    # Map each token to its Thai WordNet lemma when one exists.
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        for i in n:
            w_syn = wordnet.synsets(i)
            if len(w_syn) > 0 and len(w_syn[0].lemma_names('tha')) > 0:
                Dummy.append(w_syn[0].lemma_names('tha')[0])
            else:
                Dummy.append(i)
        Lemma_temp.append(Dummy)
    Lemma = Lemma_temp

    # Drop numeric tokens and tokens containing spaces.
    Lemma = [[i for i in n if not i.isnumeric()] for n in Lemma]
    Lemma = [[i for i in n if ' ' not in i] for n in Lemma]

    return Lemma
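# Illustrative call (hypothetical data; the expected input shape — an iterable
# of rows whose first element is the text — is implied by `n[0]` above):
rows = [(u'ไทยเบฟผสานพลังสร้างโอกาสในการทำงาน',)]
lemmas = Processing(rows)
print(lemmas)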
# (Start of this snippet is truncated in the original: `with open(...)` blocks
#  that initialize and fill positive_vocab precede this line.)
        positive_vocab.append(line.rstrip())

with open("swear-words.txt", 'r', encoding="utf8") as f:
    for line in f:
        swear_words.append(line.rstrip())

with open("neu.txt", 'r', encoding="utf8") as f:
    for line in f:
        neutral_vocab.append(line.rstrip())

sentences = input("insert thai text ")

# Keep "ลุงตู่" whole by adding it to the default dictionary before tokenizing.
pythainlp_words = thai_words()
custom_dict = ["ลุงตู่"]
dictionary = list(pythainlp_words) + custom_dict
tok = Tokenizer(dictionary)

tokens = tok.word_tokenize(sentences)
tokens = [' '.join(tokens)]

for sentence in tokens:
    pos = 0
    neu = 0
    neg = 0
    pred = []
    print(sentence)
    words = sentence.split(' ')
    for word in words:
        if word in positive_vocab:
            pos += 1  # (continuation inferred from the counters above;
                      #  the original snippet is truncated here)