Example #1
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertEqual(get_corpus_db_detail("XXX"),
                         {})  # corpus does not exist
        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL does not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name does not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertTrue(download(name="test", version="0.1"))
        self.assertTrue(remove("test"))
Example #2
import scipy.sparse as sp  # assumed alias for the `sp` used below
from pythainlp.corpus import thai_stopwords


def text_to_bow_stopword(tokenized_text, vocabulary_):
    """Convert a list of tokenized documents into a sparse bag-of-words
    document-term matrix, ignoring Thai stopwords."""
    n_doc = len(tokenized_text)
    values, row_indices, col_indices = [], [], []
    stop_words = set(thai_stopwords())
    for r, tokens in enumerate(tokenized_text):
        filtered_sentence = [w for w in tokens if w not in stop_words]
        # count occurrences of each in-vocabulary token in this document
        feature = {}
        for token in filtered_sentence:
            word_index = vocabulary_.get(token)
            if word_index is not None:
                feature[word_index] = feature.get(word_index, 0) + 1
        for c, v in feature.items():
            values.append(v)
            row_indices.append(r)
            col_indices.append(c)

    # document-term matrix in sparse CSR format
    X = sp.csr_matrix((values, (row_indices, col_indices)),
                      shape=(n_doc, len(vocabulary_)))
    return X
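
A minimal usage sketch with hypothetical documents (assumes PyThaiNLP's word_tokenize for tokenization and the imports above):

from pythainlp.tokenize import word_tokenize

docs = ["ฉันชอบกินข้าวผัด", "เขาไม่ชอบกินข้าว"]  # hypothetical documents
tokenized = [word_tokenize(d) for d in docs]
vocab = {w: i for i, w in enumerate(sorted({t for doc in tokenized for t in doc}))}
X = text_to_bow_stopword(tokenized, vocab)
print(X.shape)      # (2, len(vocab))
print(X.toarray())  # dense view of the stopword-filtered bag-of-words counts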
Example #3
    def test_corpus(self):
        self.assertIsInstance(thai_negations(), frozenset)
        self.assertIsInstance(thai_stopwords(), frozenset)
        self.assertIsInstance(thai_syllables(), frozenset)
        self.assertIsInstance(thai_words(), frozenset)

        self.assertIsInstance(countries(), frozenset)
        self.assertIsInstance(provinces(), frozenset)
        self.assertIsInstance(provinces(details=True), list)
        self.assertEqual(len(provinces(details=False)),
                         len(provinces(details=True)))
        self.assertIsInstance(thai_family_names(), frozenset)
        self.assertIsInstance(list(thai_family_names())[0], str)
        self.assertIsInstance(thai_female_names(), frozenset)
        self.assertIsInstance(thai_male_names(), frozenset)

        self.assertIsInstance(
            get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
            Response,
        )  # URL does not exist, should get 404 response
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # Invalid URL

        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"),
                         {})  # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX", version="0.2"),
                         {})  # corpus does not exist

        self.assertTrue(download("test"))  # download the first time
        self.assertTrue(download(name="test", force=True))  # force download
        self.assertTrue(download(name="test"))  # try download existing
        self.assertFalse(download(name="test",
                                  url="wrongurl"))  # URL does not exist
        self.assertFalse(
            download(name="XxxXXxxx817d37sf"))  # corpus name does not exist
        self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
        self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
        self.assertTrue(remove("test"))  # remove existing
        self.assertFalse(remove("test"))  # remove non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query non-existing
        self.assertFalse(download(name="test", version="0.0"))
        self.assertFalse(download(name="test", version="0.0.0"))
        self.assertFalse(download(name="test", version="0.0.1"))
        self.assertFalse(download(name="test", version="0.0.2"))
        self.assertFalse(download(name="test", version="0.0.3"))
        self.assertFalse(download(name="test", version="0.0.4"))
        self.assertIsNotNone(download(name="test", version="0.0.5"))
        self.assertTrue(download("test"))
        self.assertIsNotNone(remove("test"))  # remove existing
        self.assertIsNotNone(download(name="test", version="0.0.6"))
        self.assertIsNotNone(download(name="test", version="0.0.7"))
        self.assertIsNotNone(download(name="test", version="0.0.8"))
        self.assertIsNotNone(download(name="test", version="0.0.9"))
        self.assertIsNotNone(download(name="test", version="0.0.10"))
        with self.assertRaises(Exception) as context:
            self.assertIsNotNone(download(name="test", version="0.0.11"))
        self.assertTrue(
            "Hash does not match expected." in str(context.exception))
        self.assertIsNotNone(download(name="test", version="0.1"))
        self.assertIsNotNone(remove("test"))
Example #4
from pythainlp.corpus import thai_stopwords


def is_stopword(word: str) -> bool:  # check whether the word is a stopword
    """
    Check whether a word is a stopword, using PyThaiNLP.

    Reference
    ----------
    PyThaiNLP, https://github.com/PyThaiNLP/pythainlp
    """
    return word in thai_stopwords()
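
A quick check, as a sketch, assuming the common Thai function word "ที่" is present in PyThaiNLP's stopword list:

print(is_stopword("ที่"))      # expected True for a common function word
print(is_stopword("ข้าวผัด"))  # a content word, expected False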
Example #5
    def test_corpus(self):
        self.assertIsNotNone(countries())
        self.assertIsNotNone(provinces())
        self.assertIsNotNone(thai_negations())
        self.assertIsNotNone(thai_stopwords())
        self.assertIsNotNone(thai_syllables())
        self.assertIsNotNone(thai_words())
        download("test")
        self.assertIsNotNone(remove("test"))
        self.assertIsNotNone(remove("tnc_freq"))
Example #6
def filter_words(text):
    text = text.replace('\n', ' ')
    text = text.replace(',', ' ')
    stop_words = set(thai_stopwords())
    tokens = word_tokenize(text, engine="newmm", keep_whitespace=False)
    filtered_text = []
    for w in tokens:
        if w not in stop_words:
            filtered_text.append(w)
    return filtered_text
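
A minimal usage sketch with an illustrative sentence (assumes common function words such as "และ" are in the stopword list):

print(filter_words("ผมไปเที่ยวทะเล,\nและกินอาหารทะเล"))
# commas and newlines are replaced with spaces before tokenizing,
# and stopwords such as "และ" are removed from the token list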
Example #7
    def test_corpus(self):
        self.assertIsNotNone(countries())
        self.assertIsNotNone(provinces())
        self.assertIsNotNone(thai_negations())
        self.assertIsNotNone(thai_stopwords())
        self.assertIsNotNone(thai_syllables())
        self.assertIsNotNone(thai_words())
        download("test")
        self.assertIsNotNone(remove("test"))
        self.assertIsNotNone(remove("tnc_freq"))
Example #8
def test():
    body = json.loads(request.get_data())
    text = body['text']
    try:
        custom_stopwords = body['custom_stopwords']
    except KeyError:
        custom_stopwords = [""]
    try:
        custom_dict = body['custom_dict']
    except KeyError:
        custom_dict = [""]
    # read the input from the user

    stop_words = list(thai_stopwords()) + list(STOPWORDS) + custom_stopwords
    stop_words = [w.lower() for w in stop_words]
    # these are the words that must not show up in the wordcloud

    pythainlp_words = thai_words()
    dictionary = list(pythainlp_words) + custom_dict
    # add words missing from the Thai/English dictionary so they stay whole; e.g. the input "ลุงตู่" would otherwise be split into "ลุง", "ตู่", but with the custom dict the output is "ลุงตู่"

    tok = Tokenizer(dictionary)
    # create a Tokenizer with the combined dictionary

    text = tok.word_tokenize(text)
    text = ' '.join(text)
    text = text.lower()
    # tokenize the text

    wordcloud = WordCloud(stopwords=stop_words,
                          font_path='THSarabunNew.ttf',
                          min_word_length=2,
                          relative_scaling=1.0,
                          min_font_size=1,
                          background_color="black",
                          width=800,
                          height=600,
                          scale=10,
                          font_step=1,
                          collocations=False,
                          colormap="gist_ncar",
                          regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",
                          margin=2).generate(text)
    # generate the wordcloud

    plt.figure(figsize=(16, 9))
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation='bilinear')
    plt.axis("off")
    # lay out the wordcloud with matplotlib

    wordcloud.to_file('wordcloud.png')
    gc.collect()
    # save the image on the server and free memory

    return send_file('wordcloud.png')
Example #9
    def test_corpus(self):
        self.assertIsNotNone(countries())
        self.assertIsNotNone(provinces())
        self.assertIsNotNone(thai_negations())
        self.assertIsNotNone(thai_stopwords())
        self.assertIsNotNone(thai_syllables())
        self.assertIsNotNone(thai_words())
        self.assertIsNotNone(thai_female_names())
        self.assertIsNotNone(thai_male_names())
        self.assertEqual(get_corpus_db_detail("XXX"), {})
        self.assertIsNone(download("test"))
        self.assertIsNone(download("test", force=True))
        self.assertIsNotNone(get_corpus_db_detail("test"))
        self.assertIsNotNone(remove("test"))
        self.assertFalse(remove("test"))
Example #10
    def word_freq_nostop(self, topn=30):
        count = collections.Counter()
        for line in self.lines:
            tokens = line.split(' ')[1:]  # exclude the document id
            for token in tokens:
                if token != ' ' and token != '' and token not in corpus.thai_stopwords() and token not in ['(', ')']:
                    count[token] += 1

        # print top-n rows: rank, word, token count, tokens per 10K
        print(sum(count.values()))
        most = count.most_common(topn)
        for i in range(topn):
            # 1734185 appears to be the hard-coded total token count of the corpus
            print('| {} | {} | {} | {:.3f} |'.format(i + 1, most[i][0], most[i][1], most[i][1] / 1734185 * 10000))

        return count
Example #11
def trainModel(training_data):
    import nltk.classify
    from sklearn.svm import LinearSVC
    from pythainlp.tokenize import word_tokenize
    from pythainlp.corpus import thai_stopwords
    from itertools import chain
    print('split word ...')
    vocabulary = set(
        chain(*[(set(word_tokenize(i[0])) - set(thai_stopwords()))
                for i in training_data]))
    #vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
    print('extract features ...')
    feature_set = [({i: (i in word_tokenize(sentence))
                     for i in vocabulary}, tag)
                   for sentence, tag in training_data]
    print('train model ...')
    classifier = nltk.classify.SklearnClassifier(LinearSVC())
    classifier.train(feature_set)
    saveModel(vocabulary, classifier)
Example #12
    def make_tfidf(self, stop=False):
        self.tf_dic = {}
        self.idf_dic = {}
        for line in self.lines:
            if len(line.split()) > 1:
                document = line.split()[0]
                tokens = line.split()[1:]
                if document not in self.tf_dic:
                    self.tf_dic[document] = {}
                for token in tokens:
                    # tf
                    if not (stop and token in corpus.thai_stopwords()):
                        self.tf_dic[document][token] = self.tf_dic[document].get(token, 0) + 1

                    # idf
                    if token not in self.idf_dic:
                        self.idf_dic[token] = {document}  # start a set of documents containing this token
                    else:
                        self.idf_dic[token].add(document)
        self.N = len(self.tf_dic.keys())
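
make_tfidf only collects raw term counts and document sets; a possible follow-up, shown here as a hypothetical helper method using the standard idf formula, could be:

    def idf(self, token):
        """idf(token) = log(N / number of documents containing the token)."""
        import math
        return math.log(self.N / len(self.idf_dic[token]))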
Example #13
def trainModel(training_data):
    import logging
    logging.getLogger().setLevel(logging.INFO)
    import nltk.classify
    from sklearn.svm import LinearSVC
    from pythainlp.tokenize import word_tokenize
    from pythainlp.corpus import thai_stopwords
    from itertools import chain
    logging.warning(' ============ TRAINING MODEL ============')
    logging.info('split word ...')
    vocabulary = set(
        chain(*[(set(word_tokenize(i[0])) - set(thai_stopwords()))
                for i in training_data]))
    #vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))
    logging.info('extract features ...')
    feature_set = [({i: (i in word_tokenize(sentence))
                     for i in vocabulary}, tag)
                   for sentence, tag in training_data]
    classifier = nltk.classify.SklearnClassifier(LinearSVC())
    classifier.train(feature_set)
    saveModel(vocabulary, classifier)
Example #14
def split_word(text):
    """
    Split text into tokens and remove stopwords.
    """
    # Remove special characters
    pattern = re.compile(r"[^\u0E00-\u0E7Fa-zA-Z' ]|^'|'$|''")
    remove_char = re.findall(pattern, text)
    list_with_removed_char = [char for char in text if char not in remove_char]
    train_text = ''.join(list_with_removed_char)

    # Split words using the maximum matching ("newmm") algorithm
    tokens = word_tokenize(train_text, engine='newmm')
    # Remove Thai and English stopwords
    stopped_tokens = [i for i in tokens if i not in thai_stopwords() and i not in get_stop_words('en')]
    # Optional word stemming (English):
    # stemmed_tokens = [PorterStemmer().stem(i) for i in stopped_tokens]
    # Remove whitespace and other leftover artifacts
    deletelist = [' ', '  ', '   ', '    ', '\n', '\xa0', '\x0c', "'", 'cid']
    tokens = [i for i in stopped_tokens if i not in deletelist]

    return tokens
Example #15
    def __init__(self,
                 df,
                 n_docs=1000,
                 stoplist=thai_stopwords(),
                 smooth_idf=True,
                 sentence_tokenize=pythai.tokenize.sent_tokenize,
                 word_tokenize=pythai.tokenize.word_tokenize,
                 preprocessor=custom_preprocess):
        texts = df['body_text']
        self.sentence_tokenize = sentence_tokenize
        self.word_tokenize = word_tokenize
        self.n_docs = n_docs
        self.stoplist = stoplist
        self.smooth_idf = smooth_idf
        self.preprocessor = preprocessor

        count_vect = self.get_new_countvect()
        self.docs_word_freq = count_vect.fit_transform(
            texts).toarray()  # (docs, words)
        self.docs_word_freq = np.where(self.docs_word_freq > 0, 1, 0)
        self.docs_word_freq = np.sum(self.docs_word_freq, axis=0)
        # self.docs_word_freq /= np.sum(self.docs_word_freq)
        self.docs_vocab = count_vect.vocabulary_
Example #16
from flask import Flask, request, json
import pandas as pd
from gensim.models import Word2Vec
from pythainlp.tokenize import word_tokenize
import pythainlp.corpus as st
import numpy as np
import pymongo
from bson.objectid import ObjectId

client = pymongo.MongoClient(
    "mongodb+srv://tum123456:[email protected]/<dbname>?retryWrites=true&w=majority"
)
db = client.test
mydb = db["MyProject"]

words = st.thai_stopwords()
data = pd.read_excel("Book1.xlsx", index_col=0)
word_not_important = [
    'หา', 'รับ', 'งาน', 'ทำหน้าที่', 'หน้าที่', 'ซ่อม', '(', ')', '/'
]
app = Flask(__name__)


def lower(x):
    return x.lower()


def concerned_sentence(search, allsentence):
    arr = []
    for i in search:
        for j in allsentence:
Example #17
    # remove punctuation marks
    for c in string.punctuation:
        msg = re.sub(r'\{}'.format(c), '', msg)

    # remove separators such as \n and \t (collapse whitespace)
    msg = ' '.join(msg.split())

    return msg


clean_text = [clean_msg(txt) for txt in text_tweets]

# load Thai and English stopwords
nltk.download('words')
th_stop = tuple(thai_stopwords())
en_stop = tuple(get_stop_words('en'))
p_stemmer = PorterStemmer()


# tokenization
def split_word(text):
    # tokenize using the custom dict in the corpus that I edited; it only segments the food-menu items added to words.th.txt
    tokens = word_tokenize(text, engine='dict')

    # remove Thai and English stop words
    tokens = [i for i in tokens if i not in th_stop and i not in en_stop]

    # find word stems for Thai and English
    # English
    tokens = [p_stemmer.stem(i) for i in tokens]
Example #18
# -*- coding: utf-8 -*-
from pythainlp.tokenize import word_tokenize, dict_trie
from pythainlp.corpus import thai_stopwords, thai_words, tnc
from pythainlp.util import normalize
import data
stopwords = list(thai_stopwords())
thaiword = list(thai_words())
#tnc1=[word for word,i in tnc.word_freqs()]
thaiword.remove("กินข้าว")
datadict = dict_trie(
    list(set(data.ccc + thaiword + stopwords + data.conjunctions)))  #+tnc1)))


def wordcut(word):
    global datadict
    return word_tokenize(word, custom_dict=datadict)
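
A brief usage sketch with an illustrative input; the exact segmentation depends on the combined dictionary built above:

print(wordcut("ฉันชอบกินข้าว"))
# e.g. ['ฉัน', 'ชอบ', 'กิน', 'ข้าว'], since "กินข้าว" was removed from the dictionary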
Example #19
from preprocess import normalize_thai_number, normalize_number, remove_markup_tag, normalize_link, normalize_mention 
from preprocess import normalize_email, normalize_laugh, unescape_html, normalize_emoji, extract_hashtag
from preprocess import normalize_hashtag, replace_with_actual_hashtag, tokenize, _return_token

import pickle
from pythainlp.corpus import thai_stopwords
from string import punctuation

stopwords = thai_stopwords()
punctuation += '“” ️'

filename = 'models/tfidf_lr.pkl'
pipeline = pickle.load(open(filename, 'rb'))

def transform(text):
    text = text.lower()
    text = normalize_thai_number(text)
    text = unescape_html(text)
    text = remove_markup_tag(text)
    text = normalize_link(text)
    text = normalize_mention(text)
    text = normalize_email(text)
    text = normalize_laugh(text)
    text = normalize_number(text, place_holder='')
    text = normalize_emoji(text)
    hashtags = extract_hashtag(text)
    text = normalize_hashtag(text, place_holder='')
    tokens = tokenize(text, stopwords=None, punctuation=punctuation)
    tokens = replace_with_actual_hashtag(tokens, hashtags)
    
    return tokens
Example #20
# Disable Warning
warnings.filterwarnings('ignore')
# List of test data
Model_Test = []
# Dictionary of class data
Class_Properties = {}
# Chrome setting (toggle between headless and visible mode)
Chrome_options = True
# When the program finishes, export JSON
Export_JSON = True
# Scan for qualifying classes
Qualify = True
# Wait time (in seconds) after the target website has loaded
Delay = 2
# Stopword data (English and Thai)
data_stopwords = list(stopwords.words('english')) + list(thai_stopwords())
# Pattern Address
address = [
    'average', 'dept', 'size', 'weight', 'height', 'word', 'a', 'div', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'label', 'li', 'p', 'section', 'span',
    'strong', 'td', 'tr', 'ul'
]
stringContent = ""

# Create api with fastapi
app = FastAPI()


# Navigation
@app.get("/")
async def main():
Example #21
def _is_stopword(word: str) -> bool:  # check thai stopword
    return word in thai_stopwords()
Example #22
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file
from pythainlp.corpus import thai_stopwords, thai_words
import json
import gc
import os


def word_preparing(words):
    '''Make the word list unique and lower-case each word.'''
    words = set(words)  # drop duplicate words
    return [word.lower() for word in words]  # to lower case


# get dictionary and stopword corpus
DEFAULT_DICT = list(thai_words())
DEFAULT_STOPWORDS = list(thai_stopwords()) + list(STOPWORDS)

# word preparing
DEFAULT_DICT = word_preparing(DEFAULT_DICT)
DEFAULT_STOPWORDS = word_preparing(DEFAULT_STOPWORDS)

IMAGE_FILE = "wordcloud.png"  # filename for the saved wordcloud image

app = Flask(__name__)


@app.route("/wordcloud", methods=["POST"])
def gen_wordcloud():
    # get text, custom_stopwords and custom_dict
    body = json.loads(request.get_data())
    text = body['text']
Example #23
def remove_stopwords(sentence):
    words = list(
        filter(lambda word: word not in thai_stopwords(),
               word_tokenize(sentence)))
    return ''.join(words)
Example #24
    trim text & tokenize
    """
    text = html.unescape(text)
    text = re.sub(r'(\n|\t|\xa0)', ' ', text)
    text = re.sub(r'(\r|\u200b)', '', text)
    text = re.sub(r'\bhttps?://\S*\b', '', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'[\'\"‘’“”`\)\(]', '', text)
    return wt(text.strip(' '), keep_whitespace=False)


def cossim(v1, v2) -> float:
    return np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)


stopwords = corpus.thai_stopwords()


class NewsAnalyze:
    def __init__(self, path: str, publisher: str
                 ):  # publisher: thairath, matichon, dailynews, sanook, nhk
        self.publisher = publisher
        self.path = f'{path}/{publisher}/'
        self.tokenized = sorted(
            glob.glob(self.path + 'tokenized/*.tsv'))  # list of tokenized file

    def tokenize(self, n=5):
        jsons = set(glob.glob(self.path + '*.json'))  # all json files
        tokenized_txt = {
            f.replace('/tokenized/', '/').split('tokenized')[0] + '.json'
            for f in self.tokenized
Example #25
# -*- coding: utf-8 -*-
from collections import Counter
from typing import Dict, List

from pythainlp.corpus import thai_stopwords

_STOPWORDS = thai_stopwords()


def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:
    """
    Sort words by frequency

    :param list words: a list of words
    :param bool exclude_stopwords: exclude stopwords
    :return: Counter
    """
    if not words:
        return None

    if exclude_stopwords:
        words = [word for word in words if word not in _STOPWORDS]

    return Counter(words)
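
A quick usage sketch for rank with an illustrative token list (assumes "ผม" is included in PyThaiNLP's stopword list):

tokens = ["ผม", "ชอบ", "กิน", "ข้าว", "ข้าว"]
print(rank(tokens))                          # Counter with every token counted
print(rank(tokens, exclude_stopwords=True))  # stopwords such as "ผม" are dropped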


def find_keyword(word_list: List[str], min_len: int = 3) -> Dict[str, int]:
    """
    :param list word_list: a list of words
    :param int min_len: a minimum length of keywords to look for
    :return: dict
Example #26
def _is_stopword(word: str) -> bool:  # check whether the word is a stopword
    return word in thai_stopwords()
Example #27
        vecFromFirst = allCoord - firstPoint
        scalarProduct = np.sum(vecFromFirst *
                               np.matlib.repmat(lineVecNorm, nPoints, 1),
                               axis=1)
        vecFromFirstParallel = np.outer(scalarProduct, lineVecNorm)
        vecToLine = vecFromFirst - vecFromFirstParallel
        distToLine = np.sqrt(np.sum(vecToLine**2, axis=1))
        idxOfBestPoint = np.argmax(distToLine)
        print(f'Optimum number of cluster by Elbow method: {idxOfBestPoint}')
        return idxOfBestPoint
    except Exception:
        print('Cannot find the best k')


# function for preprocessing
word = thai_stopwords()


def clean_tag(text):
    text = re.sub('<.*?>', '', text)
    return text


def complete_clean(text: str):

    # Table for emoticon
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
Example #28
def _is_stopword(word: str) -> bool:  # check whether the word is a stopword
    return word in thai_stopwords()
Example #29
# -*- coding: utf-8 -*-
from collections import Counter
from typing import Dict, List

from pythainlp.corpus import thai_stopwords

_STOPWORDS = thai_stopwords()


def rank(words: List[str], exclude_stopwords: bool = False) -> Counter:
    """
    Count word frequency given a list of Thai words, with an option
    to exclude stopwords.

    :param list words: a list of words
    :param bool exclude_stopwords: if set to **True**, stopwords are
                                   excluded from counting; otherwise
                                   they are counted as well.
                                   Defaults to **False**.
    :return: a Counter object representing word frequency from the text
    :rtype: :class:`collections.Counter`

    :Example:

    Include stopwords in counting word frequency::

        from pythainlp.util import rank

        words = ["บันทึก", "เหตุการณ์", " ", "มี", "การ", "บันทึก", \\
        "เป็น", " ", "ลายลักษณ์อักษร"]
Example #30
import pandas
import time
from progressbar import progressbar
import gc
import codecs

from pythainlp import word_tokenize, Tokenizer
from pythainlp.corpus import thai_stopwords, thai_words
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from flask import Flask, request, send_file, after_this_request, render_template, redirect
import numpy as np
import random
import matplotlib

stop_words = list(thai_stopwords()) + list(STOPWORDS) +\
             ["฿","ly","pic","co","th","https","com","youtu","http","www","twitter","html","bit"]
stop_words = [w.lower() for w in stop_words]

pythainlp_words = thai_words()
custom_dict = [
    'โคโรนา', 'ลุงตู่', 'โควิด', 'โคโรน่า', 'เจลล้างมือ', 'ขบวนเสด็จ'
]
dictionary = list(pythainlp_words) + list(custom_dict)

tok = Tokenizer(dictionary)


class main_flask():
    app = Flask(__name__)
Example #31
from nltk import NaiveBayesClassifier as nbc
import pickle
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
import codecs
from itertools import chain

a = thai_stopwords()
# pos.txt
with codecs.open('pos.txt', 'r', "utf-8") as f:
    lines = f.readlines()
listpos = [e.strip() for e in lines]
del lines
f.close()  # close the file (already closed by the with block)
# neg.txt
with codecs.open('neg.txt', 'r', "utf-8") as f:
    lines = f.readlines()
listneg = [e.strip() for e in lines]
f.close()  # close the file (already closed by the with block)

pos1=['pos']*len(listpos)
neg1=['neg']*len(listneg)

training_data = list(zip(listpos,pos1)) + list(zip(listneg,neg1))

vocabulary = set(chain(*[(set(word_tokenize(i[0]))-set(thai_stopwords())) for i in training_data]))
#vocabulary = set(chain(*[x for x in a if x not in [list(set(word_tokenize(i[0]))) for i in training_data]]))

feature_set = [({i:(i in word_tokenize(sentence)) for i in vocabulary},tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)
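
A short usage sketch for the trained classifier (hypothetical input text; the feature dict is built the same way as in training):

test_sentence = "บริการดีมากเลยครับ"  # hypothetical review text
features = {i: (i in word_tokenize(test_sentence)) for i in vocabulary}
print(classifier.classify(features))  # 'pos' or 'neg'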
Example #32
from pythainlu.intent_classification import naive_bayes, MultinomialNB
from pythainlp.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

file = "../dataset/data-nottag.set"
import pandas as pd

colnames = ['text', 'tag']

from pythainlp.tokenize import word_tokenize, Trie
from pythainlp.corpus import thai_stopwords
# ...
#filtered_words = [word for word in word_list if word not in list(thai_stopwords())]
o = Trie(list(thai_stopwords()))


def filtered_words(x: tuple):
    #w = .lower()#word_tokenize(x[0],custom_dict=o)
    #ww = [word for word in w if word not in list(thai_stopwords())]
    return (x[0].lower().strip(), x[1])  #(''.join(ww),x[1])


user1 = pd.read_csv(file, names=colnames, header=None, sep="|")
data = [filtered_words(tuple(x)) for x in user1.to_records(index=False)]


def features(text):
    wordlist = word_tokenize(text)
    f = {}
    if "แจ้งเตือน" in text or 'เตือน' in text:
        f['a'] = True