Python thai_words Beispiele, pythainlp.corpus.common.thai_words Python Beispiele

Beispiel #1

0

Datei anzeigen

def createBOW(ls_txt, corpus):
    """Build one bag-of-words count vector per text in ``ls_txt``.

    Each text is tokenized with the ``dict`` engine against the
    ``thai_words()`` vocabulary extended by a handful of extra words.  A row
    holds one count per entry of ``corpus`` (in ``corpus`` order) followed by
    a trailing "other" slot: the number of tokens still left in the token
    list after the corpus pass.

    NOTE(review): a matched word is counted with ``count`` but only its first
    occurrence is removed, so repeated corpus words also contribute to the
    trailing "other" slot.  Kept as-is; confirm against whatever consumes
    these vectors before changing it.
    """
    vocabulary = set(thai_words())
    vocabulary.update(
        ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย'])
    trie = dict_trie(dict_source=vocabulary)

    rows = []
    for text in ls_txt:
        tokens = word_tokenize(text, engine='dict', custom_dict=trie)
        row = []
        for term in corpus:
            if term not in tokens:
                row.append(0)
                continue
            row.append(tokens.count(term))
            tokens.remove(term)  # drops the first occurrence only
        # Whatever is left over goes into the final "other" slot.
        row.append(len(tokens))
        rows.append(row)

    return rows

Beispiel #2

0

Datei anzeigen

def tokenize_text_list_test(ls):
    """Tokenize every text in ``ls`` with each engine and time the runs.

    Returns a pair ``(tokens_per_engine, seconds_per_engine)``.  The first
    list holds, for each engine, the flattened tokens of all texts; the
    second holds the matching ``time.process_time()`` deltas.  Every engine
    except ``deepcut`` is given the ``thai_words()`` trie as ``custom_dict``.
    """
    print("working on")
    engines = [
        'cfcut', 'deepcut', 'etcc', 'longest', 'multi_cut', 'newmm', 'ssg',
        'tcc', 'trie'
    ]
    trie = dict_trie(dict_source=set(thai_words()))

    outputs, timings = [], []
    for engine in engines:
        began = time.process_time()
        # deepcut is called without the custom dictionary.
        extra = {} if engine == 'deepcut' else {'custom_dict': trie}
        flat_tokens = []
        for text in ls:
            flat_tokens.extend(
                pythainlp.tokenize.word_tokenize(text, engine=engine, **extra))
        outputs.append(flat_tokens)
        timings.append(time.process_time() - began)
    return outputs, timings

Beispiel #3

0

Datei anzeigen

Datei: preprocessing.py Projekt: Kamin-At/Thai_NLP

    def __init__(
        self,
        max_len: '(int) max number of tokens per sample',
        min_len: '(int) min number of tokens per sample',
        min_len_character:
        '(int) min number of characters per token' = 1,  # recommend > 0 or 1 to automatically clear white spaces and errors from the tokenizer
        engine:
        '(str) engine used to tokenize sentences see: https://thainlp.org/pythainlp/docs/2.0/api/tokenize.html' = 'newmm',
        verbose:
        '(bool) if True print some comparisons of before and after processing texts' = False,
        rules_before_tokenization:
        '(Collection[function(str)]) Collection of functions taking sentence-level input string' = None,
        rules_after_tokenization:
        '(Collection[function(list[str])]) Collection of functions taking list of tokens' = None,
    ):
        """Store the preprocessing settings and build the tokenizer dictionary.

        The rule collections are optional and default to ``None``; their order
        matters, so define them carefully.  They are accepted as trailing
        parameters so that existing positional calls keep working.

        Additional words, unwanted words and stopwords are read from
        ``./word_configs/*.txt`` (one word per line, blank lines ignored).
        The tokenizer dictionary is ``thai_words()`` plus the additional
        words minus the unwanted words.

        Raises:
            OSError: if one of the word-config files cannot be opened.
        """

        def _read_word_file(path):
            # One word per line; surrounding whitespace and blank lines dropped.
            with open(path, 'r', encoding='utf8') as f:
                stripped = (line.strip() for line in f)
                return {word for word in stripped if word != ''}

        self.max_len = max_len
        self.min_len = min_len
        self.min_len_character = min_len_character
        self.rules_after_tokenization = rules_after_tokenization
        self.rules_before_tokenization = rules_before_tokenization
        self.tfxidf_obj = None  # to make it we need to call visualize_important_words() once
        self.engine = engine
        self.verbose = verbose

        # You can freely define additional words, unwanted words and stopwords
        # using 1 word per line in each corresponding file.
        additional_words = _read_word_file(
            './word_configs/additional_words.txt')
        unwanted_words = _read_word_file('./word_configs/unwanted_words.txt')
        self.stopwords = _read_word_file('./word_configs/stopwords.txt')

        self.dictionary = pythainlp.tokenize.dict_trie(
            set(thai_words()).union(additional_words).difference(
                unwanted_words))

Beispiel #4

0

Datei anzeigen

Datei: main_new.py Projekt: wwarodom/scrap_extract

def main():
    """Tokenize ``input.txt`` and write word frequencies to ``output-sort.csv``.

    The whole input is tokenized with a ``newmm`` tokenizer whose dictionary
    is ``thai_words()`` plus a few domain words.  Empty, whitespace-only,
    ``-``, ``/`` and numeric tokens are discarded.  The remaining words are
    counted with ``wordListToFreqDict`` and ordered with ``sortFreqDict``;
    each resulting entry ``s`` is printed and written as ``s[1]|s[0]``, one
    per line.
    """
    # Context managers close the files even if a later step fails.
    with codecs.open('input.txt', encoding='utf-8') as f:
        text = f.read()

    custom_words_list = set(thai_words())
    custom_words_list.update(['รีเทนเนอร์', 'จัดฟัน', 'ฟันชิด'])
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    words = _tokenizer.word_tokenize(text)
    kept = [
        word.strip() for word in words
        if word and not word.isspace() and word != '-' and word != '/'
        and not word.isnumeric()
    ]

    print('------ Starting to count words: ------')
    # Join + split (rather than using ``kept`` directly) so that tokens with
    # inner whitespace are broken up into separate words.
    wordlist = " ".join(kept).split()
    # The frequency table depends only on the full word list, so it is built
    # once; this also keeps ``sorteddict`` defined when the input is empty.
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)

    print('------ Starting to sort words and write to file ------')
    with open("output-sort.csv", "w", encoding="utf-8") as fsort:
        for s in sorteddict:
            print(s[1], "|", s[0])
            fsort.write(s[1] + "|" + str(s[0]) + '\n')

Beispiel #5

0

Datei anzeigen

def train():
    """Fit a decision tree on bag-of-words vectors of ``Data/Expenses.csv``.

    The ``text`` column is tokenized with the ``dict`` engine against
    ``thai_words()`` extended by a few extra words.  ``corpus`` is the list
    of distinct tokens in first-seen order.  Each training row holds one
    count per corpus word followed by a trailing slot with the number of
    tokens left in the token list after the corpus pass.  The ``cate``
    column is the target.

    Returns:
        ``(corpus, dtree)`` -- the vocabulary list and the fitted
        ``DecisionTreeClassifier``.
    """
    df = pd.read_csv("Data/Expenses.csv")

    vocabulary = set(thai_words())
    vocabulary.update(
        ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย'])
    trie = dict_trie(dict_source=vocabulary)

    # Distinct tokens, kept in the order they are first seen.
    corpus = []
    for text in df.text:
        for token in word_tokenize(text, engine='dict', custom_dict=trie):
            if token in corpus:
                continue
            corpus.append(token)

    rows = [[] for _ in range(len(df.text))]
    for row, text in zip(rows, df.text):
        tokens = word_tokenize(text, engine='dict', custom_dict=trie)
        for term in corpus:
            if term not in tokens:
                row.append(0)
                continue
            row.append(tokens.count(term))
            tokens.remove(term)  # drops the first occurrence only
        # Leftover tokens (repeats of already-counted words) fill the last slot.
        row.append(len(tokens))

    dtree = DecisionTreeClassifier()
    dtree.fit(X=rows, y=df.cate)

    return corpus, dtree

Beispiel #6

0

Datei anzeigen

Datei: NLP_4test.py Projekt: AndaChain/NewsTweet

    def __init__(self):
        """Load the Thai and English NLP resources used by this object.

        Reads the Thai stop-word file, loads the classifier and vocabulary
        through ``self.loadData("Modul")``, builds a trie from
        ``thai_words()``, loads the spaCy English model, and collects the
        spaCy and NLTK English stop-word lists.  A language detector is
        appended to the spaCy pipeline, and ``self.dict_vocab`` maps every
        vocabulary entry to ``False``.
        """
        # ---------------------- Thai NLP ----------------------------------
        # Stop words come from stopwords_th.txt, one entry per line.
        with codecs.open("dataset/stopwords_th.txt", "r") as stopword_file:
            self.stopwords_thai = [entry.strip() for entry in stopword_file]

        modul = self.loadData("Modul")
        self.classifier, self.vocabulary = modul[0], modul[1]

        # Thai words.  (An earlier, disabled variant also merged the lines
        # of dataset/thai_words.txt into the set before building the trie.)
        self.custom_tokenizer = dict_trie(thai_words())
        # ------------------------------------------------------------------

        # ---------------------- English NLP -------------------------------
        self.nlp = spacy.load("en_core_web_md")
        self.sia = SentimentIntensityAnalyzer()

        self.STOP_WORD_1 = self.nlp.Defaults.stop_words  # spaCy stop words
        self.STOP_WORD_2 = stopwords.words('english')  # NLTK stop words
        self.STOP_WORD_3 = STOP_WORDS  # spaCy stop words
        # ------------------------------------------------------------------

        # ---------------------- Language detector -------------------------
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)
        # ------------------------------------------------------------------

        self.dict_vocab = dict.fromkeys(self.vocabulary, False)

Beispiel #7

0

Datei anzeigen

Datei: nlp_process.py Projekt: hkbtotw/traffic-monitor

    def ProcessText(self, text):
        """Tokenize ``text`` and return the tokens joined by single spaces.

        The tokenizer dictionary is ``thai_words()`` extended with the
        district, street and address names read through ``database()`` and
        with ``self.specWord``.  The engine is ``self.engineSel``.  Tokens
        equal to ``""``, ``" "`` or ``"  "`` are dropped before joining.

        NOTE(review): the previous version also built punctuation- and
        digit-stripped copies of the tokens but never used them; the result
        was always derived from the raw tokens.  That dead work is removed
        here without changing the output.  If stripping was the intent,
        apply it to the tokens before the filter -- confirm with the owner.
        """
        dataBase = database()
        streetDf, addressDf = dataBase.ReadStreetName()

        streetList = dataBase.DataframeToList(streetDf)
        addressList = dataBase.DataframeToList(addressDf)
        districtList = dataBase.districtName_
        wordList = districtList + streetList + addressList

        custom_words_list = set(thai_words())
        custom_words_list.update(wordList)
        custom_words_list.update(self.specWord)

        trie = dict_trie(dict_source=custom_words_list)

        custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)
        proc = custom_tokenizer.word_tokenize(text)

        # Drop empty tokens and one-/two-space tokens in a single pass.
        blanks = ("", " ", "  ")
        procText = [token for token in proc if token not in blanks]
        return ' '.join(procText)

Beispiel #8

0

Datei anzeigen

Datei: named_entity.py Projekt: pacharapol4066/Product-Position-SNA

    'non dairy', '7 eleven', 'เซเว่น อีเลฟเว่น', 'เซเว่นอีเลฟเว่น', 'เซเว่น',
    'เซเวน', '7 11', 'สตาร์บัค', 'อเมซอน', 'ท็อปส์', 'ทอปส์', 'ท้อปส์',
    'ท๊อปส์', 'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place',
    'big c', 'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม',
    'ซื้อประจำ', 'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้',
    'นมแพะ', 'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท',
    'นำ้หนัก', 'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม',
    'ท้องเสีย', 'ขี้แตก', 'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ',
    'คนท้อง', 'มวลกระดูก', 'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย',
    'ลูกสุนัข', 'ลูกหมา', 'คายทิ้ง', 'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์',
    'สตอรี่', 'อยากสูง', 'ส่วนสูง', 'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน',
    'ไม่มี', 'ไม่ชอบ', 'ไม่ได้', 'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น',
    'อเมซอน', 'นมเมจิสีฟ้า', 'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า',
    'ไขมันทรานส์', 'ดาราเดลี่', 'แดรี่ฟาร์ม', 'แดรี่ควีน'
]
# Tokenizer dictionary: the stock Thai word list plus the custom entries above.
words = set(thai_words())
words.update(custom_list)
_trie = dict_trie(dict_source=words)
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:
    """Return True when ``word`` is one of the Thai stopwords."""
    stopword_set = thai_stopwords()
    return word in stopword_set


def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word

Beispiel #9

0

Datei anzeigen

Datei: nlp.py Projekt: parinh/pythongetpostshopee

from pythainlp import *
from pythainlp.tag.named_entity import ThaiNameTagger
from pythainlp.corpus.common import thai_words
from pythainlp.util import dict_trie
from decouple import config

# Extra words the tokenizer must keep whole, merged into the stock word list.
newWords = ["ไม่ดี", "ไม่พอใจ", "ชั่วคราว"]
custom_words_list = set(thai_words()) | set(newWords)
trie = dict_trie(dict_source=custom_words_list)
custom_tokenizer = Tokenizer(
    custom_dict=trie,
    engine='newmm',
    keep_whitespace=False,
)


class NLP:
    """Holds the word lists used for text analysis."""

    def __init__(self):
        """Create the empty word lists and load the negative-sentiment words.

        The negative words are read, one per line with trailing whitespace
        removed, from the file whose path is given by the
        ``NEGATIVE_SENTIMENT_WORDS`` config entry.
        """
        self.positive_words = []
        self.negative_words = []
        self.swear_words = []
        self.check_words = []
        self.food_words = []
        self.spa_words = []
        self.beauty_words = []
        self.travel_words = []
        self.health_words = []

        negative_path = config("NEGATIVE_SENTIMENT_WORDS")
        with open(negative_path, 'r', encoding='utf-8') as f:
            self.negative_words.extend(line.rstrip() for line in f)

Beispiel #10

0

Datei anzeigen

Datei: analysis.py Projekt: yakung/requirement-classification

    '''
        Class: Keyword
        Purpose: Contains data of keyword
    '''
    def __init__(self, word, weight):
        # word   -- Type: String
        # weight -- Type: Integer
        self.word, self.weight = word, weight


class TreeNode:
    """Tree node carrying an arbitrary payload and a list of children."""

    def __init__(self, data):
        # data: payload of any type; children: starts empty, one list per node
        self.data, self.children = data, []


# Tokenizer dictionary: the stock Thai word list plus one custom word per
# line of custom_tokenizer.txt.  The file is opened in a ``with`` block so
# the handle is closed as soon as the words are read.
THAI_WORDS = set(thai_words())
with open('requirement/data/custom_tokenizer.txt',
          encoding="utf-8") as _custom_words_file:
    THAI_WORDS.update(
        line.replace('\n', '').strip() for line in _custom_words_file)

TOKENIZER = Tokenizer(THAI_WORDS)

def _load_keywords(csv_path):
    """Read a keyword CSV into ``Keyword(row[0], int(row[1]))`` objects."""
    rows = array(read_csv(csv_path)).tolist()
    return [Keyword(row[0], int(row[1])) for row in rows]


KEYWORDS_HIGH_PRIORITY = _load_keywords(
    'requirement/data/keywords/priority-high.csv')

KEYWORDS_MEDIUM_PRIORITY = _load_keywords(
    'requirement/data/keywords/priority-medium.csv')

Beispiel #11

0

Datei anzeigen

def Processing(E1):
    """Tokenize, filter and normalise the first field of every entry in ``E1``.

    Steps applied to each entry's text:
      1. tokenize with ``eng_tokens`` and lower-case the tokens;
      2. re-tokenize the space-joined text with a dictionary ``Tokenizer``
         built from ``thai_words()`` plus a few custom words;
      3. drop Thai, English and extra stop words;
      4. stem every token with ``PorterStemmer``;
      5. replace a token by the first ``'tha'`` lemma name of its first
         WordNet synset when one exists;
      6. drop numeric tokens and tokens containing a space.

    Args:
        E1: iterable of indexable records; only element ``[0]`` of each
            record is used (presumably the text -- confirm with the caller).

    Returns:
        A list with one list of processed tokens per entry of ``E1``; an
        empty list when ``E1`` is empty.
    """
    p_stemmer = PorterStemmer()

    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan', u' ',
        u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    # A set gives constant-time membership tests in the stop-word filter.
    All_Stop_Word = (set(thaisw.words('thai')) | set(engsw.words('english'))
                     | set(Morewords))

    EntryList = [n[0] for n in E1]

    # The dictionary and tokenizer do not depend on the entry: build them once.
    words = set(thai_words())
    words.update([u'ไทยเบฟ', u'ผสานพลัง', u'โอกาส', u'ถังไม้โอ๊ค'])
    custom_tokenizer = Tokenizer(words)

    # Initialised before the loop so empty input yields [] instead of failing.
    Outcome = []
    for r in EntryList:
        lowered = " ".join(t.lower() for t in eng_tokens(r))
        Outcome.append(list(custom_tokenizer.word_tokenize(lowered)))

    NoStop = [[word for word in n if word not in All_Stop_Word]
              for n in Outcome]
    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    Lemma = [[p_stemmer.stem(word) for word in n] for n in NoStop]
    print(' Lemma : ', Lemma, ' len: ', len(Lemma))

    def _to_thai_lemma(token):
        # First Thai lemma name of the first synset, else the token unchanged.
        w_syn = wordnet.synsets(token)
        if len(w_syn) > 0:
            thai_names = w_syn[0].lemma_names('tha')
            if len(thai_names) > 0:
                return thai_names[0]
        return token

    Lemma = [[_to_thai_lemma(i) for i in n] for n in Lemma]

    # Drop numeric tokens and any token that contains a space.
    Lemma = [[i for i in n if not i.isnumeric() and ' ' not in i]
             for n in Lemma]

    return Lemma