Example #1
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie


def createBOW(ls_txt, corpus):
    """Turn each text in ls_txt into a bag-of-words count vector over corpus,
    plus one trailing column counting tokens that are not in corpus."""
    # Extend the default Thai dictionary with a few domain-specific words.
    custom_dict = set(thai_words())
    extra_words = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    custom_dict.update(extra_words)
    trie = dict_trie(dict_source=custom_dict)

    BOW_t = [list() for _ in range(len(ls_txt))]
    for l, text in enumerate(ls_txt):
        tmp = word_tokenize(text, engine='dict', custom_dict=trie)
        for j in corpus:
            if j in tmp:
                BOW_t[l].append(tmp.count(j))
                # Drop every occurrence of j so it is not double-counted
                # in the trailing "other tokens" column below.
                tmp = [t for t in tmp if t != j]
            else:
                BOW_t[l].append(0)

        # Last column: number of tokens that are not in the corpus.
        BOW_t[l].append(len(tmp))

    # corpus_t = corpus.append('Other')
    # ch = pd.DataFrame({
    #     'train': corpus,
    #     'target': BOW_t[0]
    # })
    # ch
    # predictiontree = dtree.predict(BOW_t)
    return BOW_t
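
A minimal usage sketch (not part of the original example): the corpus below is a hypothetical hand-written vocabulary; Example #5 shows how a corpus like this is built from training data.

sample_texts = ['ซื้อคาปูชิโน่ที่อิเกีย', 'ซื้อน้ำมันหอยลาย']
corpus = ['ซื้อ', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย']  # hypothetical vocabulary from a training step
vectors = createBOW(sample_texts, corpus)
print(vectors)  # one row per text: counts over corpus plus a trailing "other tokens" count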
Example #2
import time
from itertools import chain

import pythainlp
from pythainlp.corpus.common import thai_words
from pythainlp.util import dict_trie


def tokenize_text_list_test(ls):
    """Tokenize every text in ls with several engines and time each one."""
    print("working on")
    engines = [
        'cfcut', 'deepcut', 'etcc', 'longest', 'multi_cut', 'newmm', 'ssg',
        'tcc', 'trie'
    ]
    # engines = ['cfcut', 'newmm']
    custom_dict = set(thai_words())
    trie = dict_trie(dict_source=custom_dict)
    tokens_per_engine, seconds_per_engine = [], []
    for engine in engines:
        start = time.process_time()
        if engine == 'deepcut':
            # deepcut does not accept a custom dictionary.
            tokens = list(
                chain.from_iterable(
                    pythainlp.tokenize.word_tokenize(l, engine=engine)
                    for l in ls))
        else:
            tokens = list(
                chain.from_iterable(
                    pythainlp.tokenize.word_tokenize(
                        l, engine=engine, custom_dict=trie) for l in ls))
        tokens_per_engine.append(tokens)
        # print(tokens)
        seconds_per_engine.append(time.process_time() - start)
    return tokens_per_engine, seconds_per_engine
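
A short usage sketch (illustrative, not from the original): note that some engines in the list depend on optional extras (for example deepcut and ssg) and will raise an error if those packages are not installed.

sample = ['ฉันดื่มคาปูชิโน่ที่อิเกีย']
tokens_per_engine, seconds_per_engine = tokenize_text_list_test(sample)
print(seconds_per_engine)  # one CPU-time measurement per engine, in list order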
Example #3
    def __init__(
        self,
        max_len: '(int) max number of tokens per sample',
        min_len: '(int) min number of tokens per sample',
        min_len_character:
        '(int) min number of characters per token' = 1,  # recommended >= 1 so whitespace tokens and tokenizer errors are cleared automatically
        #  do_padding: '(bool) use "-PAD-" to pad the sentences until their length is equal to max_len' = False,
        #  return_mask: '(bool) if True also return list of booleans indicating where the -PAD- is (True for real tokens and False for -PAD- token)' = False,
        rules_before_tokenization:
        '(Collection[function(str)]) Collection of functions taking sentence-level input string' = None,
        rules_after_tokenization:
        '(Collection[function(list[str])]) Collection of functions taking list of tokens' = None,
        #  stopwords: '(set[string]) set of stopwords' = {},
        engine:
        '(str) engine used to tokenize sentences see: https://thainlp.org/pythainlp/docs/2.0/api/tokenize.html' = 'newmm',
        verbose:
        '(bool) if True print some comparisons of before and after processing texts' = False
        #  additional_words: '(Collection[str]) Collection of words to "add" to the dictionary **duplicated words will be eliminated automatically**' = {},
        #  unwanted_words: '(Collection[str]) Collection of words to "remove" from the dictionary **duplicated words will be eliminated automatically**' = {}
    ):
        # Define rules_before_tokenization and rules_after_tokenization carefully (the order is important!!)
        self.max_len = max_len
        self.min_len = min_len
        self.min_len_character = min_len_character
        self.rules_after_tokenization = rules_after_tokenization
        self.rules_before_tokenization = rules_before_tokenization
        self.tfxidf_obj = None  # set the first time visualize_important_words() is called
        # self.stopwords = stopwords
        self.engine = engine
        # self.do_padding = do_padding
        self.verbose = verbose
        # self.return_mask = return_mask

        # Additional words, unwanted words and stopwords can be defined freely,
        # one word per line, in the corresponding files under ./word_configs/.

        additional_words = set()
        with open('./word_configs/additional_words.txt', 'r',
                  encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if line != '':
                    additional_words.add(line)
        #print(f'additional_words: {additional_words}')
        unwanted_words = set()
        with open('./word_configs/unwanted_words.txt', 'r',
                  encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if line != '':
                    unwanted_words.add(line)
        #print(f'unwanted_words: {unwanted_words}')
        self.stopwords = set()
        with open('./word_configs/stopwords.txt', 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if line != '':
                    self.stopwords.add(line)
        #print(f'self.stopwords: {self.stopwords}')
        # Custom dictionary: default Thai words, plus additional words, minus unwanted words.
        self.dictionary = pythainlp.tokenize.dict_trie(
            set(thai_words()).union(additional_words).difference(unwanted_words))
Example #4
import codecs

from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util import dict_trie

# wordListToFreqDict() and sortFreqDict() are assumed to be defined elsewhere in
# the original project (they build and sort a word -> frequency dictionary).


def main():
    engineOption = ["newmm", "longest-matching", "dict", "ulmfit"]
    f = codecs.open('input.txt', encoding='utf-8')
    fsort = open("output-sort.csv", "w", encoding="utf-8")

    text = ""
    for line in f:
        # print(line)
        text = text + line
    f.close()

    # Extend the default Thai dictionary with domain-specific words.
    custom_words_list = set(thai_words())
    custom_words_list.add('รีเทนเนอร์')
    custom_words_list.add('จัดฟัน')
    custom_words_list.add('ฟันชิด')
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    # words = word_tokenize(text, engine=engineOption[0])
    words = _tokenizer.word_tokenize(text)
    i = 0
    wordsNew = ""
    for word in words:
        # Skip blanks, separators and pure numbers.
        if word and not word.isspace() and word != '-' and word != '/' \
                and not word.isnumeric():
            i = i + 1
            # print(i, ': ', word.strip())
            wordsNew = wordsNew + word.strip() + " "

    print('------ Starting to count words: ------')
    wordlist = wordsNew.split()
    # Build and sort the frequency dictionary once; print a progress dot per word.
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)
    for w in wordlist:
        i = i + 1
        if i % 150 == 0:
            print(".")
        else:
            print(".", end='')

    print('------ Starting to sort words and write to file ------')
    for s in sorteddict:
        print(s[1], "|", s[0])
        fsort.write(s[1] + "|" + str(s[0]))
        fsort.write('\n')
    fsort.close()
Example #5
import pandas as pd
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie
from sklearn.tree import DecisionTreeClassifier


def train():
    df = pd.read_csv("Data/Expenses.csv")

    # Extend the default Thai dictionary with a few domain-specific words.
    custom_dict = set(thai_words())
    extra_words = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    custom_dict.update(extra_words)
    trie = dict_trie(dict_source=custom_dict)

    # Corpus: every distinct token seen in the training texts.
    corpus = []
    for text in df.text:
        for token in word_tokenize(text, engine='dict', custom_dict=trie):
            if token not in corpus:
                corpus.append(token)

    # Bag-of-words matrix: counts over the corpus plus one "other tokens" column.
    BOW = [list() for _ in range(len(df.text))]
    for l, text in enumerate(df.text):
        tmp = word_tokenize(text, engine='dict', custom_dict=trie)
        for j in corpus:
            if j in tmp:
                BOW[l].append(tmp.count(j))
                # Drop every occurrence of j so it is not double-counted
                # in the trailing "other tokens" column below.
                tmp = [t for t in tmp if t != j]
            else:
                BOW[l].append(0)
        BOW[l].append(len(tmp))

    ytarget = df.cate
    xtrain = BOW

    dtree = DecisionTreeClassifier()
    dtree.fit(X=xtrain, y=ytarget)

    return corpus, dtree
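
An end-to-end sketch (illustrative, not part of the original): it combines train() with createBOW() from Example #1 and assumes Data/Expenses.csv exists with text and cate columns, as the code above requires.

corpus, dtree = train()
new_texts = ['ซื้อคาปูชิโน่ที่อิเกีย']
features = createBOW(new_texts, corpus)  # same corpus, so the feature layout matches training
print(dtree.predict(features))  # predicted expense category for each text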
Example #6
    def __init__(self):
        # ----------------------NLP thai------------------------------------
        # stopwords_th.txt
        with codecs.open("dataset/stopwords_th.txt", "r", encoding="utf-8") as f:
            lines = f.readlines()  # the with-block closes the file automatically
        listpos = [e.strip() for e in lines]
        del lines

        self.stopwords_thai = listpos

        modul = self.loadData("Modul")

        self.classifier = modul[0]
        self.vocabulary = modul[1]

        # Thai words
        """read = open("dataset/thai_words.txt", "r")
        words = []
        add_words = set(thai_words())  # thai_words() returns frozenset
        for m in read:
            add_words.add(m.split("\n")[0])"""
        self.custom_tokenizer = dict_trie(thai_words())  # a Trie of the default Thai dictionary, usable as a custom_dict
        # ------------------------------------------------------------------

        # ----------------------NLP english------------------------------------
        self.nlp = spacy.load("en_core_web_md")

        self.sia = SentimentIntensityAnalyzer()

        self.STOP_WORD_1 = self.nlp.Defaults.stop_words  # spaCy stop words

        self.STOP_WORD_2 = stopwords.words('english')  # NLTK stop words
        self.STOP_WORD_3 = STOP_WORDS  # spaCy stop words
        # ------------------------------------------------------------------

        # ----------------------detector--------------------------------------------
        self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        # --------------------------------------------------------------------------

        self.dict_vocab = {i:False for i in self.vocabulary}
Example #7
    def ProcessText(self, text):

        dataBase = database()
        streetDf, addressDf = dataBase.ReadStreetName()

        streetList = dataBase.DataframeToList(streetDf)
        addressList = dataBase.DataframeToList(addressDf)
        districtList = dataBase.districtName_
        wordList = districtList + streetList + addressList

        custom_words_list = set(thai_words())
        custom_words_list.update(wordList)
        custom_words_list.update(self.specWord)

        trie = dict_trie(dict_source=custom_words_list)

        custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)
        proc = custom_tokenizer.word_tokenize(text)

        # Strip punctuation and digits from the tokens.
        # Note: cleanList is computed but not used below; joinText is built
        # from the raw tokens in proc.
        cleanList_1 = [
            i.translate(str.maketrans('', '', string.punctuation)) for i in proc
        ]
        cleanList = [
            i.translate(str.maketrans('', '', '1234567890'))
            for i in cleanList_1
        ]

        # Drop empty and whitespace-only tokens.
        procText = list(filter(lambda x: x not in ("", " ", "  "), proc))
        #procText = list(filter(lambda x: len(x)>2, procText))
        joinText = ' '.join(procText)
        #print(joinText)
        return joinText
Example #8
custom_list = [
    # ... (earlier entries not shown) ...
    'non dairy', '7 eleven', 'เซเว่น อีเลฟเว่น', 'เซเว่นอีเลฟเว่น', 'เซเว่น',
    'เซเวน', '7 11', 'สตาร์บัค', 'อเมซอน', 'ท็อปส์', 'ทอปส์', 'ท้อปส์',
    'ท๊อปส์', 'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place',
    'big c', 'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม',
    'ซื้อประจำ', 'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้',
    'นมแพะ', 'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท',
    'นำ้หนัก', 'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม',
    'ท้องเสีย', 'ขี้แตก', 'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ',
    'คนท้อง', 'มวลกระดูก', 'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย',
    'ลูกสุนัข', 'ลูกหมา', 'คายทิ้ง', 'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์',
    'สตอรี่', 'อยากสูง', 'ส่วนสูง', 'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน',
    'ไม่มี', 'ไม่ชอบ', 'ไม่ได้', 'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น',
    'อเมซอน', 'นมเมจิสีฟ้า', 'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า',
    'ไขมันทรานส์', 'ดาราเดลี่', 'แดรี่ฟาร์ม', 'แดรี่ควีน'
]
words = set(thai_words()).union(set(custom_list))
_trie = dict_trie(dict_source=words)
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:  # check whether the word is a Thai stopword
    return word in thai_stopwords()


def _doc2features(doc, i) -> dict:
    word = doc[i][0]
    postag = doc[i][1]

    # Features from current word
Example #9
from pythainlp import *
from pythainlp.tag.named_entity import ThaiNameTagger
from pythainlp.corpus.common import thai_words
from pythainlp.util import dict_trie
from decouple import config

newWords = ["ไม่ดี", "ไม่พอใจ", "ชั่วคราว"]
custom_words_list = set(thai_words())
custom_words_list.update(newWords)
trie = dict_trie(dict_source=custom_words_list)
custom_tokenizer = Tokenizer(custom_dict=trie,
                             engine='newmm',
                             keep_whitespace=False)
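
A quick illustrative check (not part of the original): with 'ไม่ดี' in the custom dictionary, the negated form should stay a single token instead of being split into 'ไม่' and 'ดี'; the exact output depends on the installed PyThaiNLP version and dictionary.

print(custom_tokenizer.word_tokenize('อาหารไม่ดี'))  # e.g. ['อาหาร', 'ไม่ดี']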


class NLP:
    def __init__(self):
        self.positive_words = []
        self.negative_words = []
        self.swear_words = []
        self.check_words = []
        self.food_words = []
        self.spa_words = []
        self.beauty_words = []
        self.travel_words = []
        self.health_words = []

        with open(config("NEGATIVE_SENTIMENT_WORDS"), 'r',
                  encoding='utf-8') as f:
            for line in f:
                self.negative_words.append(line.rstrip())
Example #10
from numpy import array
from pandas import read_csv
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import Tokenizer


class Keyword():
    '''
        Class: Keyword
        Purpose: Contains data of keyword
    '''
    def __init__(self, word, weight):
        self.word = word  # Type: String
        self.weight = weight  # Type: Integer


class TreeNode():
    def __init__(self, data):
        self.data = data  # Type: <Dynamic>
        self.children = list()  # Type: List<Dynamic>


THAI_WORDS = set(thai_words())
with open('requirement/data/custom_tokenizer.txt', encoding="utf-8") as f:
    THAI_WORDS.update(line.strip() for line in f if line.strip())

TOKENIZER = Tokenizer(THAI_WORDS)

KEYWORDS_HIGH_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-high.csv')).tolist()
]

KEYWORDS_MEDIUM_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-medium.csv')).tolist()
]
Example #11
def Processing(E1):
    # Assumed module-level imports from the original project: PorterStemmer
    # (nltk.stem.porter), stopword corpora aliased as thaisw / engsw, an English
    # tokenizer aliased as eng_tokens, wordnet (nltk.corpus), thai_words
    # (pythainlp.corpus) and Tokenizer (pythainlp.tokenize).

    p_stemmer = PorterStemmer()

    ThaiWord = list(thaisw.words('thai'))
    #print(' Thaiwords : ', ThaiWord)
    EngWord = list(set(engsw.words('english')))
    #print(' ew : ',EngWord, ' : ', type(EngWord))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan', u' ',
        u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    All_Stop_Word = ThaiWord + EngWord + Morewords
    #print(' ALL : ',All_Stop_Word)

    EntryList = []
    for n in E1:
        # check=detect(n[0])   # th or en
        #print(' text : ', n[0], ' :: ',check)
        EntryList.append(n[0])

    #print(' EntryList : ', EntryList)

    # Build the custom Thai tokenizer once, outside the loop.
    words = set(thai_words())
    words.add(u'ไทยเบฟ')
    words.add(u'ผสานพลัง')
    words.add(u'โอกาส')
    words.add(u'ถังไม้โอ๊ค')
    custom_tokenizer = Tokenizer(words)

    Outcome = []
    for r in EntryList:
        tokens = list(eng_tokens(r))
        lowered = [t.lower() for t in tokens]
        #print(' Dummy : ',lowered)
        lowered = " ".join(lowered)
        #Dummy=list(thai_tokens(lowered, engine='newmm'))
        Dummy = list(custom_tokenizer.word_tokenize(lowered))
        #print(' Dummy 2 : ',Dummy)
        Outcome.append(Dummy)

    #print(' Outcome : ',Outcome, ' : ', len(Outcome))

    # Remove Thai/English stopwords and the extra filler tokens.
    NoStop = [[word for word in n if word not in All_Stop_Word] for n in Outcome]

    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    # Stem the (mostly English) tokens with the Porter stemmer.
    Lemma = [[p_stemmer.stem(word) for word in n] for n in NoStop]

    print(' Lemma : ', Lemma, ' len: ', len(Lemma))
    '''
    # Instantiate the WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    # Lemmatize all tokens into a new list: lemmatized
    Lemma=[]
    for n in NoStop:
        Dummy=[]
        Dummy = [wordnet_lemmatizer.lemmatize(t) for t in n]
        Lemma.append(Dummy)
    #print(' lemma : ', Lemma, '  ::  ', type(Lemma))
    '''

    # Replace each token with its Thai WordNet lemma when one exists.
    Lemma_temp = []
    for n in Lemma:
        Dummy = []
        for i in n:
            w_syn = wordnet.synsets(i)
            if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
                Dummy.append(w_syn[0].lemma_names('tha')[0])
            else:
                Dummy.append(i)
        Lemma_temp.append(Dummy)

    Lemma = Lemma_temp

    # Drop numeric tokens, then tokens that still contain spaces.
    Lemma = [[i for i in n if not i.isnumeric()] for n in Lemma]
    Lemma = [[i for i in n if ' ' not in i] for n in Lemma]

    #print(' lemma : ', Lemma, '  ::  ', type(Lemma))
    return Lemma