def createBOW(ls_txt, corpus):
    """Turn each text in ``ls_txt`` into a count vector over ``corpus``.

    Every text is tokenized with the PyThaiNLP ``dict`` engine, using the
    stock Thai word list extended by a few hard-coded extra words.

    Args:
        ls_txt: Iterable of strings to vectorize.
        corpus: Ordered vocabulary; one output column per entry.

    Returns:
        A list with one row per text. Each row holds ``len(corpus)`` counts
        followed by one trailing "other" column: the number of tokens still
        left after ONE occurrence of every matched vocabulary term has been
        removed.
    """
    extra_words = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    dictionary_words = set(thai_words())
    dictionary_words.update(extra_words)
    trie = dict_trie(dict_source=dictionary_words)

    rows = []
    for text in ls_txt:
        tokens = word_tokenize(text, engine='dict', custom_dict=trie)
        row = []
        for term in corpus:
            if term not in tokens:
                row.append(0)
                continue
            row.append(tokens.count(term))
            # Only the first occurrence is dropped, so repeated terms also
            # contribute to the trailing "other" column.
            # NOTE(review): train() appears to build its features the same
            # way; keep the two in sync if this is ever changed.
            tokens.remove(term)
        # Trailing column: whatever is left over (0 when nothing remains).
        row.append(len(tokens))
        rows.append(row)
    return rows
def tokenize_text_list_test(ls):
    """Tokenize ``ls`` with several engines and time each one.

    Args:
        ls: Collection of strings; every string is tokenized once per engine.

    Returns:
        A pair ``(tokens_per_engine, seconds_per_engine)``. The first item
        holds, per engine, one flat list of all tokens from all strings; the
        second holds the matching ``time.process_time()`` deltas. Both follow
        the order of the engine list below.
    """
    print("working on")
    engine_names = [
        'cfcut', 'deepcut', 'etcc', 'longest', 'multi_cut', 'newmm', 'ssg',
        'tcc', 'trie'
    ]
    trie = dict_trie(dict_source=set(thai_words()))

    tokens_per_engine = []
    seconds_per_engine = []
    for engine_name in engine_names:
        # 'deepcut' is the only engine called without a custom dictionary.
        extra_kwargs = {} if engine_name == 'deepcut' else {'custom_dict': trie}

        started = time.process_time()
        flat_tokens = []
        for sentence in ls:
            flat_tokens.extend(
                pythainlp.tokenize.word_tokenize(
                    sentence, engine=engine_name, **extra_kwargs))
        tokens_per_engine.append(flat_tokens)
        seconds_per_engine.append(time.process_time() - started)

    return tokens_per_engine, seconds_per_engine
def __init__(
        self,
        max_len: '(int) max number of tokens per sample',
        min_len: '(int) min number of tokens per sample',
        # recommend > 0 or 1 to automatically clear white spaces and errors
        # from the tokenizer
        min_len_character: '(int) min number of characters per token' = 1,
        engine: '(str) engine used to tokenize sentences see: https://thainlp.org/pythainlp/docs/2.0/api/tokenize.html' = 'newmm',
        verbose: '(bool) if True print some comparisons of before and after processing texts' = False
):
    """Store the tokenizer settings and build the custom dictionary trie.

    Word lists are read from three UTF-8 files, one word per line, with
    blank lines ignored:

    * ``./word_configs/additional_words.txt`` - added to the dictionary
    * ``./word_configs/unwanted_words.txt``   - removed from the dictionary
    * ``./word_configs/stopwords.txt``        - stored in ``self.stopwords``

    Define rules_before_tokenization and rules_after_tokenization carefully
    (the order is important!!).
    """

    def _read_word_set(path):
        # One entry per line; surrounding whitespace and blank lines dropped.
        with open(path, 'r', encoding='utf8') as handle:
            stripped = (raw.strip() for raw in handle)
            return {entry for entry in stripped if entry != ''}

    self.max_len = max_len
    self.min_len = min_len
    self.min_len_character = min_len_character
    # NOTE(review): these two names are not parameters of this method, so
    # they must resolve from an enclosing/module scope - confirm they are
    # defined there, otherwise this raises NameError.
    self.rules_after_tokenization = rules_after_tokenization
    self.rules_before_tokenization = rules_before_tokenization
    # to make it we need to call visualize_important_words() once
    self.tfxidf_obj = None
    self.engine = engine
    self.verbose = verbose

    # You can freely define additional words, unwanted words and stopwords
    # using 1 word per line in each corresponding file.
    additional_words = _read_word_set('./word_configs/additional_words.txt')
    unwanted_words = _read_word_set('./word_configs/unwanted_words.txt')
    self.stopwords = _read_word_set('./word_configs/stopwords.txt')

    # (stock Thai words + additional words) - unwanted words
    vocabulary = (set(thai_words()) | additional_words) - unwanted_words
    self.dictionary = pythainlp.tokenize.dict_trie(vocabulary)
def main():
    """Tokenize ``input.txt`` and write a word-frequency table.

    Reads ``input.txt`` (UTF-8) and tokenizes it with the ``newmm`` engine,
    using a dictionary extended by a few domain words. Tokens that are empty,
    whitespace-only, ``-``, ``/`` or purely numeric are dropped. The remaining
    words are counted and sorted via ``wordListToFreqDict`` and
    ``sortFreqDict``, then each entry is printed and written to
    ``output-sort.csv`` as ``<s[1]>|<s[0]>``, one per line.

    Both files are handled with context managers so they are closed even when
    an error occurs. The output file is only opened once there is something
    to write.
    """
    with codecs.open('input.txt', encoding='utf-8') as f:
        text = f.read()

    custom_words_list = set(thai_words())
    custom_words_list.update(('รีเทนเนอร์', 'จัดฟัน', 'ฟันชิด'))
    trie = dict_trie(dict_source=custom_words_list)
    _tokenizer = Tokenizer(custom_dict=trie, engine='newmm')

    print('------ Starting to tokenize words ------')
    words = _tokenizer.word_tokenize(text)
    kept = [
        word.strip() for word in words
        if word and not word.isspace() and word != '-' and word != '/'
        and not word.isnumeric()
    ]
    # Join + split so a token that still contains inner whitespace is broken
    # into separate words.
    wordlist = " ".join(kept).split()

    print('------ Starting to count words: ------')
    # Computed once; the result does not depend on any per-word state.
    dictionary = wordListToFreqDict(wordlist)
    sorteddict = sortFreqDict(dictionary)

    print('------ Starting to sort words and write to file ------')
    with open("output-sort.csv", "w", encoding="utf-8") as fsort:
        for s in sorteddict:
            # NOTE(review): presumably s is (count, word) - confirm against
            # sortFreqDict.
            print(s[1], "|", s[0])
            fsort.write(f"{s[1]}|{s[0]}\n")
def train():
    """Fit a decision tree on bag-of-words features from ``Data/Expenses.csv``.

    The CSV must provide a ``text`` column (input strings) and a ``cate``
    column (target labels). Each text is tokenized with the PyThaiNLP ``dict``
    engine, using the stock Thai word list plus a few hard-coded extra words.

    Returns:
        ``(corpus, dtree)``. ``corpus`` is the list of distinct tokens in
        first-seen order (one feature column each). ``dtree`` is the fitted
        ``DecisionTreeClassifier``. Each feature row has ``len(corpus)``
        counts plus one trailing "other" column.
    """
    df = pd.read_csv("Data/Expenses.csv")

    custom_dict = set(thai_words())
    custom_dict.update(
        ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย'])
    trie = dict_trie(dict_source=custom_dict)

    # Tokenize every text exactly once; vocabulary and feature rows both
    # reuse the result.
    tokenized = [
        word_tokenize(text, engine='dict', custom_dict=trie)
        for text in df.text
    ]

    # dict.fromkeys de-duplicates in linear time and keeps first-seen order.
    corpus = list(
        dict.fromkeys(token for tokens in tokenized for token in tokens))

    BOW = []
    for tokens in tokenized:
        remaining = list(tokens)  # consumed below; keep `tokenized` intact
        row = []
        for term in corpus:
            if term in remaining:
                row.append(remaining.count(term))
                # Only one occurrence is removed, so repeated tokens also
                # count towards the trailing "other" column.
                # NOTE(review): createBOW() appears to build prediction
                # features the same way; change both together or not at all.
                remaining.remove(term)
            else:
                row.append(0)
        row.append(len(remaining))  # trailing "other" column
        BOW.append(row)

    dtree = DecisionTreeClassifier()
    dtree.fit(X=BOW, y=df.cate)
    return corpus, dtree
def __init__(self):
    """Load the Thai and English NLP resources used by this object.

    Populates, in this order: the Thai stop-word list, the classifier and
    vocabulary returned by ``self.loadData("Modul")``, a Thai dictionary
    trie, the spaCy pipeline, a sentiment analyzer, three English stop-word
    collections, the language-detector pipe and a per-vocabulary flag dict.
    """
    # ---------------------- NLP thai ----------------------------------
    # stopwords_th.txt: one stop word per line, whitespace stripped.
    # NOTE(review): no encoding is passed, so the platform default is used -
    # confirm this matches the file's encoding.
    with codecs.open("dataset/stopwords_th.txt", "r") as f:
        self.stopwords_thai = [raw.strip() for raw in f]

    loaded = self.loadData("Modul")
    self.classifier = loaded[0]
    self.vocabulary = loaded[1]

    # Thai words. (A disabled variant also extended the dictionary with the
    # lines of dataset/thai_words.txt.)
    self.custom_tokenizer = dict_trie(thai_words())
    # ------------------------------------------------------------------

    # ---------------------- NLP english -------------------------------
    self.nlp = spacy.load("en_core_web_md")
    self.sia = SentimentIntensityAnalyzer()
    self.STOP_WORD_1 = self.nlp.Defaults.stop_words  # spaCy stop words
    self.STOP_WORD_2 = stopwords.words('english')  # NLTK stop words
    self.STOP_WORD_3 = STOP_WORDS  # spaCy stop words
    # ------------------------------------------------------------------

    # ---------------------- detector ----------------------------------
    self.nlp.add_pipe(LanguageDetector(),
                      name='language_detector',
                      last=True)
    # ------------------------------------------------------------------

    # Every vocabulary entry starts out flagged False.
    self.dict_vocab = dict.fromkeys(self.vocabulary, False)
def ProcessText(self, text):
    """Tokenize ``text`` and return the tokens joined by single spaces.

    The tokenizer dictionary is the stock Thai word list plus the district,
    street and address names read from the database and the words in
    ``self.specWord``. The engine comes from ``self.engineSel``.

    Tokens equal to ``" "`` or ``""`` are dropped; all other tokens are kept
    unchanged, including punctuation and digits.

    Args:
        text: The string to tokenize.

    Returns:
        The surviving tokens joined with ``' '``.
    """
    dataBase = database()
    streetDf, addressDf = dataBase.ReadStreetName()
    streetList = dataBase.DataframeToList(streetDf)
    addressList = dataBase.DataframeToList(addressDf)
    districtList = dataBase.districtName_
    wordList = districtList + streetList + addressList

    custom_words_list = set(thai_words())
    custom_words_list.update(wordList)
    custom_words_list.update(self.specWord)
    trie = dict_trie(dict_source=custom_words_list)
    custom_tokenizer = Tokenizer(custom_dict=trie, engine=self.engineSel)

    proc = custom_tokenizer.word_tokenize(text)
    # NOTE(review): earlier punctuation- and digit-stripped copies of the
    # tokens were built but never used, so the returned text has always kept
    # punctuation and digits. That dead work is gone; the result is the same.
    procText = [token for token in proc if token != " " and token != ""]
    return ' '.join(procText)
'non dairy', '7 eleven', 'เซเว่น อีเลฟเว่น', 'เซเว่นอีเลฟเว่น', 'เซเว่น',
'เซเวน', '7 11', 'สตาร์บัค', 'อเมซอน', 'ท็อปส์', 'ทอปส์', 'ท้อปส์', 'ท๊อปส์',
'แมคโคร', 'แม็คโคร', 'โลตัส', 'บิ๊กซี', 'bigc', 'golden place', 'big c',
'ขายไม่ดี', 'แพคคู่', 'ค่าจัดส่ง', 'shelf life', 'พนักงานขายนม', 'ซื้อประจำ',
'หายาก', 'หาซื้อ', 'ของแถม', 'ราคาสูง', 'น้ำนมโค', 'นมโคแท้', 'นมแพะ',
'นมโรงเรียน', 'แพ้นม', 'แพ้นมวัว', 'นมอัดเม็ด', 'เล่นเวท', 'นำ้หนัก',
'คุณแม่มือใหม่', 'นมอุ่น', 'ชานม', 'กินนม', 'ดื่มนม', 'ท้องเสีย', 'ขี้แตก',
'คุมอาหาร', 'นักวิ่ง', 'ร้านนมสด', 'ดูแลสุขภาพ', 'คนท้อง', 'มวลกระดูก',
'คีเฟอร์นม', 'พันทิป', 'ร้านนม', 'เหมียวน้อย', 'ลูกสุนัข', 'ลูกหมา', 'คายทิ้ง',
'เจมส์ จิ', 'เจมส์จิ', 'ณเดช', 'ณเดชน์', 'สตอรี่', 'อยากสูง', 'ส่วนสูง',
'สูงขึ้น', 'รักษามะเร็ง', 'รักษาเบาหวาน', 'ไม่มี', 'ไม่ชอบ', 'ไม่ได้',
'ไม่อร่อย', 'ชาไข่มุก', 'ชานมไข่มุก', 'นมข้น', 'อเมซอน', 'นมเมจิสีฟ้า',
'ทำฟอง', 'ตีฟอง', 'โฟมนม', 'มื้อเช้า', 'ไขมันทรานส์', 'ดาราเดลี่',
'แดรี่ฟาร์ม', 'แดรี่ควีน'
]
# Tokenizer dictionary: the stock Thai word list merged with custom_list
# (presumably the list that ends just above - its opening is not visible here).
words = set(thai_words()).union(set(custom_list))
_trie = dict_trie(dict_source=words)
# Shared tokenizer backed by the merged dictionary; the engine is chosen by
# _TOKENIZER_ENGINE, which is defined elsewhere.
_tokenizer = Tokenizer(custom_dict=_trie, engine=_TOKENIZER_ENGINE)

########################################################


def _is_stopword(word: str) -> bool:  # Check whether the word is a stopword (filler word)
    """Return True when ``word`` is in the set returned by thai_stopwords()."""
    return word in thai_stopwords()


def _doc2features(doc, i) -> dict:
    # Each doc entry is indexed as [0] -> word and [1] -> POS tag.
    # NOTE(review): the rest of this function is outside the visible span.
    word = doc[i][0]
    postag = doc[i][1]
    # Features from current word
from pythainlp import *
from pythainlp.tag.named_entity import ThaiNameTagger
from pythainlp.corpus.common import thai_words
from pythainlp.util import dict_trie
from decouple import config

# Extra words added to the stock Thai dictionary so the tokenizer keeps each
# of them as a single token.
newWords = ["ไม่ดี", "ไม่พอใจ", "ชั่วคราว"]
custom_words_list = set(thai_words())
custom_words_list.update(newWords)
trie = dict_trie(dict_source=custom_words_list)
# Module-wide newmm tokenizer using the extended dictionary; whitespace
# tokens are not kept.
custom_tokenizer = Tokenizer(custom_dict=trie,
                             engine='newmm',
                             keep_whitespace=False)


class NLP:
    """Holder for the word lists used by the text analysis.

    Every list starts out empty. In the code visible here only
    ``negative_words`` is filled, from the file whose path is stored in the
    ``NEGATIVE_SENTIMENT_WORDS`` configuration value.
    """

    def __init__(self):
        # Sentiment-related lists.
        self.positive_words = []
        self.negative_words = []
        self.swear_words = []
        self.check_words = []
        # Topic-related lists.
        self.food_words = []
        self.spa_words = []
        self.beauty_words = []
        self.travel_words = []
        self.health_words = []

        negative_path = config("NEGATIVE_SENTIMENT_WORDS")
        with open(negative_path, 'r', encoding='utf-8') as f:
            # One entry per line; trailing whitespace/newline stripped.
            self.negative_words.extend(entry.rstrip() for entry in f)
'''
Class: Keyword
Purpose: Contains data of keyword
'''
# NOTE(review): the docstring above and the __init__ below belong to a class
# whose header lies outside the visible span.
def __init__(self, word, weight):
    self.word = word  # Type: String
    self.weight = weight  # Type: Integer


class TreeNode():
    """Simple tree node: a payload plus a list of child entries."""

    def __init__(self, data):
        self.data = data  # Type: <Dynamic>
        self.children = list()  # Type: List<Dynamic>


# Tokenizer dictionary: stock Thai words plus one custom word per line of
# custom_tokenizer.txt. The file is opened in a context manager so the handle
# is closed as soon as it has been read.
THAI_WORDS = set(thai_words())
with open('requirement/data/custom_tokenizer.txt',
          encoding="utf-8") as _custom_words_file:
    for i in _custom_words_file:
        THAI_WORDS.add(i.replace('\n', '').strip())
TOKENIZER = Tokenizer(THAI_WORDS)

# Keyword tables: each CSV row is read as (word, weight), with the weight
# converted to int.
KEYWORDS_HIGH_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-high.csv')).tolist()
]
KEYWORDS_MEDIUM_PRIORITY = [
    Keyword(i[0], int(i[1])) for i in array(
        read_csv('requirement/data/keywords/priority-medium.csv')).tolist()
]
def Processing(E1):
    """Tokenize, clean and normalize the first field of every entry in ``E1``.

    Steps applied to each entry's first field, in order:

    1. Tokenize with ``eng_tokens``, lower-case, re-join with spaces, then
       tokenize again with a PyThaiNLP ``Tokenizer`` whose dictionary is the
       stock Thai word list plus a few custom words.
    2. Drop stop words (Thai + English + a small hand-written list).
    3. Stem each token with the Porter stemmer.
    4. Replace a token by the first ``'tha'`` lemma name of its first WordNet
       synset when one exists; otherwise keep the token.
    5. Drop purely numeric tokens and tokens containing a space.

    Args:
        E1: Iterable of indexable records; only index ``0`` is read.

    Returns:
        A list with one token list per entry of ``E1``.
    """
    p_stemmer = PorterStemmer()
    ThaiWord = list(thaisw.words('thai'))
    EngWord = list(set(engsw.words('english')))
    Morewords = [
        u'การ', u'การทำงาน', u'ทำงาน', u'เสมอ', u'krub', u'Test', u'nan',
        u' ', u'test', u'.', u',', u'ทำ', u'-', u'/'
    ]
    # A set gives constant-time membership tests in the filtering step.
    All_Stop_Word = frozenset(ThaiWord + EngWord + Morewords)

    # The custom dictionary does not depend on the entry, so build the
    # tokenizer once instead of once per entry.
    words = set(thai_words())
    words.update((u'ไทยเบฟ', u'ผสานพลัง', u'โอกาส', u'ถังไม้โอ๊ค'))
    custom_tokenizer = Tokenizer(words)

    Outcome = []
    for record in E1:
        lowered = " ".join(t.lower() for t in eng_tokens(record[0]))
        Outcome.append(list(custom_tokenizer.word_tokenize(lowered)))

    NoStop = [[word for word in tokens if word not in All_Stop_Word]
              for tokens in Outcome]
    print(' No stop : ', NoStop, ' len: ', len(NoStop))

    Lemma = [[p_stemmer.stem(word) for word in tokens] for tokens in NoStop]
    print(' Lemma : ', Lemma, ' len: ', len(Lemma))

    result = []
    for tokens in Lemma:
        mapped = []
        for token in tokens:
            synsets = wordnet.synsets(token)
            thai_names = synsets[0].lemma_names('tha') if synsets else []
            # Prefer the Thai lemma of the first synset; fall back to the
            # token itself when WordNet has nothing for it.
            mapped.append(thai_names[0] if thai_names else token)
        # Final clean-up: no pure numbers, no tokens containing a space.
        result.append(
            [tok for tok in mapped if not tok.isnumeric() and ' ' not in tok])
    return result