def segment(text, data=None):
    if not data:
        wordcut = Wordcut.bigthai()
    else:
        word_list = list(set(data))
        wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)

def segment(text,data=""): if data=="": wordcut = Wordcut.bigthai() else: word_list = list(set(data)) wordcut = Wordcut(word_list) return wordcut.tokenize(text)
import csv
import operator

from nltk import ngrams  # assumed source of ngrams(); the original omits its imports
from wordcut import Wordcut


def main():
    f = readFile('./data/comments-removing-redundant.csv')  # readFile() is defined elsewhere in the source project
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
    dict_twogram = {}
    spamreader = csv.reader(f, delimiter=',')
    for row in spamreader:
        # only rows whose first column falls in the 921-2173 range
        if 921 <= int(row[0]) <= 2173:
            token = wordcut.tokenize(row[3].replace(' ', ''))
            twograms = ngrams(token, 2)
            try:
                for i in twograms:
                    if i in dict_twogram:
                        dict_twogram[i] += 1
                    else:
                        dict_twogram[i] = 1
            except TypeError:
                pass
    sorted_x = sorted(dict_twogram.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x)

def segment(text):
    wordcut = Wordcut.bigthai()
    return wordcut.tokenize(text)

def warpcut(text):
    # word_list is expected to be defined at module level in the source file
    wordcut = Wordcut(word_list)
    return wordcut.tokenize(text)

def test():
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
        print(wordcut.tokenize("ไม่ค่อยชอบกลิ่นเลยค่ะ"))

def pattern_skinProtection(row, f, debugMode):
    # per-feature sentiment scores
    sticky = 0
    permeate = 0
    stain = 0
    smell = 0
    moist = 0
    irritate = 0
    waterproof = 0
    sunproof = 0
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
    comment = row[3].replace('ๆ', '').split(' ')
    for part in comment:
        token = wordcut.tokenize(part)
        try:
            for i in range(len(token)):
                pos_sentiment = False
                neg_sentiment = False
                inv_sentiment = False
                check_case_two = False
                case = 0
                if token[i] in features_skin_protection:  # cases 1 and 2
                    # case 2 applies when a sentiment word appears at the
                    # feature word or in the three tokens before it
                    for checkcase in range(4):  # offsets 0-3
                        if (0 <= i - checkcase < len(token)) and (
                                token[i - checkcase] in positive_sentiments_skin_protection
                                or token[i - checkcase] in negative_sentiments_skin_protection):
                            check_case_two = True
                    if check_case_two:  # case 2: look backwards for sentiment
                        case = 2
                        for b in range(5):
                            if i - b >= 0 and token[i - b] in positive_sentiments_skin_protection:
                                pos_sentiment = True
                            elif i - b >= 0 and token[i - b] in negative_sentiments_skin_protection:
                                neg_sentiment = True
                            elif i - b >= 0 and token[i - b] in inverse_sentiments_skin_protection:
                                inv_sentiment = True
                                break
                    else:  # case 1: look forwards for sentiment
                        case = 1
                        for d in range(5):
                            if d < 2:
                                if (0 <= i - d < len(token)) and token[i - d] in inverse_sentiments_skin_protection:
                                    inv_sentiment = True
                            if d < 4:
                                if (i + d >= 0 and i + 1 + d < len(token)) and token[i + d] in inverse_sentiments_skin_protection:
                                    inv_sentiment = True
                            if (0 <= i + d < len(token)) and token[i + d] in positive_sentiments_skin_protection:
                                pos_sentiment = True
                                break
                            elif (0 <= i + d < len(token)) and token[i + d] in negative_sentiments_skin_protection:
                                neg_sentiment = True
                                break
                    # score calculation: an inverse (negation) word flips the polarity
                    if not inv_sentiment:
                        if token[i] in {'เหนียวเหนอะหนะ', 'เหนอะหนะ', 'เหนียว', 'เหนอะ', 'หนืด'}:
                            sticky += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'ซึม'}:
                            permeate += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'คราบ'}:
                            stain += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'กลิ่น', 'หอม'}:
                            smell += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'ชุ่มชื่น', 'ชุ่ม'}:
                            moist += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'ระคายเคือง'}:
                            irritate += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'กันน้ำ'}:
                            waterproof += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] in {'กันแดด', 'แสงแดด'}:
                            sunproof += int(pos_sentiment) - int(neg_sentiment)
                    else:
                        if token[i] in {'เหนียวเหนอะหนะ', 'เหนอะหนะ', 'เหนียว', 'เหนอะ', 'หนืด'}:
                            sticky += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'ซึม'}:
                            permeate += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'คราบ'}:
                            stain += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'กลิ่น'}:
                            smell += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'ชุ่มชื่น', 'ชุ่ม'}:
                            moist += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'ระคายเคือง'}:
                            irritate += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'กันน้ำ'}:
                            waterproof += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] in {'กันแดด', 'แสงแดด'}:
                            sunproof += -int(pos_sentiment) + int(neg_sentiment)
                    report(token, row[0], token[i], case, pos_sentiment,
                           neg_sentiment, inv_sentiment, debugMode)
                else:  # case 3: not a feature word
                    pass
        except TypeError:  # assumed guard; the original except clause is cut off, mirrors the other snippets
            pass

def pattern_lipstick(row, f, debugMode):
    color = 0
    smell = 0
    durable = 0
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
    comment = row[3].replace('ๆ', '')
    token = wordcut.tokenize(comment)
    try:
        for i in range(len(token)):
            pos_sentiment = False
            neg_sentiment = False
            inv_sentiment = False
            check_case_two = False
            case = 0
            if token[i] in features_lip:  # cases 1 and 2
                for a in range(2):
                    # accumulate with `or` so a match at one offset is not
                    # overwritten by the next (the original reassigned each pass)
                    check_case_two = check_case_two or ((i - a >= 0) and (
                        token[i - a] in positive_sentiments_lip
                        or token[i - a] in negative_sentiments_lip))
                if check_case_two:  # case 2: look backwards for sentiment
                    case = 2
                    for b in range(4):
                        if b < 3 and i - b >= 0 and token[i - b] in positive_sentiments_lip:
                            pos_sentiment = True
                        if b < 3 and i - b >= 0 and token[i - b] in negative_sentiments_lip:
                            neg_sentiment = True
                        if i - b >= 0 and token[i - b] in inverse_sentiments_lip:
                            inv_sentiment = True
                            break
                else:  # case 1: look forwards for sentiment
                    case = 1
                    for d in range(5):
                        if d < 2:
                            if (0 <= i - 1 - d < len(token)) and token[i - 1 - d] in inverse_sentiments_lip:
                                inv_sentiment = True
                        if d < 4:
                            if (0 <= i + 1 + d < len(token)) and token[i + 1 + d] in inverse_sentiments_lip:
                                inv_sentiment = True
                        if (0 <= i + 1 + d < len(token)) and token[i + 1 + d] in positive_sentiments_lip:
                            pos_sentiment = True
                            break
                        elif (0 <= i + 1 + d < len(token)) and token[i + 1 + d] in negative_sentiments_lip:
                            neg_sentiment = True
                            break
                # score calculation: an inverse (negation) word flips the polarity
                if not inv_sentiment:
                    if token[i] == "สี":
                        color += int(pos_sentiment) - int(neg_sentiment)
                    elif token[i] == "กลิ่น":
                        smell += int(pos_sentiment) - int(neg_sentiment)
                    elif token[i] == "ติด":
                        durable += int(pos_sentiment) - int(neg_sentiment)
                else:
                    if token[i] == "สี":
                        color += -int(pos_sentiment) + int(neg_sentiment)
                    elif token[i] == "กลิ่น":
                        smell += -int(pos_sentiment) + int(neg_sentiment)
                    elif token[i] == "ติด":
                        durable += -int(pos_sentiment) + int(neg_sentiment)
                report(token, row[0], token[i], case, pos_sentiment,
                       neg_sentiment, inv_sentiment, debugMode)
            else:  # case 3: not a feature word
                pass
    except TypeError:  # assumed guard; the original except clause is cut off
        pass

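# The two pattern_* functions above share one technique: find a feature word,
# then scan a small window of neighbouring tokens for positive, negative and
# inverse (negation) words, flipping the score when a negation is present.
# Below is a minimal, self-contained sketch of that idea; the function name,
# word sets and window size are illustrative stand-ins, not the project's
# real lexicons or offsets.
def score_feature(tokens, feature, positive, negative, inverse, window=3):
    score = 0
    for i, tok in enumerate(tokens):
        if tok != feature:
            continue
        # tokens within `window` positions of the feature, excluding itself
        lo, hi = max(0, i - window), min(len(tokens), i + window + 1)
        context = tokens[lo:i] + tokens[i + 1:hi]
        polarity = int(any(w in positive for w in context)) - int(any(w in negative for w in context))
        if any(w in inverse for w in context):
            polarity = -polarity  # negation flips the sentiment
        score += polarity
    return score

# hypothetical lexicons for demonstration only
print(score_feature(["กลิ่น", "ไม่", "หอม"], "กลิ่น",
                    positive={"หอม"}, negative={"เหม็น"}, inverse={"ไม่"}))  # -1
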
#! -*- coding: UTF8 -*-
from wordcut import Wordcut

if __name__ == '__main__':
    with open('bigthai.txt') as dict_file:
        word_list = [w.rstrip() for w in dict_file.readlines()]
        word_list.sort()
        wordcut = Wordcut(word_list)
        print(wordcut.tokenize("กากา cat หมา"))

from pythainlp.tokenize import word_tokenize

data = getdata()
'''cut="\n".join(["|".join(word_tokenize(i))+"|" for i in data])
save(cut,"p1")
from testcut import cutok as cut1
from testcut2 import cutok as cut2
from testcut3 import cutok as cut3
from testcut4 import cutok as cut4
cut="\n".join([cut1(i)+"|" for i in data])
save(cut,"p2")
cut="\n".join([cut2(i)+"|" for i in data])
save(cut,"p3")
cut="\n".join([cut3(i)+"|" for i in data])
save(cut,"p4")
cut="\n".join([cut4(i)+"|" for i in data])
save(cut,"p5")
cut="\n".join(["|".join(word_tokenize(i,engine="ulmfit"))+"|" for i in data])
save(cut,"p6")
cut="\n".join(["|".join(word_tokenize(i,engine="longest"))+"|" for i in data])
save(cut,"p7")
cut="\n".join(["|".join(word_tokenize(i,engine="mm"))+"|" for i in data])
save(cut,"p8")
cut="\n".join(["|".join(word_tokenize(i,engine="icu"))+"|" for i in data])
save(cut,"p9")'''
from wordcut import Wordcut

wordcut = Wordcut.bigthai()
cut = "\n".join(["|".join(wordcut.tokenize(i)) + "|" for i in data])
save(cut, "p11")
cut = "\n".join(["|".join(word_tokenize(i, engine="deepcut")) + "|" for i in data])
save(cut, "p10")

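# getdata() and save() are not defined in the excerpt above; a plausible
# minimal pair, assumed purely so the comparison script is self-contained
# (the filenames here are illustrative):
def getdata():
    with open('input.txt', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def save(text, name):
    with open(name + '.txt', 'w', encoding='utf-8') as f:
        f.write(text)
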
def default_segment(cls, inp):
    '''Segment an input with the default model (bigthai).'''
    tokens = wordcut.bigthai().tokenize(inp)
    tokens = clean(tokens)
    return ' '.join(tokens)

def get_default_model(cls):
    '''Get the default tokeniser model (bigthai).'''
    return wordcut.bigthai()

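# A minimal sketch of how the two classmethods above might sit together. The
# Tokeniser class name and the clean() helper (shown here as dropping empty
# tokens) are assumptions for illustration; `wordcut` is assumed to be bound
# to the Wordcut class, matching the wordcut.bigthai() calls above.
from wordcut import Wordcut as wordcut


def clean(tokens):
    return [t for t in tokens if t.strip()]


class Tokeniser:
    @classmethod
    def get_default_model(cls):
        '''Get the default tokeniser model (bigthai).'''
        return wordcut.bigthai()

    @classmethod
    def default_segment(cls, inp):
        '''Segment an input with the default model (bigthai).'''
        tokens = cls.get_default_model().tokenize(inp)
        return ' '.join(clean(tokens))


print(Tokeniser.default_segment("กากา cat หมา"))
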
#! -*- coding: UTF8 -*-
from wordcut import Wordcut

if __name__ == '__main__':
    with open('bigthai.txt') as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        word_list.sort()
        wordcut = Wordcut(word_list)
        print(wordcut.tokenize("กากา cat หมา"))

from wordcut import Wordcut
import pickle

with open('./final_process/text_no_space.pickle', 'rb') as file:
    object_file = pickle.load(file)
with open('./final_process/dict.txt', encoding='UTF-8') as dict_file:
    word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
    wordcut = Wordcut(word_list)

freq_words = {}
for text in object_file:
    words = wordcut.tokenize(text)
    for word in words:
        if len(word) > 1:  # skip single-character tokens
            if word in freq_words:
                freq_words[word] += 1
            else:
                freq_words[word] = 1
print(len(freq_words))

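# The frequency loop above is the hand-rolled form of collections.Counter; a
# sketch of the equivalent, assuming the same wordcut tokeniser and corpus:
from collections import Counter

freq_words = Counter(
    word
    for text in object_file
    for word in wordcut.tokenize(text)
    if len(word) > 1
)
print(freq_words.most_common(10))  # ten most frequent multi-character tokens
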
import csv

from wordcut import Wordcut

input_file = open('negative.txt', 'r')
csv_file = open('negative.csv', 'w', newline='')
writer = csv.writer(csv_file, dialect='excel', quoting=csv.QUOTE_ALL)
with open('bigthai.txt') as dict_file:
    word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
    word_list.sort()
    wordcut = Wordcut(word_list)
for line in input_file:
    line = line.strip()
    space_count = line.count(' ')
    l = len(line)
    # if spaces are unusually dense relative to the line length, treat them
    # as noise and strip them before tokenising
    if (space_count * 2.8) > l:
        line = line.replace(' ', '')
    writer.writerow(wordcut.tokenize(line))
input_file.close()
csv_file.close()

def setUp(self):
    self.wordcut = Wordcut.bigthai()
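
# A minimal sketch of the test fixture above in context; the TestCase name
# and the assertion are illustrative, not from the source.
import unittest

from wordcut import Wordcut


class WordcutTest(unittest.TestCase):
    def setUp(self):
        self.wordcut = Wordcut.bigthai()

    def test_tokenize_returns_tokens(self):
        tokens = self.wordcut.tokenize("กากา cat หมา")
        self.assertTrue(len(tokens) > 0)


if __name__ == '__main__':
    unittest.main()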