Esempio n. 1
0
def segment(text, data=None):
    """Tokenize *text* with a Wordcut tokenizer.

    When *data* is truthy, a ``Wordcut`` is built from its de-duplicated
    entries; otherwise the tokenizer returned by ``Wordcut.bigthai()`` is
    used. The result of ``tokenize(text)`` is returned unchanged.
    """
    if data:
        # Custom dictionary: drop duplicate entries before building.
        tokenizer = Wordcut(list(set(data)))
    else:
        tokenizer = Wordcut.bigthai()
    return tokenizer.tokenize(text)
Esempio n. 2
0
def segment(text, data=None):
    """Return ``tokenize(text)`` from a Wordcut tokenizer.

    A falsy *data* (``None``, empty container, ...) selects
    ``Wordcut.bigthai()``; any other value is de-duplicated with ``set`` and
    used as the word list for a new ``Wordcut``.
    """
    cutter = Wordcut(list(set(data))) if data else Wordcut.bigthai()
    return cutter.tokenize(text)
Esempio n. 3
0
def segment(text, data=""):
    """Tokenize *text* with a Wordcut tokenizer.

    Args:
        text: The string handed to ``tokenize``.
        data: Optional iterable of dictionary words. Any falsy value (the
            default ``""``, ``None``, an empty list, ...) selects the
            tokenizer returned by ``Wordcut.bigthai()``.

    Returns:
        Whatever ``Wordcut.tokenize`` returns for *text*.
    """
    # Test truthiness rather than ``data == ""``: comparing against the empty
    # string let ``None`` fall through to ``set(None)`` (TypeError) and turned
    # an empty list into a tokenizer with an empty dictionary.
    if not data:
        wordcut = Wordcut.bigthai()
    else:
        # Drop duplicate entries before building the custom tokenizer.
        wordcut = Wordcut(list(set(data)))
    return wordcut.tokenize(text)
Esempio n. 4
0
def main():
    """Count token bigrams in a range of CSV rows and print them by frequency.

    Reads rows from ``./data/comments-removing-redundant.csv`` (via
    ``readFile``), keeps those whose first column, parsed as an integer, lies
    in 921..2173 inclusive, tokenizes the fourth column with spaces removed,
    and tallies the 2-grams produced by ``ngrams``. The (bigram, count) pairs
    are printed sorted by count, highest first.
    """
    f = readFile('./data/comments-removing-redundant.csv')
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        # One dictionary word per line; the set removes duplicates.
        vocabulary = {line.rstrip() for line in dict_file}
        wordcut = Wordcut(list(vocabulary))
        bigram_counts = {}
        for row in csv.reader(f, delimiter=','):
            if not 921 <= int(row[0]) <= 2173:
                continue
            tokens = wordcut.tokenize(row[3].replace(' ', ''))
            bigrams = ngrams(tokens, 2)
            try:
                for pair in bigrams:
                    bigram_counts[pair] = bigram_counts.get(pair, 0) + 1
            except TypeError:
                # Best effort: a row whose bigrams cannot be iterated/counted
                # is skipped (counts gathered before the error are kept).
                pass
        ranked = sorted(bigram_counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        print(ranked)
Esempio n. 5
0
def segment(text):
    """Tokenize *text* with the tokenizer returned by ``Wordcut.bigthai()``."""
    return Wordcut.bigthai().tokenize(text)
Esempio n. 6
0
def warpcut(text):
    """Tokenize *text* with a ``Wordcut`` built from the outer ``word_list``.

    ``word_list`` is not defined in this function; it is read from the
    enclosing scope each time the function is called.
    """
    return Wordcut(word_list).tokenize(text)
Esempio n. 7
0
def test():
    """Build a tokenizer from ``bigthai.txt`` and print one tokenized sample.

    The dictionary file is read as UTF-8, one word per line, with trailing
    whitespace stripped and duplicates removed.
    """
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        vocabulary = {line.rstrip() for line in dict_file}
        tokenizer = Wordcut(list(vocabulary))
        sample_tokens = tokenizer.tokenize("ไม่ค่อยชอบกลิ่นเลยค่ะ")
        print(sample_tokens)
Esempio n. 8
0
def pattern_skinProtection(row, f, debugMode):
    sticky = 0
    permeate = 0
    stain = 0
    smell = 0
    moist = 0
    irritate = 0
    waterproof = 0
    sunproof = 0
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
        comment = row[3].replace('ๆ', '').split(' ')
        for part in comment:
            token = wordcut.tokenize(part)
            try:
                for i in range(len(token)):
                    pos_sentiment = False
                    neg_sentiment = False
                    inv_sentiment = False
                    check_case_two = False
                    case = 0
                    if token[i] in features_skin_protection:  # 1,2
                        for checkcase in range(4):  #0-3
                            if (i - checkcase >= 0
                                    and i - checkcase < len(token)
                                ) and (token[i - checkcase]
                                       in positive_sentiments_skin_protection
                                       or token[i - checkcase]
                                       in negative_sentiments_skin_protection):
                                check_case_two = True
                        if (check_case_two):  # type 2
                            case = 2
                            for b in range(5):
                                if i - b >= 0 and token[
                                        i -
                                        b] in positive_sentiments_skin_protection:
                                    pos_sentiment = True
                                elif i - b >= 0 and token[
                                        i -
                                        b] in negative_sentiments_skin_protection:
                                    neg_sentiment = True
                                elif i - b >= 0 and token[
                                        i -
                                        b] in inverse_sentiments_skin_protection:
                                    inv_sentiment = True
                                    break
                        else:  #1
                            case = 1
                            for d in range(5):
                                if d < 2:
                                    if (i - d >= 0 and i - d < len(token)
                                        ) and token[
                                            i -
                                            d] in inverse_sentiments_skin_protection:
                                        inv_sentiment = True
                                if d < 4:
                                    if (i + d >= 0 and i + 1 + d < len(token)
                                        ) and token[
                                            i +
                                            d] in inverse_sentiments_skin_protection:
                                        inv_sentiment = True
                                if d < 6:
                                    if (i + d >= 0 and i + d < len(token)
                                        ) and token[
                                            i +
                                            d] in positive_sentiments_skin_protection:
                                        pos_sentiment = True
                                        break
                                    elif (
                                            i + d >= 0 and i + d < len(token)
                                    ) and token[
                                            i +
                                            d] in negative_sentiments_skin_protection:
                                        neg_sentiment = True
                                        break

                        # score calculation
                        # 'คราบ':0,'กลิ่น':0,'หอม':0,'ชุ่มชื่น':0,'ชุ่ม':0,'ระคายเคือง':0,'กันน้ำ':0,'กันแดด':0}
                        if inv_sentiment == False:
                            if token[i] in {
                                    'เหนียวเหนอะหนะ', 'เหนอะหนะ', 'เหนียว',
                                    'เหนอะ', 'หนืด'
                            }:
                                sticky += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'ซึม'}:
                                permeate += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'คราบ'}:
                                stain += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'กลิ่น', 'หอม'}:
                                smell += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'ชุ่มชื่น', 'ชุ่ม'}:
                                moist += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'ระคายเคือง'}:
                                irritate += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'กันน้ำ'}:
                                waterproof += int(pos_sentiment) - int(
                                    neg_sentiment)
                            elif token[i] in {'กันแดด', 'แสงแดด'}:
                                sunproof += int(pos_sentiment) - int(
                                    neg_sentiment)
                        else:
                            if token[i] in {
                                    'เหนียวเหนอะหนะ', 'เหนอะหนะ', 'เหนียว',
                                    'เหนอะ', 'หนืด'
                            }:
                                sticky += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'ซึม'}:
                                permeate += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'คราบ'}:
                                stain += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'กลิ่น'}:
                                smell += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'ชุ่มชื่น', 'ชุ่ม'}:
                                moist += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'ระคายเคือง'}:
                                irritate += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'กันน้ำ'}:
                                waterproof += -int(pos_sentiment) + int(
                                    neg_sentiment)
                            elif token[i] in {'กันแดด', 'แสงแดด'}:
                                sunproof += -int(pos_sentiment) + int(
                                    neg_sentiment)

                        report(token, row[0], token[i], case, pos_sentiment,
                               neg_sentiment, inv_sentiment, debugMode)

                    else:  #3
                        pass
Esempio n. 9
0
def pattern_lipstick(row, f, debugMode):
    color = 0
    smell = 0
    durable = 0
    with open('bigthai.txt', encoding="UTF-8") as dict_file:
        word_list = list(set([w.rstrip() for w in dict_file.readlines()]))
        wordcut = Wordcut(word_list)
        comment = row[3].replace('ๆ', '')
        token = wordcut.tokenize(comment)
        try:
            for i in range(len(token)):
                pos_sentiment = False
                neg_sentiment = False
                inv_sentiment = False
                check_case_two = False
                case = 0
                if token[i] in features_lip:  # 1,2
                    for a in range(2):
                        check_case_two = (i - a >= 0) and (
                            token[i - a] in positive_sentiments_lip
                            or token[i - a] in negative_sentiments_lip)
                    if check_case_two:  # 2
                        case = 2
                        for b in range(4):
                            if b < 3 and i - b >= 0 and token[
                                    i - b] in positive_sentiments_lip:
                                pos_sentiment = True
                            if b < 3 and i - b >= 0 and token[
                                    i - b] in negative_sentiments_lip:
                                neg_sentiment = True
                            if i - b >= 0 and token[
                                    i - b] in inverse_sentiments_lip:
                                inv_sentiment = True
                                break
                    else:  #1
                        case = 1
                        for d in range(5):
                            if d < 2:
                                if (i - 1 - d >= 0 and i - 1 - d < len(token)
                                    ) and token[i - 1 -
                                                d] in inverse_sentiments_lip:
                                    inv_sentiment = True
                            if d < 4:
                                if (i + 1 + d >= 0 and i + 1 + d < len(token)
                                    ) and token[i + 1 +
                                                d] in inverse_sentiments_lip:
                                    inv_sentiment = True
                            if d < 5:
                                if (i + 1 + d >= 0 and i + 1 + d < len(token)
                                    ) and token[i + 1 +
                                                d] in positive_sentiments_lip:
                                    pos_sentiment = True
                                    break
                                elif (i + 1 + d >= 0
                                      and i + 1 + d < len(token)) and token[
                                          i + 1 +
                                          d] in negative_sentiments_lip:
                                    neg_sentiment = True
                                    break

                    # score calculation
                    if inv_sentiment == False:
                        if token[i] == "สี":
                            color += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] == "กลิ่น":
                            smell += int(pos_sentiment) - int(neg_sentiment)
                        elif token[i] == "ติด":
                            durable += int(pos_sentiment) - int(neg_sentiment)
                    else:
                        if token[i] == "สี":
                            color += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] == "กลิ่น":
                            smell += -int(pos_sentiment) + int(neg_sentiment)
                        elif token[i] == "ติด":
                            durable += -int(pos_sentiment) + int(neg_sentiment)
                    report(token, row[0], token[i], case, pos_sentiment,
                           neg_sentiment, inv_sentiment, debugMode)

                else:  #3
Esempio n. 10
0
#! -*- coding: UTF8 -*-
from wordcut import Wordcut
if __name__ == '__main__':
    # The dictionary contains Thai words, so decode it explicitly as UTF-8
    # instead of relying on the platform default encoding (which is not
    # UTF-8 on every system and would garble or reject the file).
    with open('bigthai.txt', encoding='UTF-8') as dict_file:
        # One word per line; strip the trailing newline and sort the list.
        word_list = sorted(w.rstrip() for w in dict_file)
    # The file is only needed while reading; tokenize after it is closed.
    wordcut = Wordcut(word_list)
    print(wordcut.tokenize("กากา cat หมา"))
Esempio n. 11
0
from pythainlp.tokenize import word_tokenize
# Load the input items. getdata() is defined outside this snippet; the code
# further down iterates over its result and tokenizes each item.
data = getdata()
# The triple-quoted string below is a bare expression statement, so nothing
# inside it is executed. It holds disabled runs that would have saved
# outputs "p1" through "p9" using pythainlp's word_tokenize with several
# engines and the cutok functions from the testcut* modules.
'''cut="\n".join(["|".join(word_tokenize(i))+"|" for i in data])
save(cut,"p1")
from testcut import cutok as cut1
from testcut2 import cutok as cut2
from testcut3 import cutok as cut3
from testcut4 import cutok as cut4
cut="\n".join([cut1(i)+"|" for i in data])
save(cut,"p2")
cut="\n".join([cut2(i)+"|" for i in data])
save(cut,"p3")
cut="\n".join([cut3(i)+"|" for i in data])
save(cut,"p4")
cut="\n".join([cut4(i)+"|" for i in data])
save(cut,"p5")
cut="\n".join(["|".join(word_tokenize(i,engine="ulmfit"))+"|" for i in data])
save(cut,"p6")
cut="\n".join(["|".join(word_tokenize(i,engine="longest"))+"|" for i in data])
save(cut,"p7")
cut="\n".join(["|".join(word_tokenize(i,engine="mm"))+"|" for i in data])
save(cut,"p8")
cut="\n".join(["|".join(word_tokenize(i,engine="icu"))+"|" for i in data])
save(cut,"p9")'''
from wordcut import Wordcut
# Tokenize every item with the Wordcut.bigthai() tokenizer. Each item becomes
# one output line whose tokens are each followed by "|"; saved under "p11".
wordcut = Wordcut.bigthai()
cut = "\n".join(
    "|".join(wordcut.tokenize(entry)) + "|" for entry in data)
save(cut, "p11")

# Same output layout, tokenized with pythainlp's "deepcut" engine; saved
# under "p10".
cut = "\n".join(
    "|".join(word_tokenize(entry, engine="deepcut")) + "|" for entry in data)
save(cut, "p10")
Esempio n. 12
0
 def default_segment(cls, inp):
     """Segment *inp* with the default model (bigthai).

     The input is tokenized by ``wordcut.bigthai()``, the tokens are passed
     through ``clean``, and the result is joined with single spaces.
     """
     raw_tokens = wordcut.bigthai().tokenize(inp)
     return ' '.join(clean(raw_tokens))
Esempio n. 13
0
 def get_default_model(cls):
     """Return the default tokeniser model, as built by ``wordcut.bigthai()``."""
     default_model = wordcut.bigthai()
     return default_model
Esempio n. 14
0
#! -*- coding: UTF8 -*-
from wordcut import Wordcut
if __name__ == '__main__':
    # The dictionary contains Thai words, so decode it explicitly as UTF-8
    # instead of relying on the platform default encoding (which is not
    # UTF-8 on every system and would garble or reject the file).
    with open('bigthai.txt', encoding='UTF-8') as dict_file:
        # One word per line; the set drops duplicates, sorted() orders them.
        word_list = sorted({w.rstrip() for w in dict_file})
    # The file is only needed while reading; tokenize after it is closed.
    wordcut = Wordcut(word_list)
    print(wordcut.tokenize("กากา cat หมา"))
Esempio n. 15
0
from wordcut import Wordcut
import pickle

# Load the texts to analyse. A context manager guarantees the handle is
# closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# run this against a pickle file from a trusted source.
with open('./final_process/text_no_space.pickle', 'rb') as file:
    object_file = pickle.load(file)

# Build the tokenizer from the project dictionary: one word per line,
# trailing whitespace stripped, duplicates removed.
with open('./final_process/dict.txt', encoding='UTF-8') as dict_file:
    word_list = list({w.rstrip() for w in dict_file})
wordcut = Wordcut(word_list)

# Frequency of every token longer than one character across all texts.
freq_words = {}

for text in object_file:
    for word in wordcut.tokenize(text):
        if len(word) > 1:
            freq_words[word] = freq_words.get(word, 0) + 1

# Number of distinct tokens counted.
print(len(freq_words))
Esempio n. 16
0
import csv
from wordcut import Wordcut

# Build the tokenizer first, so a missing dictionary fails before the output
# file is created or truncated. The dictionary holds Thai words and is decoded
# explicitly as UTF-8 rather than with the platform default encoding.
with open('bigthai.txt', encoding='UTF-8') as dict_file:
    word_list = sorted({w.rstrip() for w in dict_file})
wordcut = Wordcut(word_list)

# Context managers close both files even if tokenizing or writing raises.
# NOTE(review): negative.txt / negative.csv still use the platform default
# encoding, as before — confirm the intended encoding and set it explicitly.
with open('negative.txt', 'r') as input_file, \
        open('negative.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, dialect='excel', quoting=csv.QUOTE_ALL)

    for line in input_file:
        line = line.strip()
        space_count = line.count(' ')
        # If spaces exceed 1/2.8 of the line's characters, remove them all
        # before tokenizing (presumably text typed with a space between
        # every character or syllable — TODO confirm the 2.8 threshold).
        if (space_count * 2.8) > len(line):
            line = line.replace(' ', '')
        # One CSV row per input line, one quoted cell per token.
        writer.writerow(wordcut.tokenize(line))
Esempio n. 17
0
 def setUp(self):
     """Store the tokenizer returned by ``Wordcut.bigthai()`` on ``self.wordcut``."""
     tokenizer = Wordcut.bigthai()
     self.wordcut = tokenizer