def compute_similarity_sentence(self, dst, src):
    """Score how much of *src* is covered by its longest common
    substring with *dst*.

    Both strings are lower-cased, stripped and accent-stripped (via
    pyvi) before matching; the score is the longest-match length
    divided by the length of the normalized *src*, or 0 when there is
    no overlap.
    """
    # Normalize both sides identically; note str() over the bytes that
    # remove_accents returns reproduces the original "b'...'" form.
    dst_norm = str(ViUtils.remove_accents(dst.lower().strip()))
    src_norm = str(ViUtils.remove_accents(src.lower().strip()))
    matcher = SequenceMatcher(None, src_norm, dst_norm)
    best = matcher.find_longest_match(0, len(src_norm), 0, len(dst_norm))
    if best.size == 0:
        return 0
    return best.size / len(src_norm)
Example #2
0
 def remove_accents(self):
     """Return ``self.text`` with Vietnamese accents removed, as bytes.

     Words joined by underscores (e.g. tokenizer output like
     ``"Ha_Noi"``) are accent-stripped per sub-word so the underscores
     survive; all pieces are re-joined with single spaces.

     Fix: the original bound locals named ``list`` and ``set``,
     shadowing the builtins — renamed to ``stripped`` / ``parts``.
     """
     word_list = self.text.split(" ")
     stripped = []
     for word in word_list:
         if '_' in word:
             # Strip each underscore-joined sub-word separately, then
             # reassemble with the (bytes) underscore separator.
             parts = [ViUtils.remove_accents(sub) for sub in word.split('_')]
             stripped.append(b'_'.join(parts))
         else:
             stripped.append(ViUtils.remove_accents(word))
     return b" ".join(stripped)
def replaceWordRemoveAccents(sentences, listS, sReplace):
    """Replace accent-insensitive phrase matches in *sentences* with *sReplace*.

    ``listS`` is indexed by phrase length minus one; each bucket is a
    *sorted* list of accent-stripped phrases (searched with
    ``bisect_left``). Longer phrases are tried first so multi-word
    matches win over their sub-phrases. Matched spans collapse to a
    single ``sReplace`` token; punctuation ``. , ?`` is removed first.

    Fix: phrase assembly and the final sentence rebuild used quadratic
    ``+=`` concatenation — replaced with slicing and ``str.join``.
    """
    s2 = sentences
    for ch in ['.', ',', '?']:
        s2 = s2.replace(ch, '')
    listWords = s2.split()
    # Longest phrase length first.
    for i in range(MAX_LENGTH, 0, -1):
        listA = listS[i - 1]
        if not listA:
            continue
        for j in range(len(listWords) - i + 1):
            # Candidate i-word window starting at j (blanked slots from
            # earlier replacements join as empty strings, as before).
            phrase = ' '.join(listWords[j:j + i])
            phrase = ViUtils.remove_accents(phrase).decode('utf8')
            x = bisect_left(listA, phrase)
            if x != len(listA) and listA[x] == phrase:
                # Keep the replacement in the first slot, blank the rest.
                listWords[j] = sReplace
                for k in range(j + 1, j + i):
                    listWords[k] = ''

    # Drop blanked slots and rebuild the sentence.
    return ' '.join(word for word in listWords if word)
Example #4
0
 def got_khongdau(self, s):
     """Return *s* accent-stripped, lower-cased, with spaces turned into
     commas, periods removed, and a single leading comma dropped.
     """
     plain = ViUtils.remove_accents(s).decode()
     plain = plain.replace(' ', ',').replace('.', '')
     plain = re.sub('^,', '', plain)
     return plain.lower()
Example #5
0
def get_inf_hc(lines):
    """Extract passport fields from OCR'd text lines.

    ``lines`` is indexed by *stringified* line number; each entry looks
    like a list of tokens — TODO confirm against the OCR caller.
    Returns a dict with whichever of these keys could be parsed:
    'so' (passport number), 'ht' (full name), 'qt' (nationality),
    'ns' (date of birth), 'nq' (place of origin), 'gt' (gender).

    Fixes: raw string for the ``\\d`` regex (avoids the invalid-escape
    warning) and ``except Exception`` instead of ``BaseException`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        data = {}
        number_line = 2
        # Passport-number line carries a country code (vn / vnm / vm).
        if 'vn' in lines[str(number_line)][0] or 'vnm' in lines[str(number_line)][0] or 'vm' in lines[str(number_line)][0]:
            data['so'] = toString(lines[str(number_line)][1:])
        number_line += 1
        # Accent-stripped 'ho' ("họ" = surname) labels the name header;
        # the actual name sits on the following line.
        if 'ho' in ViUtils.remove_accents(lines[str(number_line)][0]).decode('utf-8'):
            number_line += 1
            data['ht'] = toString(lines[str(number_line)], ' ').upper()
        number_line += 1
        if 'national' in lines[str(number_line)][2]:
            data['qt'] = toString(lines[str(number_line)][3:5], ' ')
        number_line += 1
        # Date of birth: collect every digit on the line.
        data['ns'] = ''
        for item in lines[str(number_line)]:
            data['ns'] += toString(re.findall(r"\d", item))
        if data['ns'] == '':
            # Fallback: date and place of origin share the next line; the
            # 4-char token (the year) tells us where the date ends.
            number_line += 1
            if len(lines[str(number_line)][2]) == 4:
                data['ns'] = toString(lines[str(number_line)][:3])
                data['nq'] = toString(lines[str(number_line)][3:], ' ')
            elif len(lines[str(number_line)][3]) == 4:
                data['ns'] = toString(lines[str(number_line)][:4])
                data['nq'] = toString(lines[str(number_line)][4:],  ' ')
            elif len(lines[str(number_line)][4]) == 4:
                data['ns'] = toString(lines[str(number_line)][:5])
                data['nq'] = toString(lines[str(number_line)][5:],  ' ')
        # Keep only digits in the date field.
        data['ns'] = re.sub(r'[/a-zA-Z]', '', data['ns'])
        number_line += 2
        data['gt'] = lines[str(number_line)][0].capitalize()
    except Exception as e:
        # NOTE(review): best-effort parse — partial data is returned on
        # failure rather than raising.
        print(e)
    return data
Example #6
0
def got_message(message_chatbot):
    """Use the pyvi library to strip all accents from the sentence,
    clean it, and return it lower-cased (None on failure, logged).
    """
    try:
        decoded = ViUtils.remove_accents(message_chatbot).decode()
        cleaned = clean_message(decoded)
        return cleaned.lower()
    except Exception as e:
        logging.error('<file <handledata.py>> function<got_message>:' + str(e))
def fileToListRemoveAccents(fileName):
    """Load stop phrases from *fileName* into per-word-count buckets.

    Each line is lower-cased and accent-stripped; bucket ``i`` holds
    phrases of ``i + 1`` words, each bucket sorted (so callers can
    binary-search them). Reading stops at the first empty processed
    line — which is also how EOF is detected.

    Fix: the file handle was never closed — now opened with ``with``.
    """
    listS = [[] for _ in range(MAX_LENGTH)]
    with open(fileName, encoding='utf8') as fileStopWord:
        while True:
            s = fileStopWord.readline().replace('\n', '').lower()
            s = ViUtils.remove_accents(s).decode('utf8')
            # readline() returns '' at EOF (never None); a blank line in
            # the middle of the file also ends the loop, as before.
            if not s:
                break
            listS[count_word(s) - 1].append(s)

    for bucket in listS:
        bucket.sort()
    return listS
Example #8
0
def remove_stop_words(line, stop_words, stop_words_no_accent):
    """Return *line* with stop words removed.

    Each word is accent-stripped and looked up in the sorted
    ``stop_words_no_accent`` list with a hand-rolled binary search; on a
    stripped-form hit, a small neighborhood of the accented
    ``stop_words`` list is scanned for the exact word before dropping it
    (several accented words can share one stripped form).

    NOTE(review): pyvi's remove_accents presumably returns ``bytes``;
    the comparisons assume ``stop_words_no_accent`` holds the same type
    — confirm against the loader.
    """
    words = line.split()
    # mid persists across words: when the search loop body never runs
    # (very small list), the comparison below reuses the previous value.
    mid = 0
    line = ""
    for count, word in enumerate(words):
        word_no_accent = ViUtils.remove_accents(u"" + word)
        left = 0
        right = len(stop_words) - 1
        # Binary search; stops once the window shrinks below 2 entries,
        # so the final `mid` may be one off the true insertion point —
        # the neighborhood scan below compensates.
        while (int((right - left) / 2) > 0):
            mid = int((right + left) / 2)
            # print(word_no_accent,stop_words_no_accent[mid],(word_no_accent>stop_words_no_accent[mid]),left,right)
            if (word_no_accent > stop_words_no_accent[mid]):
                left = mid
            elif (word_no_accent < stop_words_no_accent[mid]):
                right = mid
            elif ((word_no_accent == stop_words_no_accent[mid])):
                break
        if (word_no_accent != stop_words_no_accent[mid]):
            # Not a stop word: keep it.
            # print(word,stop_words[mid])
            line = line + word + " "
        else:
            # Stripped-form hit: confirm the exact accented word nearby.
            if (mid >= 3 and mid <= (len(stop_words) - 4)):
                check = 0
                for i in range(mid - 3, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid <= 3):
                # Near the front of the list: clamp the window at 0.
                check = 0
                for i in range(0, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid >= (len(stop_words) - 4)):
                # Near the end of the list.
                check = 0
                for i in range(mid - 5, mid):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
        # else:
        #     print(word)
    return line
Example #9
0
def remove_stop_words(line):
    """Return *line* with stop words removed.

    Variant of the search that reads the module-level ``stop_words`` /
    ``stop_words_no_accent`` lists: accent-strip each word, binary-search
    the stripped list, then scan nearby accented entries for an exact
    match before dropping the word.
    """
    # Remove stopwords using binary search
    # So freaking clunky, I know
    # But screw it, lol
    words = line.split()
    # mid carries over between words when the search loop is skipped.
    mid = 0
    line = ""
    for count,word in enumerate(words):
        word_no_accent = ViUtils.remove_accents(u""+word)
        left = 0
        right = len(stop_words)-1
        # Hand-rolled binary search over the accent-stripped list; ends
        # when the window drops below 2 entries, so `mid` can be one off.
        while(int((right-left)/2)>0):
            mid = int((right+left)/2)
            if(word_no_accent>stop_words_no_accent[mid]):
                left = mid
            elif(word_no_accent<stop_words_no_accent[mid]):
                right = mid
            elif((word_no_accent==stop_words_no_accent[mid])):
                break
        if(word_no_accent != stop_words_no_accent[mid]):
            # Not a stop word: keep it.
            line = line+word+" "
        else:
            # Stripped-form hit: several accented words can share one
            # stripped form, so check nearby entries for the exact word.
            if(mid>=3 and mid<=(len(stop_words)-4)):
                check = 0
                for i in range(mid-3,mid+3):
                    if(word==stop_words[i]):
                        check = 1
                        break
                if(check == 0):
                    line = line+word+" "
            elif(mid <=3):
                # Near the front of the list: clamp the window at 0.
                check = 0
                for i in range(0,mid+3):
                    if(word==stop_words[i]):
                        check = 1
                        break
                if(check == 0):
                    line = line+word+" "
            elif(mid >= (len(stop_words)-4)):
                # Near the end of the list.
                check = 0
                for i in range(mid - 5,mid):
                    if(word==stop_words[i]):
                        check = 1
                        break
                if(check == 0):
                    line = line+word+" "
    return line
def underthesea_prc(text):
    """Run underthesea POS tagging, NER, classification and sentiment on
    *text* and bundle everything into an ``underthesea_text_result``.

    NER tokens are grouped by entity label and rendered as
    ``"LABEL: value1, value2"`` strings.

    NOTE(review): for multi-token entities only the most recent token is
    kept (``s`` is overwritten, not appended), and an entity ending on
    the very last token is flushed from the previous token's value —
    confirm this is intended before relying on it.
    """
    pos_tags = pos_tag(text)

    just_ner = ner(text)
    result = {}
    s = ''
    key = ''
    for index, x in enumerate(just_ner):
        # x[3] is the BIO tag, e.g. 'B-LOC' / 'I-PER' / 'O'.
        ner_label = str(x[3]).split('-')
        # Flush the pending entity on an 'O' tag or at the final token.
        if ner_label[0] == 'O' or index == len(just_ner) - 1:
            if s != '':
                # setdefault replaces the duplicated create/append branches.
                result.setdefault(key, []).append(s)
                s = ''
        else:
            s = str(x[0])
            key = ner_label[1]

    # Render each entity group; join replaces the manual comma loop.
    ner_text = [key + ": " + ", ".join(value) for key, value in result.items()]

    # classify() returns underscore-joined tokens; restore spaces, then
    # re-add Vietnamese accents.
    classify_result = ViUtils.add_accents(
        (classify(text)[0]).replace('_', ' '))

    sentiment_result = sentiment(text)

    return underthesea_text_result(pos_tags, ner_text, classify_result,
                                   sentiment_result)
Example #11
0
from pyvi import ViTokenizer, ViPosTagger, ViUtils

# Quick pyvi demo: tokenization, POS tagging, accent removal/restoration.
headline = u"Trung Quốc tố Mỹ vu khống WHO để trốn trách nhiệm với COVID-19"

print(ViTokenizer.tokenize(headline))

print(ViPosTagger.postagging(ViTokenizer.tokenize(headline)))

# Accent removal returns bytes; add_accents does the reverse mapping.
print(ViUtils.remove_accents(u"Trường đại học bách khoa hà nội"))

print(ViUtils.add_accents(u'thu tuong yeu cau lam ro trach nhiem'))
Example #12
0
def got_message(message_chatbot):
    """Accent-strip the chat message, turn spaces into commas (dropping
    a single leading comma), and return it lower-cased.
    """
    plain = ViUtils.remove_accents(message_chatbot).decode()
    plain = plain.replace(' ', ',')
    plain = re.sub('^,', '', plain)
    return plain.lower()
Example #13
0
from pyvi import ViTokenizer, ViPosTagger
from pyvi import ViUtils

# Load and tokenize the stop-word list once at import time, keeping a
# parallel accent-stripped list for accent-insensitive lookups.
stop_words = []
stop_words_file = "./data/VNmese-stopwords.txt"
stop_words_no_accent = []
# Fix: the file holds Vietnamese text; without an explicit encoding the
# platform default is used and may mis-decode — read it as UTF-8.
with open(stop_words_file, 'r', encoding='utf-8') as f:
    line = f.readline()
    while line:
        # NOTE(review): the trailing '\n' is not stripped before
        # tokenizing — confirm that is intended.
        line = ViTokenizer.tokenize(u""+line)
        stop_words.append(line)
        stop_words_no_accent.append(ViUtils.remove_accents(u""+line))
        line = f.readline()

def remove_stop_words(line):
    # Remove stopwords using binary search
    # So freaking cồng kềnh, I know
    # But screw it, lol
    words = line.split()
    mid = 0
    line = ""
    for count,word in enumerate(words):
        word_no_accent = ViUtils.remove_accents(u""+word)
        left = 0
        right = len(stop_words)-1
        while(int((right-left)/2)>0):
            mid = int((right+left)/2)
            if(word_no_accent>stop_words_no_accent[mid]):
                left = mid
            elif(word_no_accent<stop_words_no_accent[mid]):
# import json
#
# file = open('data/orderInfo.json', 'r', encoding='utf-8')
# data = json.load(file)
# new = {
#     "id": "",
#     "product": "",
#     "name": "",
#     "phone": "",
#     "address": ""
# }
# data.append(new)
# print(data)
# file = open('data/orderInfo.json', 'w', encoding='utf-8')
# json.dump(data, file, indent=2, ensure_ascii=False)
from pyvi import ViTokenizer, ViPosTagger, ViUtils

# Exercise the pyvi API; return values are intentionally discarded,
# exactly as in the original (nothing is printed).
sample = u"Trường đại học bách khoa hà nội"

ViTokenizer.tokenize(sample)

ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))

ViUtils.remove_accents(sample)

ViUtils.add_accents(u'truong dai hoc bach khoa ha noi')
Example #15
0
'https://github.com/trungtv/pyvi'



from pyvi import ViTokenizer, ViPosTagger, ViUtils

# Fix: the original bound this sentence to the name `str`, shadowing the
# builtin for the rest of the module — renamed to `sentence`.
sentence = u'Trường Đại học Bách Khoa hà nội'
tokenized = ViTokenizer.tokenize(sentence)
print(tokenized)

print(ViPosTagger.postagging(ViTokenizer.tokenize(sentence)))

print(ViUtils.remove_accents(sentence))

print(ViUtils.add_accents(u'truong dai hoc bach khoa ha noi'))




'''POS TAGS:

A - Adjective
C - Coordinating conjunction
E - Preposition
I - Interjection
L - Determiner
M - Numeral
N - Common noun
Nc - Noun Classifier
Ny - Noun abbreviation
Np - Proper noun