def compute_similarity_sentence(self, dst, src):
    """Score how well *dst* covers *src*: longest common substring / len(src).

    Both strings are lower-cased, stripped, and accent-stripped before
    comparison. Returns a float in [0, 1]; 0 when nothing matches.
    """
    dst, src = dst.lower().strip(), src.lower().strip()
    # ViUtils.remove_accents returns bytes; decode instead of str() so the
    # textual "b'...'" wrapper does not leak into the comparison and inflate
    # both the match and len(src).
    dst = ViUtils.remove_accents(dst).decode('utf-8')
    src = ViUtils.remove_accents(src).decode('utf-8')
    seq_match = SequenceMatcher(None, src, dst)
    match = seq_match.find_longest_match(0, len(src), 0, len(dst))
    return 0 if match.size == 0 else match.size / len(src)
def remove_accents(self):
    """Strip Vietnamese accents from every word of ``self.text``.

    Words joined with ``_`` (pyvi compound tokens) are accent-stripped
    part by part and re-joined with ``_``. Returns bytes, since
    ``ViUtils.remove_accents`` yields bytes.
    """
    # Fixed: the original shadowed the builtins `list` and `set` as locals.
    stripped_words = []
    for word in self.text.split(" "):
        if '_' in word:
            parts = [ViUtils.remove_accents(sub) for sub in word.split('_')]
            stripped_words.append(b'_'.join(parts))
        else:
            stripped_words.append(ViUtils.remove_accents(word))
    return b" ".join(stripped_words)
def replaceWordRemoveAccents(sentences, listS, sReplace):
    """Replace every stop-phrase found in *sentences* with *sReplace*.

    listS[i] holds the sorted, accent-free phrases of i+1 words; longer
    phrases are tried first so multi-word matches win over their parts.
    Punctuation ('.', ',', '?') is dropped before matching.
    """
    cleaned = sentences
    for ch in ['.', ',', '?']:
        cleaned = cleaned.replace(ch, '')
    listWords = cleaned.split()
    for i in range(MAX_LENGTH, 0, -1):
        listA = listS[i - 1]
        if not listA:
            continue
        for j in range(len(listWords) - i + 1):
            # Candidate i-word window starting at j (slice + join replaces
            # the original quadratic manual concatenation).
            phrase = ' '.join(listWords[j:j + i])
            phrase = ViUtils.remove_accents(phrase).decode('utf8')
            x = bisect_left(listA, phrase)
            if x != len(listA) and listA[x] == phrase:
                # Mark the match: first slot gets the replacement, the rest
                # are blanked and filtered out below.
                listWords[j] = sReplace
                for k in range(j + 1, j + i):
                    listWords[k] = ''
    # Rebuild in one pass (the original built the result with repeated
    # string concatenation and a dead `is not None` guard).
    return ' '.join(w for w in listWords if w)
def got_khongdau(self, s):
    """Return *s* accent-free and lower-cased, with dots removed and
    spaces turned into commas (any leading comma dropped)."""
    plain = ViUtils.remove_accents(s).decode()
    plain = plain.replace(' ', ',').replace('.', '')
    plain = re.sub('^,', '', plain)
    return plain.lower()
def get_inf_hc(lines):
    """Extract passport ("ho chieu") fields from OCR output.

    *lines* is a dict keyed by stringified line number, each value a list
    of word tokens. Returns a dict with keys 'so' (passport number),
    'ht' (full name), 'qt' (nationality), 'ns' (date of birth),
    'nq' (place of birth) and 'gt' (sex).
    NOTE(review): field meanings inferred from the Vietnamese
    abbreviations — confirm against callers. On any OCR-layout mismatch
    the broad except prints and a partially-filled dict is returned.
    """
    try:
        data = {}
        number_line = 2
        # Passport-number line is recognised by a 'vn'/'vnm'/'vm' token.
        if 'vn' in lines[str(number_line)][0] or 'vnm' in lines[str(number_line)][0] or 'vm' in lines[str(number_line)][0]:
            data['so'] = toString(lines[str(number_line)][1:])
            number_line += 1
        # Skip the "Ho va ten / Full name" label line when present.
        if 'ho' in ViUtils.remove_accents(lines[str(number_line)][0]).decode('utf-8'):
            number_line += 1
        data['ht'] = toString(lines[str(number_line)], ' ').upper()
        number_line += 1
        if 'national' in lines[str(number_line)][2]:
            data['qt'] = toString(lines[str(number_line)][3:5], ' ')
            number_line += 1
        # Date of birth: concatenate every digit found on the line.
        data['ns'] = ''
        for item in lines[str(number_line)]:
            data['ns'] += toString(re.findall("\d", item))
        if data['ns'] == '':
            # Fallback: date and birthplace share the next line; locate the
            # 4-char (year) token to split them.
            number_line += 1
            if len(lines[str(number_line)][2]) == 4:
                data['ns'] = toString(lines[str(number_line)][:3])
                data['nq'] = toString(lines[str(number_line)][3:], ' ')
            elif len(lines[str(number_line)][3]) == 4:
                data['ns'] = toString(lines[str(number_line)][:4])
                data['nq'] = toString(lines[str(number_line)][4:], ' ')
            elif len(lines[str(number_line)][4]) == 4:
                data['ns'] = toString(lines[str(number_line)][:5])
                data['nq'] = toString(lines[str(number_line)][5:], ' ')
            # Drop separators/letters that leaked into the date.
            data['ns'] = re.sub('[/a-zA-Z]', '', data['ns'])
        number_line += 2
        # Sex ("gioi tinh") is the first token two lines further down.
        data['gt'] = lines[str(number_line)][0].capitalize()
    except BaseException as e:
        print(e)
    return data
def got_message(message_chatbot):
    """Strip all diacritics from the chatbot message with pyvi, clean it,
    and return it lower-cased (None on failure, which is logged)."""
    try:
        stripped = ViUtils.remove_accents(message_chatbot).decode()
        return clean_message(stripped).lower()
    except Exception as e:
        logging.error('<file <handledata.py>> function<got_message>:' + str(e))
def fileToListRemoveAccents(fileName):
    """Load accent-free stop phrases from *fileName*, bucketed by word count.

    Returns listS where listS[i] is the sorted list of phrases of i+1
    words. Reading stops at the first empty line (readline() returns ''
    at EOF, never None, so that also terminates the loop).
    """
    listS = [[] for _ in range(MAX_LENGTH)]
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(fileName, encoding='utf8') as fileStopWord:
        while True:
            s = fileStopWord.readline().replace('\n', '').lower()
            s = ViUtils.remove_accents(s).decode('utf8')
            if s == '':
                break
            listS[count_word(s) - 1].append(s)
    for bucket in listS:
        bucket.sort()
    return listS
def remove_stop_words(line, stop_words, stop_words_no_accent):
    """Return *line* with stop words removed.

    Strategy per word: binary-search the accent-free form in
    stop_words_no_accent; on a hit, confirm the accented word itself
    appears in a small window of stop_words around the found index
    (several accented words can share one accent-free form) before
    dropping it. Kept words are re-joined with trailing spaces.
    NOTE(review): assumes stop_words and stop_words_no_accent are sorted
    in lockstep — confirm at the call site.
    """
    words = line.split()
    # `mid` deliberately survives the loop: the membership test below
    # reuses the last probe position from the binary search.
    mid = 0
    line = ""
    for count, word in enumerate(words):
        word_no_accent = ViUtils.remove_accents(u"" + word)
        left = 0
        right = len(stop_words) - 1
        # Hand-rolled binary search over the accent-free list.
        while (int((right - left) / 2) > 0):
            mid = int((right + left) / 2)
            if (word_no_accent > stop_words_no_accent[mid]):
                left = mid
            elif (word_no_accent < stop_words_no_accent[mid]):
                right = mid
            elif ((word_no_accent == stop_words_no_accent[mid])):
                break
        if (word_no_accent != stop_words_no_accent[mid]):
            # Not a stop word: keep it.
            line = line + word + " "
        else:
            # Accent-free match — verify the accented word in a window
            # around mid; window bounds depend on how close mid is to the
            # ends of the list.
            if (mid >= 3 and mid <= (len(stop_words) - 4)):
                check = 0
                for i in range(mid - 3, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid <= 3):
                check = 0
                for i in range(0, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid >= (len(stop_words) - 4)):
                check = 0
                for i in range(mid - 5, mid):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
    return line
def remove_stop_words(line):
    """Return *line* with stop words removed (module-global lists).

    Same algorithm as the parameterised variant: binary-search each
    word's accent-free form in the global stop_words_no_accent, then
    confirm the accented word in a small window of the global stop_words
    around the probe index before dropping it.
    Original author's note: "cumbersome, I know — but it works".
    """
    words = line.split()
    # `mid` survives the loop on purpose: the test after the search uses
    # the last probe position.
    mid = 0
    line = ""
    for count, word in enumerate(words):
        word_no_accent = ViUtils.remove_accents(u"" + word)
        left = 0
        right = len(stop_words) - 1
        # Hand-rolled binary search over the accent-free stop-word list.
        while (int((right - left) / 2) > 0):
            mid = int((right + left) / 2)
            if (word_no_accent > stop_words_no_accent[mid]):
                left = mid
            elif (word_no_accent < stop_words_no_accent[mid]):
                right = mid
            elif ((word_no_accent == stop_words_no_accent[mid])):
                break
        if (word_no_accent != stop_words_no_accent[mid]):
            # Not a stop word: keep it.
            line = line + word + " "
        else:
            # Accent-free hit — check the accented word in a window around
            # mid, clamped near the list ends.
            if (mid >= 3 and mid <= (len(stop_words) - 4)):
                check = 0
                for i in range(mid - 3, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid <= 3):
                check = 0
                for i in range(0, mid + 3):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
            elif (mid >= (len(stop_words) - 4)):
                check = 0
                for i in range(mid - 5, mid):
                    if (word == stop_words[i]):
                        check = 1
                        break
                if (check == 0):
                    line = line + word + " "
    return line
def underthesea_prc(text):
    """Run underthesea POS-tagging, NER, classification and sentiment over
    *text* and bundle them via underthesea_text_result.

    NER tuples are assumed to be (token, _, _, label) with labels like
    'B-PER' / 'I-LOC' / 'O' — TODO confirm against the underthesea
    version in use.
    """
    pos_tags = pos_tag(text)
    just_ner = ner(text)
    result = {}  # entity type -> list of entity strings
    s = ''
    key = ''
    for index, x in enumerate(just_ner):
        ner_label = str(x[3]).split('-')
        # Flush the pending entity on an 'O' token or at the last token.
        if ner_label[0] == 'O' or index == len(just_ner) - 1:
            if s != '':
                if key not in result:
                    result[key] = []
                    result[key].append(s)
                else:
                    result[key].append(s)
            s = ''
        else:
            # NOTE(review): `s` is overwritten, not accumulated, so a
            # multi-token entity keeps only its last token; and a final
            # token carrying an entity tag is flushed-over, never added.
            # Confirm whether this is intended.
            s = str(x[0])
            key = ner_label[1]
    # Render each entity type as "KEY: a, b, c".
    ner_text = []
    for key, value in result.items():
        a = ''
        a += key + ": "
        value_len = len(value)
        for index, x in enumerate(value):
            a += x
            if index != value_len - 1:
                a += ", "
        ner_text.append(a)
    # Classification label comes back underscore-tokenised and
    # accent-free; restore spaces and accents for display.
    classify_result = ViUtils.add_accents(
        (classify(text)[0]).replace('_', ' '))
    sentiment_result = sentiment(text)
    return underthesea_text_result(pos_tags, ner_text, classify_result,
                                   sentiment_result)
from pyvi import ViTokenizer, ViPosTagger, ViUtils

# pyvi demo: word segmentation, POS tagging, then accent removal and
# accent restoration.
headline = u"Trung Quốc tố Mỹ vu khống WHO để trốn trách nhiệm với COVID-19"
print(ViTokenizer.tokenize(headline))
print(ViPosTagger.postagging(ViTokenizer.tokenize(headline)))
print(ViUtils.remove_accents(u"Trường đại học bách khoa hà nội"))
print(ViUtils.add_accents(u'thu tuong yeu cau lam ro trach nhiem'))
def got_message(message_chatbot):
    """Accent-strip the chatbot message, turn spaces into commas
    (dropping a leading comma), and return it lower-cased."""
    plain = ViUtils.remove_accents(message_chatbot).decode()
    plain = plain.replace(' ', ',')
    plain = re.sub('^,', '', plain)
    return plain.lower()
from pyvi import ViTokenizer, ViPosTagger from pyvi import ViUtils stop_words = [] stop_words_file = "./data/VNmese-stopwords.txt" stop_words_no_accent = [] with open(stop_words_file, 'r') as f: line = f.readline() while line: line = ViTokenizer.tokenize(u""+line) stop_words.append(line) stop_words_no_accent.append(ViUtils.remove_accents(u""+line)) line = f.readline() def remove_stop_words(line): # Remove stopwords using binary search # So freaking cồng kềnh, I know # But screw it, lol words = line.split() mid = 0 line = "" for count,word in enumerate(words): word_no_accent = ViUtils.remove_accents(u""+word) left = 0 right = len(stop_words)-1 while(int((right-left)/2)>0): mid = int((right+left)/2) if(word_no_accent>stop_words_no_accent[mid]): left = mid elif(word_no_accent<stop_words_no_accent[mid]):
from pyvi import ViTokenizer, ViPosTagger, ViUtils

# pyvi smoke test — results are intentionally discarded; this only
# exercises tokenization, POS tagging, and accent removal/restoration.
# (A dead, commented-out orderInfo.json block was removed from here.)
ViTokenizer.tokenize(u"Trường đại học bách khoa hà nội")
ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
ViUtils.remove_accents(u"Trường đại học bách khoa hà nội")
ViUtils.add_accents(u'truong dai hoc bach khoa ha noi')
'https://github.com/trungtv/pyvi' from pyvi import ViTokenizer, ViPosTagger, ViUtils str = u'Trường Đại học Bách Khoa hà nội' a = ViTokenizer.tokenize(str) print((a)) print(ViPosTagger.postagging(ViTokenizer.tokenize(str))) print(ViUtils.remove_accents(str)) print(ViUtils.add_accents(u'truong dai hoc bach khoa ha noi')) '''POS TAGS: A - Adjective C - Coordinating conjunction E - Preposition I - Interjection L - Determiner M - Numeral N - Common noun Nc - Noun Classifier Ny - Noun abbreviation Np - Proper noun