# Ordered (pattern, replacement) rules applied by cleanup_text().
# Each rule runs on the output of the previous one, so the order is part of
# the behaviour (e-mail addresses are replaced before the broader URL pattern
# sees them).  All patterns are raw strings so that "\w", "\d", ... reach the
# regex engine untouched instead of being parsed as string escapes.
_CLEANUP_RULES = (
    # "#word" at the start of the text or after a whitespace character; the
    # leading whitespace is consumed together with the hashtag.  The token
    # spelling "hastag" (sic) is emitted as-is and is kept unchanged here.
    (r'(?:^|\s)(\#\w+)', ' hastag'),
    # E-mail addresses.
    (r'[\w\.-]+@[\w\.-]+', 'email'),
    # URLs, with or without an http/https scheme.
    (r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', ' url'),
    # Any run of word characters that contains at least one digit.
    (r"\w*\d\w*", ' '),
    # Every remaining character that is neither a word character nor a plain
    # space (punctuation, tabs, newlines, ...).
    (r'[^\w ]', ' '),
)


def cleanup_text(sentences):
    """Return a cleaned copy of every sentence in *sentences*.

    Each sentence is passed through the substitutions in ``_CLEANUP_RULES``
    (hashtags -> ' hastag', e-mail addresses -> 'email', URLs -> ' url',
    digit-bearing tokens -> ' ', non-word characters -> ' ') and then through
    ``visen.clean_tone`` (presumably normalises Vietnamese tone-mark
    placement -- confirm against the visen documentation).

    Args:
        sentences: iterable of strings; it is consumed exactly once.

    Returns:
        A new list with one cleaned string per input sentence, in input
        order.  An empty iterable yields an empty list.
    """
    cleaned_text = []
    for sent in sentences:
        for pattern, replacement in _CLEANUP_RULES:
            sent = re.sub(pattern, replacement, sent)
        cleaned_text.append(visen.clean_tone(sent))
    return cleaned_text
'[ỏ]':'ỏ','[ó]':'ó','[õ]':'õ','[ở]':'ở','[ổ]':'ổ','[ọ]':'ọ','[ỗ]':'ỗ','[ố]':'ố','[ồ]':'ồ','[ớ]':'ớ','[ờ]':'ờ', '[ự]':'ự','[ụ]':'ụ','[ú]':'ú','[ủ]':'ủ','[ứ]':'ứ','[ử]':'ử', } # must use encoding="utf-8" -- 2020-05-22 Japlin Chen f=open('./WordList.txt','r',encoding="utf-8") a=open('./vn_pinyin.txt','w',encoding="utf-8") SourceText = '' for i in f: # Fix Other UTF8 vietnamese character ##for UTF8regex, UTF8replace in UTF8Char.items(): ## SourceText = re.sub(UTF8regex, UTF8replace, i) # Fix Tone position Clean_Tone_Text = visen.clean_tone(i) # Get pure English Character => PinYin key NoToneText = visen.remove_tone(Clean_Tone_Text).strip() # Get the Telex of every vietnamese word ''' TelexText = '' TempTelexText = '' for word in Clean_Tone_Text.split(' '): TempTelexText = visen.get_enter_code(word) TelexText += TempTelexText + ' ' ''' # Save to file print(Clean_Tone_Text.strip() + ' ' + NoToneText + ' ' + '30000', file=a) #a.write(i.strip() + ' ' + convert(i) ) a.close()
def replace_all(text, dic):
    """Apply every substitution in *dic* to *text* and return the result.

    Args:
        text: the string to transform.
        dic: mapping of pattern -> replacement.  Both are handed to
            ``re.sub``, so keys are regular expressions (not literal
            strings) and replacements may contain backreferences.  Pairs are
            applied in the mapping's iteration order, each one on the output
            of the previous one.

    Returns:
        The transformed string; *text* unchanged when *dic* is empty.
    """
    for pattern, replacement in dic.items():
        text = re.sub(pattern, replacement, text)
    return text


# Expand abbreviations first (``s`` and ``abbrevs`` are defined elsewhere).
t = replace_all(s, abbrevs)
print(t)

# Hashtags ("#word" at the start or after whitespace) -> ' hastag'.
new_string = re.sub(r'(?:^|\s)(\#\w+)', ' hastag', t)
# E-mail addresses -> 'email'.
email = re.sub(r'[\w\.-]+@[\w\.-]+', 'email', new_string)
# URLs (http/https scheme optional) -> ' url'.
x = re.sub(
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
    ' url', email)
# Collapse every run of a repeated word character to a single occurrence
# (this also shortens legitimate double letters).
z = re.sub(r'(\w)\1+', r'\1', x)
# Drop every run of word characters that contains a digit.
g = re.sub(r"\w*\d\w*", ' ', z)
# Replace everything that is neither a word character nor a plain space.
# Raw string: "\w" must reach the regex engine, not the string-literal parser.
h = re.sub(r'[^\w ]', ' ', g)
# presumably normalises Vietnamese tone-mark placement -- confirm in visen docs
final = visen.clean_tone(h)
# TODO: stop-word removal is not implemented.  The original had a bare
# ``remove_stop_word`` expression here, which did nothing if the name existed
# and raised NameError before the final print otherwise.
print(final)
wb = openpyxl.load_workbook('VietnameseWordList.xlsx') sheet = wb.active # -- open active sheet name Ex: 'Sheet1' last_text = '' #last_chinese = '' now_chinese = '' rcount = 0 count = 1 # -- for "A" column for column in list(sheet.columns)[0]: # -- if A[xxx] have data if column.value is not None: if count > 1: # Skip Excel Sheet1 Title # -- Fix Tone position Clean_Tone_Text = visen.clean_tone(column.value) # -- Get pure English Character => PinYin key NoToneText = visen.remove_tone(Clean_Tone_Text).strip() # -- Get the Telex key for each vietnamese word TelexText = '' TempTelexText = '' for word in Clean_Tone_Text.split(' '): # -- 'ươ' telex = 'uwow', now change to 'uow' TempTelexText = visen.get_enter_code(word).replace('uwow', 'uow') TelexText += TempTelexText + ' ' TelexText = TelexText.replace('\n', '').replace('\r', '').strip() # -- set Excel C/D/E column value sheet.cell(row=count,column=3).value = Clean_Tone_Text sheet.cell(row=count,column=4).value = NoToneText