# Assumes module-level objects: tokenizer = RegexpTokenizer(r'\w+') and a stop_words collection.
import re
from gensim.parsing.preprocessing import stem_text, split_alphanum

def funcao_limpa_tudo(artigo):  # cleans each word and keeps only the useful ones in a new list
    lista_nova = []
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(artigo)  # artigo becomes a list with all the words
    list_artigo = list(artigo)           # copy of the token list to iterate over
    try:
        for palavra in list_artigo:
            if re.match(r'^\d+$', palavra):   # purely numeric token
                pass
            elif palavra in stop_words:       # stopword
                pass
            elif len(palavra) < 3:            # too short
                pass
            else:
                lista_nova.append(palavra)    # keep the useful word
    except Exception as erro:  # avoid crashing the code and keep going
        numero_palavra = artigo.index(palavra)
        artigo.pop(numero_palavra)  # found garbage
    del list_artigo  # let Python garbage-collect it
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
import re

import gensim
from gensim.parsing.preprocessing import split_alphanum
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

# Assumes project-level helpers: replace_word, replace_numbers, reduce_lengthening,
# remove_alphanumerics and the STOP_WORDS collection.
def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower case
    text = text.lower()
    text = replace_word(text)
    text = replace_numbers(text)
    text = reduce_lengthening(text)
    text = remove_alphanumerics(text)
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in STOP_WORDS]
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Remove HTML markup
    text = re.sub("(<.*?>)", "", text)
    # Correct misspelled words, then lemmatize verbs and nouns
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` correction
        word = spell.correction(misspelled[i])
        misspelled[i] = word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Strip all the numerics
    #text = gensim.parsing.preprocessing.strip_numeric(text)
    return text
import re
from gensim.parsing.preprocessing import strip_non_alphanum, split_alphanum, strip_short, strip_numeric
from pyvi import ViTokenizer

def raw_text_preprocess(d):
    d = re.sub(r"http\S+", "", d)              # remove URLs
    d = strip_non_alphanum(d).lower().strip()  # drop non-alphanumerics, lowercase
    d = split_alphanum(d)                      # separate letters from digits
    d = strip_short(d, minsize=2)              # drop single-character words
    d = strip_numeric(d)                       # drop digits
    d = ViTokenizer.tokenize(d)                # Vietnamese word segmentation
    return d
def raw_text_preprocess(raw):
    raw = re.sub(r"http\S+", "", raw)
    raw = strip_non_alphanum(raw).lower().strip()
    raw = split_alphanum(raw)
    raw = strip_short(raw, minsize=2)
    raw = strip_numeric(raw)
    raw = ViTokenizer.tokenize(raw)
    return raw
def preprocess_mail(mail):
    mail = re.sub(r"https\S+", "", mail)  # remove URLs
    # Remove non-letter characters and convert everything to lower case
    mail = strip_non_alphanum(mail).lower().strip()
    mail = split_alphanum(mail)  # separate letters from digits
    # Keep only words with at least 2 characters, dropping single-letter words
    mail = strip_short(mail, minsize=2)
    mail = strip_numeric(mail)
    # mail = ViTokenizer.tokenize(mail)
    return mail
def convert_num_words(cls, text):
    """
    Spell numbers out as words, which makes later preprocessing much easier.
    """
    text = split_alphanum(text)
    temp_str = text.split()
    new_string = [
        INFLECT_ENGINE.number_to_words(word) if word.isdigit() else word
        for word in temp_str
    ]
    temp_str = " ".join(new_string)
    return temp_str
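A small sketch of the number-to-words step used above. It assumes INFLECT_ENGINE is a module-level inflect engine (the snippet itself does not show how it is created); the example string is illustrative only.

import inflect
from gensim.parsing.preprocessing import split_alphanum

INFLECT_ENGINE = inflect.engine()  # assumed setup; the original only references INFLECT_ENGINE

text = split_alphanum("room42 costs 100 dollars")  # -> "room 42 costs 100 dollars"
words = [INFLECT_ENGINE.number_to_words(w) if w.isdigit() else w for w in text.split()]
print(" ".join(words))  # e.g. "room forty-two costs one hundred dollars"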
from gensim.parsing.preprocessing import split_alphanum, strip_numeric, strip_non_alphanum, strip_short
from pyvi import ViTokenizer

# Assumes project-level helpers: parse_html_v2, remove_links_content, remove_emails,
# remove_special_tags, remove_punctuation, remove_multiple_space.
def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)            # remove content between {}
    text = remove_punctuation(text)             # remove all punctuation
    text = split_alphanum(text)                 # add a space between words and numbers
    text = strip_numeric(text)                  # remove digits
    text = strip_non_alphanum(text)             # remove non-alphabetic characters
    text = strip_short(text, minsize=2)         # remove words shorter than minsize
    text = remove_multiple_space(text).strip()  # collapse spaces and strip
    text = ViTokenizer.tokenize(text)
    return text
from gensim.parsing.preprocessing import (
    strip_tags, strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, strip_short,
)

def gensim_clean_string(textIn,
                        _strip_tags=True,
                        _split_alphanumeric=True,
                        _strip_nonalphanumeric=True,
                        _strip_muliple_whitespace=True,
                        _strip_short=True,
                        _short_charcount_min=3,
                        _strip_punctuation=False,  # note: accepted but never applied in the body
                        _convert_to_lower=False):
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(textIn)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _convert_to_lower:
        cleaner = cleaner.lower()
    return cleaner
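A quick usage sketch for the configurable cleaner above; the sample string and flag choices are illustrative only.

raw = "<p>The   quick   brown fox2 jumped!</p>"
print(gensim_clean_string(raw))
# -> "The quick brown fox jumped"  (tags stripped, whitespace collapsed, the split-off "2" dropped as too short)
print(gensim_clean_string(raw, _strip_short=False, _convert_to_lower=True))
# keeps the short "2" token and lower-cases the result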
import re

import gensim
from gensim.parsing.preprocessing import split_alphanum
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

# Assumes project-level helpers: replace_word and normaliser_word.
def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower case
    text = text.lower()
    text = replace_word(text)
    text = normaliser_word(text)
    #stops = set(stopwords.words("english"))
    stops = {'at', 'only', 'your', 'yourself', 'a', 'i', 'during', 'off', 'myself', 'so', 'o',
             'after', 'under', 'there', 'against', 'over', 'ourselves', 'they', 'me', 'its',
             'then', 'above', 'theirs', 'this', 'into', 'from', 'very', 'on', 'yours',
             'yourselves', 'herself', 'themselves', 'between', 'if', 'below', 'own', 'and',
             'you', 'itself', 'him', 'while', 's', 'who', 'we', 'what', 'by', 'ma', 'further',
             'such', 'until', 'through', 'too', 't', 'where', 'up', 'my', 'm', 'out', 'down',
             're', 'to', 'she', 'd', 'those', 'when', 'it', 'because', 'he', 'in', 'other',
             'each', 'both', 'her', 'but', 'as', 'all', 'his', 'again', 'with', 'once', 'am',
             'just', 'should', 'why', 'than', 'any', 'more', 'most', 'that', 've', 'will',
             'ours', 'our', 'll', 'the', 'y', 'which', 'whom', 'hers', 'an', 'here', 'how',
             'before', 'about', 'for', 'them', 'these', 'their', 'or', 'must', 'shall',
             'would', 'could', 'need', 'might'}
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Correct misspelled words, then lemmatize verbs and nouns
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` correction
        word = spell.correction(misspelled[i])
        misspelled[i] = word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)
    # Remove the stopwords again (spell correction may reintroduce some)
    filtered_words = [word for word in text.split() if word not in stops]
    text = " ".join(filtered_words)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text
import re
import urllib.parse

from gensim.parsing.preprocessing import strip_non_alphanum, split_alphanum, strip_short
from pyvi import ViTokenizer

# Assumes project-level helpers: strip_number_alphabets and the stopwordsVN_ENG module.
def text_preprocess(bodyItem):
    # bodyItem: string (of one mail) => return: list of words (of one mail)
    # Remove http and https links
    bodyItem = re.sub(r'^https?:\/\/.*[\r\n]*', '', bodyItem, flags=re.MULTILINE)
    bodyItem = re.sub(r'^http?:\/\/.*[\r\n]*', '', bodyItem, flags=re.MULTILINE)
    bodyItem = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        " ", bodyItem)
    # Decode bodyItems which are still percent-encoded
    bodyItem = bodyItem.replace("=", "%")
    bodyItem = urllib.parse.unquote(bodyItem)
    # Remove words that mix numbers and letters
    bodyItem = strip_number_alphabets(bodyItem)
    # Remove meaningless characters, convert to lower case and separate letters from digits
    bodyItem = strip_non_alphanum(bodyItem).lower().strip()
    bodyItem = split_alphanum(bodyItem)
    # Join Vietnamese compound words. Ex: hội thảo -> hội_thảo
    bodyItem = ViTokenizer.tokenize(bodyItem)
    # Remove single-letter words
    bodyItem = strip_short(bodyItem, minsize=2)
    # Remove stopwords
    words = [word for word in bodyItem.split()
             if word not in stopwordsVN_ENG.getStopwordsVN_ENG()]
    return words
import re
from gensim.parsing.preprocessing import stem_text, split_alphanum
from nltk.tokenize import RegexpTokenizer

# Assumes a module-level pt_stop collection with Portuguese stopwords.
def funcao_limpa_tudo(artigo):
    tokenizer = RegexpTokenizer(r'\w+')
    lista_nova = []
    artigo = stem_text(artigo)
    artigo = split_alphanum(artigo)
    artigo = tokenizer.tokenize(artigo)
    list_artigo = list(artigo)
    try:
        for palavra in list_artigo:
            if re.match(r'^\d+$', palavra):   # purely numeric token
                pass
            elif palavra in pt_stop:          # Portuguese stopword
                pass
            elif len(palavra) < 3:            # too short
                pass
            else:
                lista_nova.append(palavra)    # keep the useful word
    except Exception as erro:  # avoid crashing the code and keep going
        print(erro)
    del list_artigo
    del artigo
    if len(lista_nova) < 5:
        return None
    else:
        return lista_nova
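A minimal usage sketch for the cleaner above, assuming the pt_stop list comes from NLTK's Portuguese stopwords (the original only references the name); the sample sentence is illustrative only.

from nltk.corpus import stopwords

pt_stop = set(stopwords.words('portuguese'))  # assumed source of the stopword list

tokens = funcao_limpa_tudo("Os 2 artigos científicos publicados em 2021 foram revisados pelos autores")
print(tokens)  # stemmed, lowercase tokens with stopwords, digits and short words removed; None if fewer than 5 remain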
def test_split_alphanum(self):
    self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
    self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")
def testSplitAlphanum(self):
    self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
    self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")
from gensim.parsing.preprocessing import strip_tags, split_alphanum, strip_multiple_whitespaces, strip_short

def gensim_clean_string(textIn):
    cleaner = strip_tags(textIn)
    cleaner = split_alphanum(cleaner)
    cleaner = strip_multiple_whitespaces(cleaner)
    cleaner = strip_short(cleaner, minsize=3)
    return cleaner
import re

import gensim
from gensim.parsing.preprocessing import split_alphanum
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

# Assumes project-level helpers: replace_word, replace_numbers, reduce_lengthening,
# remove_alphanumerics and a module-level WordNet `lemmatizer`.
def transformText(text):
    # Separate numbers from words
    text = split_alphanum(text)
    # Convert text to lower case
    text = text.lower()
    text = replace_word(text)
    text = replace_numbers(text)
    text = reduce_lengthening(text)
    text = remove_alphanumerics(text)
    #stops = set(stopwords.words("english"))
    stopword = {
        'only', 'being', 'yourselves', 'out', 'nor', 'few', 'now', 'd', 'before', 'their',
        'and', 'ourselves', 'doing', 'its', "you've", 'yourself', 'off', 'do', 'm', 'but',
        'after', 'during', 'has', 'for', 'above', 'y', 'on', 've', 'itself', 'been', 'until',
        'we', 'shan', 're', 'll', 'o', 'did', 'there', 'you', 'further', 'some', 'where',
        'through', 'doesn', 'a', 'it', 'does', 'who', 'most', 'she', 'each', 'am', 'if',
        'hadn', 'him', 'the', 'those', 'will', "you'll", 'any', 'why', 'weren', 'ours',
        'that', 'other', 'own', 'once', 'was', 'from', 'this', 'at', 'such', 'than',
        'between', 'because', 'while', 'when', 'so', 'himself', 'them', 'didn', 's', 'ma',
        'shouldn', 'my', 'then', 'herself', 'under', 'theirs', 'about', 'here', 'his',
        'were', 'be', 'with', "you'd", 'are', 'both', 'haven', 'of', 'over', 'won', 'below',
        'down', 'ain', 'or', 'to', 'too', 'again', 'an', 'needn', 'what', 'aren',
        "should've", 'me', 'hers', "that'll", 'just', 'same', 'which', "you're", 'having',
        "it's", 'can', 'these', 'more', 'i', 'up', 'yours', 'your', 'themselves', 'by',
        'don', 'wasn', 'how', 'mightn', 'as', 'all', 'mustn', 'into', 'in', 'is', 't', 'he',
        'had', 'her', 'our', 'they', 'have', 'against', 'whom', 'isn', "she's", 'myself'
    }
    # Convert text to lower case (already lower-cased above; harmless)
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in stopword]
    # Lemmatization: plural nouns back to singular, verbs back to the infinitive
    # (note: input_str is built here but never used afterwards)
    input_str = word_tokenize(text)
    for i in range(len(input_str)):
        input_str[i] = lemmatizer.lemmatize(input_str[i], pos='v')
        input_str[i] = lemmatizer.lemmatize(input_str[i], pos='n')
    # Spelling correction
    spell = SpellChecker()
    misspelled = text.split()
    for i in range(len(misspelled)):
        # Get the one `most likely` correction
        word = spell.correction(misspelled[i])
        misspelled[i] = word
    text = " ".join(misspelled)
    # Remove all the tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)