import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def text_to_wordlist(text, remove_stopwords=False, stem_words=False, comma=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text. The hyphen is escaped: in the original class the
    # unescaped "+-<" formed a range that also swallowed digits.
    # (The origin of this class is [^A-Za-z0-9^,!.\/'+-=?].)
    if not comma:
        text = re.sub(r"[,:\/\^.$%#+\-></\?\=*\\]", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
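# Usage sketch for text_to_wordlist above: with comma=False the listed
# punctuation is replaced by spaces (NLTK data is only needed when
# remove_stopwords=True).
print(text_to_wordlist("Hello, World: 100% sure!", comma=False))
# -> "hello  world  100  sure!"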
# REPLACE_BY_SPACE and BAD_SYMBOLS (compiled regexes) and STOPWORDS (a set)
# are assumed to be defined at module level.
def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE.sub(' ', text)
    text = BAD_SYMBOLS.sub(' ', text)
    text = re.sub(r"\'s", " ", text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
import re
import string


def prepare_for_char_n_gram(text):
    """Simple text clean-up process for char n-grams."""
    # 1. Go to lower case (only good for English).
    # Work on byte strings, as there were issues removing all \n in r"".
    clean = bytes(text.lower(), encoding="utf-8")
    # 2. Drop \n, \t, \b and \r
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")
    # 3. Replace English contractions (`patterns` is assumed to be a list of
    # (bytes_pattern, bytes_replacement) pairs defined elsewhere)
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop punctuation
    # (the regex package would also work: regex.sub(b"\p{P}", " "))
    exclude = re.compile(
        b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    # 5. Drop numbers - as a scientist I don't think numbers are toxic ;-)
    clean = re.sub(rb"\d+", b" ", clean)
    # 6. Remove extra spaces - the previous operations multiplied space occurrences
    clean = re.sub(rb'\s+', b' ', clean)
    # Remove ending space if any
    clean = re.sub(rb'\s+$', b'', clean)
    # 7. Now replace words by words surrounded by # signs,
    # e.g. "my name is bond" becomes "#my# #name# #is# #bond#"
    # clean = re.sub(b"([a-z]+)", b"#\g<1>#", clean)
    clean = re.sub(b" ", b"# #", clean)  # replace each space
    clean = b"#" + clean + b"#"  # add leading and trailing #
    return str(clean, 'utf-8')
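# Quick check of the #-padding output above; `patterns = []` is a
# stand-in for the real contraction table, which is not shown here.
patterns = []
print(prepare_for_char_n_gram("Hello,\nworld 42!"))
# -> "#hello# #world#"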
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def tokenize(text):
    # Replace URLs with a placeholder token
    # (`url_regex` is assumed to be defined at module level)
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Normalize text: lowercase and strip non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words (the set is built once instead of per token)
    stops = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stops]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)

    return ' '.join(clean_tokens)
def _remove_pattern(input_text_list):
    PATTERN_AT = r"@[\w]*"
    HTTPS_LINKS = r"https://[\w]*\.[\w]*/[\w\d]{,12}"
    NUMBER_SIGN = "#"
    # Keep letters, digits, whitespace and the main punctuation marks; the
    # hyphen is escaped so "!-_" is not read as a character range.
    USELESS_CHAR = r"[^\w\s0-9\.,:;?!\-_'\"\(\)\*]"
    # EMOJI_CHAR = "[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]"

    cleaned_text_list = []
    for text in input_text_list:
        text = re.sub(PATTERN_AT, "", text)    # drop @user mentions
        text = re.sub(HTTPS_LINKS, "", text)   # drop https links
        text = re.sub(NUMBER_SIGN, "", text)   # drop the number sign (#)
        # Drop everything outside the main character set (odd characters,
        # emoji, ...), keeping letters, digits and the main punctuation
        text = re.sub(USELESS_CHAR, "", text)
        # Separate punctuation from words so each mark is its own token
        text = re.sub(r"\.", " .", text)
        text = re.sub(",", " ,", text)
        text = re.sub(":", " :", text)
        text = re.sub(";", " ;", text)
        text = re.sub(r"\?", " ?", text)
        text = re.sub("!", " !", text)
        text = re.sub("-", " -", text)
        text = re.sub("_", " _", text)
        text = re.sub("'", " '", text)
        text = re.sub("\"", " \" ", text)
        text = re.sub(r"\(", "", text)
        text = re.sub(r"\)", "", text)
        text = re.sub(r"\*", " * ", text)
        text = text.strip()
        text = text.lower()
        cleaned_text_list.append(text)
    return cleaned_text_list
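# The punctuation-spacing idea above in isolation: each mark becomes its
# own whitespace-delimited token.
import re

s = re.sub(r"\.", " .", "ok, fine.")
s = re.sub(",", " ,", s)
print(s)  # -> "ok , fine ."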
def clean_text(text):
    '''Pre-process the text and return a cleaned string of words.'''
    text = str(text)
    text = text.lower()

    # Clean the text. Note: '+-=' acts as a character range here, so
    # :;< and a few other symbols are also kept (the ':' rule below
    # relies on this).
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)  # the original "not" glued words together ("doesnot")
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\/", " / ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # e.g. "5k" -> "5000"
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r"\0s", " 0 ", text)
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r" j k ", " jk ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text
import sys
import string

import nltk

# string.maketrans only exists on Python 2; str.maketrans replaces it on 3
maketrans = string.maketrans if sys.version_info < (3,) else str.maketrans


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged tokens.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).

    # Returns
        A list of (word, POS-tag) tuples.
    """
    if lower:
        text = text.lower()
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))
    text = text.translate(translate_map)
    seq = text.split()
    # nltk.pos_tag needs the averaged_perceptron_tagger data
    return nltk.pos_tag(seq)
def clean_text(text):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text  # the original left this return commented out, so the function returned None
def process_sentence(text, objects='False'):
    '''Simple and dirty text preprocessing to fix some misspelled words and
    lemmatize (`correction` and `lemmatizer` are assumed defined elsewhere).'''
    text = text.lower()
    text = (text.replace('1', 'one').replace('2', 'two')
                .replace('3', 'three').replace('4', 'four')
                .replace('5', 'five').replace('6', 'six')
                .replace('.', '')
                .replace('contains', 'contain')
                .replace('which', '')
                .replace('are there', 'there are')
                .replace('there is', '')
                .replace('ablue', 'a blue')
                .replace('corner', 'edge').replace('wall', 'edge')
                .replace('yelow', 'yellow')
                .replace('below', 'beneath')
                .replace('brick', 'block')
                .replace('leats', 'least')
                .replace('is touching', 'touching'))
    # Normalize colour/colored/coloured, keeping the following character
    # (the original replacement 'color ' dropped it)
    text = re.sub(r'colour([\W])', r'color\1', text)
    text = re.sub(r'colored([\W])', r'color\1', text)
    text = re.sub(r'coloured([\W])', r'color\1', text)
    text = text.split(' ')
    text = map(correction, [t for t in text if t])
    text = [lemmatizer.lemmatize(x) if x not in [u'as', u'wall'] else x
            for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:
        text = text.replace('contain', 'with').replace('ha ', 'with ')
    text = re.sub(r'(^|\W)a([\W])', ' one ', text)
    text = re.sub(r'(^)ll ', ' ', text)
    text = re.sub(r'(^)t ', 'at ', text)
    text = ' '.join([t for t in text.split(' ') if t])
    text = text.replace('based', 'base')
    return text
# `special_character_removal` and `replace_numbers` are assumed to be
# precompiled regexes defined at module level.
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Remove special characters
    text = special_character_removal.sub('', text)

    # Replace numbers with the token 'n'
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
# ViTokenizer comes from the pyvi package (Vietnamese word segmentation).
from pyvi import ViTokenizer


def preprocess(text):
    text = text.lower()
    # Note: lowercasing first means ASCII emoticons containing capitals
    # (":D", ":P", "^_^") can no longer match.
    list_happen = ["😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚",
                   "😋", "😜", "😝", "🤗", ":)", ":}", "^^", ";)", "👌", "=))",
                   "😅", "👍", "👍🏻", "💕", "❤", "👏", "💟", "<3", ":D", ":P",
                   "^_^", "😉", "✌️"]
    list_sad = ["😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔",
                "🤕", "🤒", "👿", "🤬", "😤", "😫", "😩", "😭", ":(", "😈",
                "-_-", "👎"]
    # Map positive emoji to " vui" (happy) and negative emoji to " buồn" (sad)
    for happen in list_happen:
        text = text.replace(happen, " vui")
    for sad in list_sad:
        text = text.replace(sad, " buồn")
    # Strip punctuation and special symbols. The inner ']' is escaped here:
    # in the original pattern the unescaped ']' ended the character class early.
    text = re.sub(
        r'[\n!,.?@#$%\\()*+\-/:;<=>\[\]^_`{|}~"“”’∞θ÷α•−β∅³π‘₹´°£€×™√²—–&]',
        '', text)
    text = ViTokenizer.tokenize(text)
    return text
def text_to_word_list(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r">", " ", text)
    text = re.sub(r"<", " ", text)
    text = re.sub(r"--", " ", text)
    text = re.sub(r"`", " ", text)
    text = re.sub(r"#", " ", text)
    text = re.sub(r"@", " ", text)
    text = re.sub(r"\$", " ", text)  # escaped: a bare "$" is the end-of-string anchor
    text = re.sub(r"%", " ", text)
    text = re.sub(r"&", " ", text)
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)  # standalone numbers
    text = re.sub(r"\b\d+(?:\.\d+)?\s+", "", text)
    text = re.sub(r"_", " ", text)
    text = re.sub(r"\\", " ", text)
    text = re.sub(r"~", " ", text)
    text = re.sub(r"(\d+)(k)", r"", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Keep only tokens accepted by `get_word` (assumed defined elsewhere)
    text_list = []
    for word in nltk.word_tokenize(text):
        if get_word(word):
            text_list.append(word)
    return text_list  # the original was missing this return
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9']", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # e.g. "5k" -> "5000"
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Fix common misspellings and expand abbreviations
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iphone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"iii", "3", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r" j k ", " jk ", text)

    # Optionally, remove stop words. The original accepted this flag (and
    # defaulted it to True) but never applied it; implemented here to match
    # the sibling functions.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
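# The "(\d+)(k)" backreference rule used above expands shorthand like "5k":
import re

assert re.sub(r"(\d+)(k)", r"\g<1>000", "ran a 5k today") == "ran a 5000 today"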
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()  # assumed helper returning a stopword collection
    cleaned_text_list = []
    for text in input_text_list:
        # NOTE: the original called text.translate(string.punctuation) to
        # "remove punctuation", which is a no-op in Python 3; punctuation is
        # handled by the rules below instead.
        text = text.lower()  # convert words to lower case

        # Clean the text (drop characters other than A-Za-z0-9^,!./'+-=)
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t\.co/[A-Za-z]{10}", " ", text)

        # Remove stopwords once before stemming...
        text = text.split()
        text = [word for word in text if word not in stoplist]
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        # ...and again after stemming
        cleanwordlist = [word for word in stemmed_words if word not in stoplist]
        text = " ".join(cleanwordlist)
        cleaned_text_list.append(text)
    return cleaned_text_list
from bs4 import BeautifulSoup


# `replace_with_space` and `bad_symbols` are assumed to be compiled regexes.
def clean(text):
    text = text.lower()                              # lowercase text
    text = re.sub('[0-9]+', '', text)                # drop digits
    text = BeautifulSoup(text, "html.parser").text   # HTML decoding
    text = replace_with_space.sub(' ', text)         # replace matched symbols with a space
    text = bad_symbols.sub('', text)                 # delete matched bad symbols
    return text
# `remove_urls`, `remove_users` and `printable` (e.g. set(string.printable))
# are assumed to be defined at module level.
def clean_text(text):
    text = remove_urls(text)
    text = remove_users(text.replace('@ ', '@'))
    # text = remove_hash_tags(text)
    text = re.sub(r'\s+', ' ', text).strip()
    # In Python 3, filter() returns an iterator, so join it back into a string
    text = ''.join(filter(lambda x: x in printable, text))
    # Put a space after each run of word-like or symbol-like characters
    text = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
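# The grouping regex above in isolation: runs of word-like characters and
# runs of symbols each get a trailing space, splitting them apart.
import re

print(re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", "hey!!you"))
# -> "hey !! you "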
# `base_filter` is an assumed helper returning the default filter characters.
def bigram_text_to_word_sequence(text, bigram, filters=base_filter(), lower=False, split=" "):
    '''filters: sequence of characters to filter out'''
    if lower:
        text = text.lower()
    # str.maketrans replaces Python 2's string.maketrans
    text = text.translate(str.maketrans(filters, split * len(filters)))
    seq = text.split(split)
    tokens = [_f for _f in seq if _f]
    return bigram(tokens)
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()


def clean_text(text):
    cleaned = text.lower()
    # Strip punctuation
    punctuations = string.punctuation
    cleaned = "".join(c for c in cleaned if c not in punctuations)
    # Remove stopwords
    words = cleaned.split()
    stopwords_list = set(stopwords.words("english"))
    cleaned = [word for word in words if word not in stopwords_list]
    # Lemmatize as verbs first, then as nouns
    cleaned = [lem.lemmatize(word, "v") for word in cleaned]
    cleaned = [lem.lemmatize(word, "n") for word in cleaned]
    return " ".join(cleaned)
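# The pos argument matters: "v" lemmatizes as a verb, "n" as a noun
# (requires the NLTK WordNet data, e.g. nltk.download('wordnet')).
from nltk.stem import WordNetLemmatizer

_lem = WordNetLemmatizer()
assert _lem.lemmatize("running", "v") == "run"
assert _lem.lemmatize("cars", "n") == "car"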
from nltk.tokenize import word_tokenize


def preprocess(text):
    text = text.lower()
    # Keep words (including apostrophes) and selected punctuation marks
    doc = ' '.join(re.findall(r"[\w']+|[.,!?;/-]", text))
    doc = word_tokenize(doc)
    return doc
def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # Python 3: str.maketrans builds the table (the original had an
    # unguarded Python 2 branch testing for `unicode`)
    translate_table = str.maketrans(filters, split * len(filters))
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
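# The translation table maps every filter character to the split marker:
table = str.maketrans('!?,', '   ')
assert 'hi, there!'.translate(table) == 'hi  there '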
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
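# The rules above applied in order:
print(clean_text("Check [this] out: https://x.co <b>now</b> at 5pm!"))
# -> "check  out  now at "   (bracketed text, the link, the tags,
#    punctuation and the digit-bearing "5pm" are all gone)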
# REPLACE_BY_SPACE_RE (compiled regex) and `stopwords` (a collection of
# stopwords) are assumed to be defined at module level.
def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace matched symbols with a space
    # text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE
    text = ' '.join(word for word in text.split() if word not in stopwords)  # delete stopwords
    return text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def removeStopWords(sen):
    # [^a-zA-Z&] instead of the original [^a-zA-z&]: the accidental A-z
    # range also matched the characters [ \ ] ^ _ `
    text = re.sub('[^a-zA-Z&]', ' ', sen)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    stops = set(stopwords.words('english'))  # build the set once, not per word
    text = [ps.stem(word) for word in text if word not in stops]
    return ' '.join(text)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e ?-? ?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
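# The optional-group pattern above normalizes every e-mail spelling:
import re

assert re.sub(r"e ?-? ?mail", "email", "e - mail, e-mail, email") == "email, email, email"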
# REPLACE_BY_SPACE_RE and BAD_SYMBOLS_RE are assumed to be compiled regexes.
def clean_text(text):
    text = re.sub(r'\w*\d\w*', '', text).strip()  # remove all words that contain numbers
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace matched symbols with a space
    text = BAD_SYMBOLS_RE.sub('', text)  # remove matched bad symbols
    text = re.sub(r'\d+', '', text)  # drop any remaining digits
    return text
def preprocess(text):
    # text = re.sub('[\n!,.?@#]', '', text)
    text = text.lower()
    # Note: lowercasing first means ASCII emoticons containing capitals
    # (":D", ":P", "^_^") can no longer match.
    list_happen = ["😊", "❤️", "😁", "😄", "😆", "😍", "🤣", "😂", "🤩", "😚",
                   "😋", "😜", "😝", "🤗", ":)", ":}", "^^", ";)", "👌", "=))",
                   "😅", "👍", "👍🏻", "💕", "❤", "👏", "💟", "<3", ":D", ":P",
                   "^_^", "😉", "✌️"]
    list_sad = ["😡", "🤔", "🤨", "😐", "😏", "😒", "😶", "🙄", "😌", "😔",
                "🤕", "🤒", "👿", "🤬", "😤", "😫", "😩", "😭", ":(", "😈",
                "-_-", "👎"]
    # Map positive emoji to "vui" (happy) and negative emoji to "tệ" (bad)
    for happen in list_happen:
        text = text.replace(happen, "vui")
    for sad in list_sad:
        text = text.replace(sad, "tệ")
    # text = ViTokenizer.tokenize(text)
    return text
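# Emoji become plain sentiment tokens that a downstream model can count:
print(preprocess("phim này hay 😍"))  # -> "phim này hay vui"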
def _remove_pattern_2(input_text_list):
    cleaned_text_list = []
    for text in input_text_list:
        # NOTE: the original called text.translate(string.punctuation) to
        # "remove punctuation", which is a no-op in Python 3; punctuation is
        # handled by the rules below instead.
        text = text.lower()  # convert words to lower case

        # Optional stopword removal (disabled in the original):
        # text = text.split()
        # stops = set(stopwords.words("english"))
        # text = [w for w in text if w not in stops and len(w) >= 3]
        # text = " ".join(text)

        # Clean the text (drop characters other than A-Za-z0-9^,!./'+-=)
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Optional stemming (disabled in the original):
        # text = " ".join(SnowballStemmer('english').stem(w) for w in text.split())

        cleaned_text_list.append(text)
    return cleaned_text_list
def preprocessing(text):
    text = text.lower()
    text = re.sub(r'(\<)|(\>)', ' ', text)
    # Replace URLs with the token " url "
    text = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        ' url ', text)
    text = kp.replace_keywords(text)  # `kp` is an assumed keyword replacer
    text = clean_punct(text)          # assumed helper
    text = re.sub(r'[\n\r]', ' ', text)  # the original r'\n\r' only matched the literal pair
    text = re.sub(r'\s{2,}', ' ', text)
    return text
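# A hedged sketch of what `kp` above might be, assuming flashtext's
# KeywordProcessor; the mapping below is hypothetical, not the original's.
from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keyword('colour', 'color')
print(kp.replace_keywords('my favourite colour'))  # -> 'my favourite color'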
def _preprocess(self, text):
    # Lowercase
    if self.lower:
        text = text.lower()
    # Remove special characters
    if self.initial_filters is not None:
        text = re.sub(self.initial_filters, ' ', text)
    # Collapse 3+ repeats of a character to one, e.g. "fuuuuck" => "fuck"
    if self.remove_repetitions:
        pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
        text = pattern.sub(r"\1", text)
    return text
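# The repetition rule in isolation:
import re

_pat = re.compile(r"(.)\1{2,}", re.DOTALL)
assert _pat.sub(r"\1", "fuuuuck yessss") == "fuck yes"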
def cleanser(questions):
    for i, text in enumerate(questions):
        text = text.lower()

        # Optionally, remove stop words:
        # stops = set(stopwords.words("english"))
        # text = " ".join(w for w in text.split() if w not in stops)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        # Optionally, shorten words to their stems:
        # stemmer = SnowballStemmer('english')
        # text = " ".join(stemmer.stem(w) for w in text.split())

        questions[i] = text
    return questions