def preprocess_siamese(doc):
    """Normalize raw text and split it into cleaned sentence strings.

    Args:
        doc (str): raw input text.

    Returns:
        list[str]: one cleaned string per spaCy-detected sentence.
    """
    # normalize unicode / whitespace / quotes, then mask PII-like spans
    doc = tp.normalize.normalize_unicode(doc)
    doc = tp.normalize_whitespace(doc)
    doc = tp.normalize_quotation_marks(doc)
    doc = tp.replace_emails(doc, replace_with="<EMAIL>")
    doc = tp.replace_urls(doc, replace_with="<URL>")
    doc = tp.replace_hashtags(doc, replace_with="<HASHTAG>")
    doc = tp.replace_emojis(doc, replace_with="<EMOJI>")
    doc = tp.replace_phone_numbers(doc, replace_with="<PHONE>")
    # apply spaCy to tokenize doc (nlp_token is a module-level pipeline)
    doc = nlp_token(doc)
    # rebuild each sentence from its cleaned tokens; a single str.join
    # replaces the quadratic `+=` loop and the trailing-space [:-1] hack
    # while producing byte-identical output
    doc_new = []
    for sent in doc.sents:
        tokens = [
            token.text.replace("\n", "").replace("\t", "").strip()
            for token in sent
        ]
        doc_new.append(" ".join(tokens))
    return doc_new
def clean(text: str) -> str:
    """Clean raw text: normalize unicode/whitespace/quotes, strip URLs,
    phone numbers, emails and user handles, then reflow to one sentence
    per line.

    Args:
        text: raw input text.

    Returns:
        Cleaned text with sentence breaks as newlines.
    """
    txt = text.strip()
    txt = preprocessing.normalize_unicode(txt, form="NFKC")
    # Collapse whitespaces
    txt = preprocessing.normalize_whitespace(txt)
    # Remove repeated newlines
    txt = preprocessing.normalize_repeating_chars(txt, chars="\n", maxn=1)
    # fix hyphen-ated words
    txt = preprocessing.normalize_hyphenated_words(txt)
    txt = preprocessing.normalize_quotation_marks(txt)
    txt = preprocessing.replace_urls(txt, replace_with="")
    txt = preprocessing.replace_phone_numbers(txt, replace_with="")
    txt = preprocessing.replace_emails(txt, replace_with="")
    txt = preprocessing.replace_user_handles(txt, replace_with="")
    txt = preprocessing.normalize_repeating_chars(txt, chars=".,;:-_ ", maxn=1)
    txt = re.sub("\n ", " ", txt)
    txt = re.sub(" \n", " ", txt)
    txt = re.sub("\n", " ", txt)
    # NOTE(review): the dot here is an unescaped regex wildcard, so this
    # removes ANY single character surrounded by spaces (e.g. " a ") --
    # confirm this is intended before escaping it.
    txt = re.sub(" . ", " ", txt)
    # ensure a space after a sentence-final dot, then break sentences to lines
    txt = re.sub(r"\.([A-Z])", r". \1", txt)
    txt = re.sub(r"\. ([A-Z])", r".\n\1", txt)
    # fix for some common abbreviations: undo the sentence break after them.
    # Raw strings here fix the invalid "\." escape in the original, which
    # is a SyntaxWarning on modern Python.
    for abv in ['Dr', 'St', 'Mr', 'Ms', 'mt', 'Inst', 'inc', 'est']:
        txt = re.sub(abv + r"\.\n", abv + ". ", txt)
    return txt
def html2text(html):
    """HTML to text converter

    Args:
        html (str): html

    Returns:
        str: html page content in plaintext
    """
    if not html:
        return ''
    # drop code snippets before stripping the remaining markup
    snippet_flags = re.DOTALL | re.MULTILINE | re.IGNORECASE
    for snippet_pattern in (r'<pre>.*?</pre>', r'<code>.*?</code>'):
        html = re.sub(snippet_pattern, ' ', html, flags=snippet_flags)
    # strip the rest of the markup and collapse the leftover whitespace
    stripper = MLStripper()
    stripper.feed(html)
    return preprocessing.normalize_whitespace(stripper.get_data())
def clean_textacy(textfile):
    """Clean the text in *textfile* with textacy and replace the file.

    Args:
        textfile (str): path of the text file to clean.

    Returns:
        list[str]: single-element list containing the new file's path.
    """
    # context manager guarantees the handle is closed; the original
    # leaked it via open(textfile).read()
    with open(textfile) as fh:
        text = fh.read()
    text = preprocessing.normalize_whitespace(text)
    text = preprocessing.normalize.normalize_hyphenated_words(text)
    text = preprocessing.normalize.normalize_quotation_marks(text)
    text = preprocessing.normalize.normalize_unicode(text)
    text = preprocessing.remove.remove_accents(text)
    # text=preprocessing.remove.remove_punctuation(text)
    text = preprocessing.replace.replace_currency_symbols(text)
    text = preprocessing.replace.replace_emails(text)
    text = preprocessing.replace.replace_hashtags(text)
    # text=preprocessing.replace.replace_numbers(text)
    text = preprocessing.replace.replace_phone_numbers(text)
    text = preprocessing.replace.replace_urls(text)
    text = preprocessing.replace.replace_user_handles(text)
    print(text)
    # now replace the original doc with cleaned version
    newfile = 'cleaned_' + textfile
    with open(newfile, 'w') as out:
        out.write(text)
    os.remove(textfile)
    return [newfile]
def processText(text):
    """Normalize, scrub and lemmatize *text*.

    Args:
        text (str): raw input text.

    Returns:
        str: whitespace-normalized, lemmatized text.
    """
    preprocessedText = preprocessing.normalize.normalize_unicode(text)
    preprocessedText = preprocessing.normalize.normalize_quotation_marks(
        preprocessedText)
    preprocessedText = preprocessing.remove.remove_accents(preprocessedText)
    preprocessedText = preprocessing.remove.remove_punctuation(
        preprocessedText)
    preprocessedText = preprocessing.replace.replace_emails(
        preprocessedText, "")
    preprocessedText = preprocessing.replace.replace_phone_numbers(
        preprocessedText, "")
    #preprocessedText = preprocessing.replace.replace_contractions(preprocessedText)

    # lemmatize the entire text: split into words via TextBlob, lemmatize
    # each, and join once -- the original built the string with quadratic
    # `+=`; normalize_whitespace strips the (now absent) trailing space,
    # so the result is unchanged
    words = TextBlob(preprocessedText).words
    lemmatizedText = " ".join(w.lemmatize() for w in words)

    # normalize the whitespaces for texts which include s.l. 'Title And I am ...'
    return preprocessing.normalize_whitespace(lemmatizedText)
def preprocess_sentence(sent, lower=True):
    """Pre-process a sentence ( via ``textacy.preprocess` module ).

    Args:
        sent (str): text.
        lower (bool): whether to return a lowercase string.

    Returns:
        str
    """
    sent = preprocessing.normalize_unicode(sent)   # normalize unicode
    sent = preprocessing.remove_accents(sent)      # deaccent
    sent = re.sub("\n|\r", " ", sent)              # replace newline chars
    sent = contractions.fix(sent)                  # unpack contractions
    # run the remaining textacy scrubbers in their original order
    scrubbers = (
        preprocessing.replace_emojis,
        preprocessing.replace_hashtags,
        preprocessing.replace_user_handles,
        preprocessing.replace_currency_symbols,
        preprocessing.replace_emails,
        preprocessing.replace_urls,
        preprocessing.remove_punctuation,
        preprocessing.normalize_whitespace,
    )
    for scrub in scrubbers:
        sent = scrub(sent)
    return sent.lower() if lower else sent
def test_plaintext_functionality(text):
    """Smoke-test basic plaintext preprocessing and KWIC extraction."""
    # chain the two preprocessors: the original passed `text` (not the
    # normalized intermediate) to remove_punctuation, silently discarding
    # the whitespace normalization
    preprocessed_text = preprocessing.normalize_whitespace(text)
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(text, keyword,
                                          window_width=35,
                                          print_only=False)
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
def text_cleanup(text):
    "cleanup our text"
    # strip entities we never want to keep
    for scrub in (preprocessing.replace_emails,
                  preprocessing.replace_urls,
                  preprocessing.replace_hashtags,
                  preprocessing.replace_phone_numbers,
                  preprocessing.replace_numbers):
        text = scrub(text, replace_with='')
    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = preprocessing.normalize_quotation_marks(text)
    text = preprocessing.normalize_hyphenated_words(text)
    # flatten literal newlines/tabs, lowercase, then collapse whitespace
    text = text.replace('\n', ' ').replace('\t', ' ').lower()
    return preprocessing.normalize_whitespace(text)
def textacy_preprocess(sentence):
    """Preprocess text."""
    #sentence = preprocessing.normalize_repeating_chars(sentence)
    pipeline = (
        preprocessing.normalize_hyphenated_words,
        preprocessing.normalize_quotation_marks,
        preprocessing.normalize_unicode,
        preprocessing.normalize_whitespace,
        preprocessing.remove_accents,
        preprocessing.remove_punctuation,
        preprocessing.replace_currency_symbols,
        preprocessing.replace_emails,
        preprocessing.replace_emojis,
        preprocessing.replace_hashtags,
        preprocessing.replace_numbers,
        preprocessing.replace_phone_numbers,
        preprocessing.replace_urls,
        preprocessing.replace_user_handles,
    )
    # apply every scrubber in order, threading the text through
    for step in pipeline:
        sentence = step(sentence)
    return sentence
def load(path):
    """Load an email file and return its lemmatized, stopword-free tokens.

    Args:
        path (str): path to the email file.

    Returns:
        list[str]: lemmata of the remaining tokens, or [] when the email
        is empty or yields too few tokens.
    """
    email_text = extract_email_text(path)
    if not email_text:
        return []

    # use textacy to do the processing: remove the whitespace, punctuation
    email_text = preprocessing.normalize_whitespace(
        preprocessing.remove_punctuation(email_text))
    # remove accents and normalize unicode
    email_text = preprocessing.normalize_unicode(
        preprocessing.remove_accents(email_text))

    # Tokenize the message
    tokens = to_tokenized_text(email_text)

    # Remove stopwords and lemmatize. The original compared the token
    # OBJECT to a set of strings ("w not in ... stopwords"), which can
    # never match; spaCy's stop list is `Defaults.stop_words`, so compare
    # the token's text. (assumes `tokens` are spaCy tokens, as the use of
    # `w.lemma_` indicates -- TODO confirm against to_tokenized_text)
    if len(tokens) > 2:
        return [w.lemma_ for w in tokens
                if w.text not in nlp.Defaults.stop_words]
    return []
def preprocess_doc(self, doc):
    """Normalize *doc* and return a list of cleaned sentence strings.

    Args:
        doc (str): raw input text.

    Returns:
        list[str]: one cleaned string per spaCy-detected sentence.
    """
    # pre-process data
    doc = tp.normalize.normalize_unicode(doc)
    doc = tp.normalize_whitespace(doc)
    doc = tp.normalize_quotation_marks(doc)
    # apply spaCy to tokenize doc
    doc = self.tokenizer(doc)
    # build new sentences for pre-processed doc; a single str.join replaces
    # the quadratic `+=` loop and the trailing-space [:-1] hack while
    # producing byte-identical output
    doc_new = []
    for sent in doc.sents:
        cleaned = [t.text.replace('\n', '').replace('\t', '').strip()
                   for t in sent]
        doc_new.append(' '.join(cleaned))
    return doc_new
def preprocess(text):
    """Strip punctuation from *text*, then collapse and trim whitespace."""
    without_punct = preprocessing.remove_punctuation(text)
    return preprocessing.normalize_whitespace(without_punct)
def _clean(self, text: str):
    """Clean *text*: collapse whitespace, normalize hyphenation and quotes,
    and strip URLs, phone numbers, emails and user handles.

    Note: the large triple-quoted block near the end is an inert string
    expression holding disabled regex experiments, not a docstring.
    """
    txt = text.strip()
    # txt = preprocessing.normalize_unicode(txt, form="NFKC")
    # txt = preprocessing.remove_punctuation(txt)
    # Collapse whitespaces
    txt = preprocessing.normalize_whitespace(txt)
    # Remove newlines
    txt = preprocessing.normalize_repeating_chars(txt, chars="\n", maxn=1)
    # fix hyphen-ated words
    txt = preprocessing.normalize_hyphenated_words(txt)
    txt = preprocessing.normalize_quotation_marks(txt)
    txt = preprocessing.replace_urls(txt, replace_with="")
    txt = preprocessing.replace_phone_numbers(txt, replace_with="")
    txt = preprocessing.replace_emails(txt, replace_with="")
    txt = preprocessing.replace_user_handles(txt, replace_with="")
    txt = preprocessing.normalize_repeating_chars(txt, chars=".,;:-_ ", maxn=1)
    txt = re.sub("\n ", " ", txt)
    txt = re.sub(" \n", " ", txt)
    txt = re.sub("\n", " ", txt)
    # NOTE(review): unescaped "." is a regex wildcard, so this removes ANY
    # single character between spaces (e.g. " a ") -- confirm intended.
    txt = re.sub(" . ", " ", txt)
    # txt = text.encode().decode("unicode-escape")
    # Used ftfy for "fixing" broken text, e.g. Unicode
    # txt = fix_text(txt.strip(), normalization="NFKC")
    # re- minissence => reminissence
    # txt = re.sub(r"([a-z])\-\s{,2}([a-z])", r"\1\2", txt)
    # collapse two+ newlines into single whitespace
    # txt = re.sub(r"\s+\n{1,}\s*(\w)", r" \1", txt)
    # collapse two+ newlines into single whitespace
    # txt = re.sub("\n+", " ", txt)
    """
    # collapse two+ newlines into single whitespace
    txt = re.sub(r"\s+\n{2,}\s*(\w)", r" \1", txt)
    # double-newlines to dots
    txt = re.sub(r"\n\n", ". ", txt)
    # collapse whitespace
    txt = re.sub(r"(\s){2,}", r"\1", txt)
    # collapse dots
    txt = re.sub(r"\.{2,}", ".", txt)
    # newline to whitespace between word characters
    txt = re.sub(r"(\w)\n(\w)", r"\1 \2", txt)
    # newline + open brace to whitespace
    txt = re.sub(r"(\w)\n(\()", r"\1 \2", txt)
    # comma + newline to whitespace
    txt = re.sub(r"(\w)\,\n(\w)", r"\1 \2", txt)
    # Number end of sentence, followed by sentence that starts with number + dot
    txt = re.sub(r"(\d+)\.(\d\.\s+)", r"\1. 
", txt)
    # remove decimals + dot after whitespace followed by whitespace
    txt = re.sub(r"(\.\s*)\d+\.\s+", r"\1", txt)
    # collapse backslashes
    txt = re.sub(r"\\{2,}", r"\\", txt)
    # remove 'escaped backslash' artefacts
    txt = re.sub(r"\\\\", "", txt)
    # remove lowdash artifacts ("lines")
    txt = re.sub(r"_{2,}", r"", txt)
    # normalize newline
    txt = re.sub(r"\r\n", r"\n", txt)
    # Linebreaks starting with numbers \n77\n
    txt = re.sub(r"\n\d+\n", r"\n", txt)
    # remove quotes + decimals on beginning of sentences
    txt = re.sub(r"\.([\"']?)\d+\s+", r".\1", txt)
    # remove quotes + decimals on beginning of sentences
    txt = re.sub(r"\.([\"']?)\d+\s+", r".\1", txt)
    # collapse dots
    txt = re.sub(r"\.\s+\.", ". ", txt)
    # collapse whitespace
    txt = re.sub(r"(\w+)\s{2,}(\w+)", r"\1 \2", txt)
    # Add space+ dot with double quotes
    txt = re.sub(r"\.\"(\w+)", r'.". \1', txt)
    # Add space+ between two sentences
    txt = re.sub(r"([a-z])\.([A-Z])", r"\1. \2", txt)
    """
    return txt
def preprocess_text(text, char_count_filter=True, stopwords=None,
                    min_len=2, max_len=15):
    """
    Pre-processing steps prior to spaCy nlp pipeline. Optional filtering of
    tokens based on character length.

    Parameters
    ----------
    text : str
    char_count_filter : bool
    stopwords : iterable, None
    min_len : int
    max_len : int

    Returns
    -------
    text : str
        pre-processed text

    Raises
    ------
    NotImplementedError
        when char_count_filter is False or stopwords is None.
    """
    # 1) convert to lower case for robust stop-word recognition
    text = text.lower()

    # 2) normalise
    text = preprocessing.normalize_quotation_marks(text)
    # text = preprocessing.normalize_repeating_chars(text)
    text = preprocessing.normalize_hyphenated_words(text)
    text = preprocessing.normalize_whitespace(text)

    # 3) replace
    text = preprocessing.replace_currency_symbols(text)
    text = preprocessing.replace_emails(text)
    text = preprocessing.replace_emojis(text)
    text = preprocessing.replace_hashtags(text)
    text = preprocessing.replace_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    text = preprocessing.replace_urls(text)
    text = preprocessing.replace_user_handles(text)

    # 4) remove
    text = preprocessing.remove_accents(text)
    text = preprocessing.remove_punctuation(text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # keep text and numbers

    # 5) optionally remove tokens based on length
    # logical `and` replaces the original bitwise `&`: same truth table for
    # plain bools, but short-circuiting and the idiomatic form
    if char_count_filter and stopwords is not None:
        # filter based on token length
        tokens = gensim.utils.simple_preprocess(doc=text,
                                                min_len=min_len,
                                                max_len=max_len)
        # filter case-specific words
        tokens = [token for token in tokens if token not in stopwords]
        # convert processed list of tokens back to one string
        text = " ".join(tokens)
    else:
        raise NotImplementedError("Not implemented.")
    return text
def test_normalize_whitespace(test_input, expected_result):
    """Parametrized check of textacy's whitespace normalization."""
    actual = preprocessing.normalize_whitespace(test_input)
    assert actual == expected_result
def clean_tweet(self, text):
    """Normalize a raw tweet into clean, space-separated lowercase-ish text.

    Order matters: unicode repair -> HTML strip -> contraction unpacking ->
    entity replacement -> emoji/smiley expansion -> word splitting ->
    punctuation/number/short-word removal -> whitespace normalization.
    Uses self.SMILEY, a dict mapping ASCII smileys to words.
    """
    # FIXED UNICODE
    # text = preprocess.fix_bad_unicode(text)
    text = ftfy.fix_text(text)
    # GET TEXT ONLY FROM HTML
    text = BeautifulSoup(text, features='lxml').getText()
    # UN-PACK CONTRACTIONS
    text = preprocess.unpack_contractions(text)
    # REMOVE URL
    # text = preprocess.replace_urls(text)
    text = preprocessing.replace_urls(text)
    # REMOVE EMAILS
    # text = preprocess.replace_emails(text)
    text = preprocessing.replace_emails(text)
    # REMOVE PHONE NUMBERS
    # text = preprocess.replace_phone_numbers(text)
    text = preprocessing.replace_phone_numbers(text)
    # REMOVE NUMBERS
    # text = preprocess.replace_numbers(text)
    text = preprocessing.replace_numbers(text)
    # REMOVE CURRENCY
    # text = preprocess.replace_currency_symbols(text)
    text = preprocessing.replace_currency_symbols(text)
    # REMOVE ACCENTS
    # text = preprocess.remove_accents(text)
    text = preprocessing.remove_accents(text)
    # CONVERT EMOJIS TO TEXT: map ASCII smileys via self.SMILEY, then
    # demojize unicode emoji; ":" separators from demojize become spaces
    words = text.split()
    reformed = [
        self.SMILEY[word] if word in self.SMILEY else word
        for word in words
    ]
    text = " ".join(reformed)
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # SPLIT ATTACHED WORDS (CamelCase -> separate words); NOTE(review):
    # this drops any leading text before the first capital letter -- confirm
    text = ' '.join(re.findall('[A-Z][^A-Z]*', text))
    # SPLIT UNDERSCORE WORDS
    text = text.replace('_', ' ')
    # REMOVE PUNCTUATION
    # text = preprocess.remove_punct(text)
    text = preprocessing.remove_punctuation(text)
    # Remove numbers (digits that survived replace_numbers, e.g. inside words)
    text = re.sub(r'\d', '', text)
    # REMOVE WORDS LESS THAN 3 CHARACTERS
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # NORMALIZE WHITESPACE
    # text = preprocess.normalize_whitespace(text)
    text = preprocessing.normalize_whitespace(text)
    return text
def test_normalize_whitespace():
    """normalize_whitespace collapses runs of spaces/tabs and newline groups."""
    raw = "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    expected = "Hello, world! Hello... world?\nHello:\nWorld."
    assert preprocessing.normalize_whitespace(raw) == expected
binance_words = nlp(binance_string)._.combo_basic.sort_values(ascending=False).head(1000) # In[ ]: binance3000.text.to_csv('binance3000_texts.csv') # In[ ]: # %% from textacy import preprocessing df3 = preprocessing.normalize_whitespace(preprocessing.remove_punctuation(df3.text)) # %% import textacy textacy.text_utils.KWIC(strings, "language", window_width=35) # %% # %%
def process_line(self, line: str) -> Optional[str]:
    """Return *line* with its whitespace normalized."""
    normalized = normalize_whitespace(line)
    return normalized