import re

def clean_component(review, contract_model, stop_words, tokenizer, puncts):
    """Text cleaner: expand contractions, tokenize, remove stopwords and
    punctuation, lemmatize, spell-correct, lowercase."""
    # Expand contractions ("won't" -> "will not"); expand_texts returns a generator
    rev_contract_exp = list(contract_model.expand_texts([review], precise=True))
    doc_tok = tokenizer(rev_contract_exp[0])
    # Lemmatize, dropping stopwords, punctuation, and symbols
    doc_lems = [
        tok.lemma_
        for tok in doc_tok
        if (tok.text not in stop_words
            and tok.text not in puncts
            and tok.pos_ != "PUNCT"
            and tok.pos_ != "SYM")
    ]
    # Keep hyphenated tokens (minus any surrounding parentheses) as-is;
    # spell-correct everything else. `spell` and `remove_punct` are assumed
    # module-level helpers (e.g. autocorrect.spell, textacy's remove_punct).
    lem_list = [
        re.search(r'\(?([0-9A-Za-z-]+)\)?', tok).group(1)
        if '-' in tok
        else spell(remove_punct(tok))
        for tok in doc_lems
    ]
    # Lowercase, split multi-word corrections, and filter stopwords again
    doc2vec_input = [
        t.lower()
        for tok in lem_list
        for t in tok.split()
        if t.lower() not in stop_words
    ]
    return doc2vec_input
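# Hypothetical wiring for clean_component. Every package choice below is an
# assumption; the snippet only shows call signatures: spaCy as the tokenizer,
# pycontractions for `contract_model`, and autocorrect (< 2.0, which still
# exported `spell`) plus textacy's remove_punct as the globals it relies on.
import string
import spacy
from autocorrect import spell
from pycontractions import Contractions
from textacy.preprocess import remove_punct

nlp = spacy.load('en_core_web_sm')
cont = Contractions(api_key='glove-twitter-100')  # downloads a gensim model
cont.load_models()
tokens = clean_component("I won't be buying this again!", cont,
                         nlp.Defaults.stop_words, nlp, set(string.punctuation))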
import re
import emoji
from bs4 import BeautifulSoup
from textacy import preprocess  # textacy < 0.8, where these helpers lived

def clean_tweet(self, text):
    # FIX BAD UNICODE
    text = preprocess.fix_bad_unicode(text)
    # GET TEXT ONLY FROM HTML
    text = BeautifulSoup(text, features='lxml').getText()
    # UNPACK CONTRACTIONS
    text = preprocess.unpack_contractions(text)
    # REMOVE URLS
    text = preprocess.replace_urls(text)
    # REMOVE EMAILS
    text = preprocess.replace_emails(text)
    # REMOVE PHONE NUMBERS
    text = preprocess.replace_phone_numbers(text)
    # REMOVE NUMBERS
    text = preprocess.replace_numbers(text)
    # REMOVE CURRENCY SYMBOLS
    text = preprocess.replace_currency_symbols(text)
    # REMOVE ACCENTS
    text = preprocess.remove_accents(text)
    # CONVERT EMOTICONS AND EMOJIS TO TEXT
    words = text.split()
    reformed = [
        self.SMILEY[word] if word in self.SMILEY else word for word in words
    ]
    text = " ".join(reformed)
    text = emoji.demojize(text)
    text = text.replace(":", " ")
    text = ' '.join(text.split())
    # SPLIT ATTACHED (CamelCase) WORDS; the original
    # ' '.join(re.findall('[A-Z][^A-Z]*', text)) discarded any text before
    # the first capital letter, so split on case boundaries instead
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    # SPLIT UNDERSCORE-JOINED WORDS
    text = text.replace('_', ' ')
    # REMOVE PUNCTUATION
    text = preprocess.remove_punct(text)
    # REMOVE REMAINING DIGITS
    text = re.sub(r'\d', '', text)
    # REMOVE WORDS SHORTER THAN 3 CHARACTERS
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # NORMALIZE WHITESPACE
    text = preprocess.normalize_whitespace(text)
    return text
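# Hypothetical harness for clean_tweet: the method only depends on
# self.SMILEY, an emoticon-to-word map, so a minimal class is enough to run
# it. The mapping below is an illustrative stub, not the original table.
class TweetCleaner:
    SMILEY = {':)': 'smile', ':(': 'sad', ':D': 'laugh', ';)': 'wink'}
    clean_tweet = clean_tweet  # bind the module-level function as a method

print(TweetCleaner().clean_tweet('Check ThisOut :) at https://t.co/abc <b>now</b>!'))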
import glob
from textacy import preprocess

def load_stopwords():
    """Collect stopwords from stopwords/*.txt plus the built-in STOPWORDS set.

    `preprocess_unicode` and `STOPWORDS` are assumed to be defined elsewhere
    in the module (a unicode-normalizing helper and e.g. gensim's stopword set).
    """
    stopwords = []
    for filename in glob.glob('stopwords/*.txt'):
        # Open as UTF-8 text; the original Python 2 code called
        # line.decode('utf8'), which fails on str under Python 3
        with open(filename, encoding='utf-8') as fileobj:
            for line in fileobj:
                line = preprocess_unicode(line.strip())
                line = preprocess.remove_punct(line)
                if line:
                    stopwords.append(line)
    return stopwords + list(STOPWORDS)
def clean_text(self, raw_text):
    # `preprocess` is textacy's preprocess module (< 0.8); strip_tags is an
    # HTML-stripping helper assumed to live on the same class
    raw_text = self.strip_tags(raw_text)
    raw_text = raw_text.lower()
    # Replace structured entities before stripping punctuation; the original
    # removed punctuation first, which broke the URL/email/phone patterns
    raw_text = preprocess.replace_urls(raw_text, replace_with='')
    raw_text = preprocess.replace_emails(raw_text, replace_with='')
    raw_text = preprocess.replace_phone_numbers(raw_text, replace_with='')
    raw_text = preprocess.replace_numbers(raw_text, replace_with='')
    raw_text = preprocess.replace_currency_symbols(raw_text, replace_with='')
    raw_text = preprocess.transliterate_unicode(raw_text)
    raw_text = preprocess.remove_punct(raw_text)
    return raw_text
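# The textacy replace_* helpers substitute a placeholder by default ('*URL*',
# '*EMAIL*', ...); passing replace_with='' deletes the match instead. A quick
# check, assuming textacy < 0.8:
from textacy import preprocess
print(preprocess.replace_urls('see https://example.com now', replace_with=''))
# -> 'see  now'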
def test_remove_punct_marks():
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    proc_text = "I can t. No, I won t! It s a matter of principle ; of what s the word? conscience."
    assert preprocess.remove_punct(text, marks="-'\"") == proc_text
def test_remove_punct():
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    proc_text = "I can t No I won t It s a matter of principle of what s the word conscience "
    assert preprocess.remove_punct(text) == proc_text
def test_remove_punct(self):
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    proc_text = "I cant No I wont Its a matter of principle of whats the word conscience"
    self.assertEqual(preprocess.remove_punct(text), proc_text)
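# The two no-marks tests above expect different behaviors: the assert-based
# test matches a textacy version that translates each punctuation character
# to a space, while the unittest variant matches one that strips punctuation
# outright. A minimal sketch of both, assuming str.translate with a table
# built from the Unicode punctuation categories:
import re
import sys
import unicodedata

# Map every Unicode punctuation codepoint to a space; use None as the value
# instead to reproduce the strip-outright behavior
PUNCT_TO_SPACE = dict.fromkeys(
    (i for i in range(sys.maxunicode)
     if unicodedata.category(chr(i)).startswith('P')),
    ' ')

def remove_punct_sketch(text, marks=None):
    if marks:
        # Replace only the listed marks, as in test_remove_punct_marks
        return re.sub('[{}]+'.format(re.escape(marks)), ' ', text)
    return text.translate(PUNCT_TO_SPACE)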