# Demo: list the stop / non-stop tokens of a spaCy-parsed sentence, then show
# how to add and remove a custom stop word in the NLTK and spaCy vocabularies.
# NOTE(review): assumes `nlp` (a loaded spaCy pipeline) is defined earlier in
# the file — confirm before running this section on its own.
sentence = nlp("We will go to movie after the dinner")
print(sentence)

# Tokens that are NOT stop words (original shadowed the result list name with
# the loop variable; renamed the loop variable for clarity).
notStopWords = [token.text for token in sentence if not token.is_stop]
print(notStopWords)

# Tokens that ARE stop words.
stopWords = [token.text for token in sentence if token.is_stop]
print(stopWords)

# Add & remove a custom stop word in the NLTK vocabulary.
import nltk

# nltk.corpus.stopwords.words() returns a plain list, so it is freely mutable.
STOP_WORDS = nltk.corpus.stopwords.words('english')
STOP_WORDS.append('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)

# Removing the word restores the original list (duplicate `import nltk` that
# appeared here in the original was dropped — the module is already imported).
STOP_WORDS.remove('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)

# spaCy keeps its stop words in a set; this rebinds STOP_WORDS to that set
# and adds a custom entry with set.add().
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add("Test")
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
from textacy.preprocess import preprocess_text, replace_numbers, replace_phone_numbers, replace_urls
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser

# spaCy's STOP_WORDS is a set; copy into a list so we can append custom tokens.
STOP_WORDS = list(STOP_WORDS)
STOP_WORDS.append('http')
STOP_WORDS.append('www')


def strip_html(text):
    """Remove HTML markup, if any, and return the plain text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def clean_text(text):
    """Normalize raw text for tokenization.

    Steps: replace newlines and common domain suffixes with spaces, strip
    HTML, then run textacy preprocessing (unicode fixes, accent removal,
    contraction expansion, lowercasing, punctuation/currency removal) and
    finally blank out URLs and numbers.
    """
    # BUG FIX: the original line had an unbalanced ')' after the first
    # replace() and used '/n' where the newline escape '\n' was intended.
    text = text.replace('\n', ' ').replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')
    text = strip_html(text)
    # Remove contractions, accents, punctuation, currency symbols; lowercase.
    # BUG FIX: a stray "), replace_with=' ')" followed this call in the
    # original, breaking the parse — preprocess_text takes no replace_with
    # argument, so it was removed.
    text = preprocess_text(
        text,
        fix_unicode=True,
        no_accents=True,
        no_contractions=True,
        lowercase=True,
        no_punct=True,
        no_currency_symbols=True,
    )
    text = replace_urls(text, replace_with='')
    text = replace_numbers(text, replace_with='')
    return text


def tokenize_text(text):
    """Clean *text* via clean_text() and return its gensim token list."""
    text = clean_text(text)
    return list(tokenize(text))