def preprocess(texts):
    """Clean, tokenize, lemmatize and classify a raw text string.

    Returns the predicted class label(s) as a space-joined string.
    Relies on module-level objects defined elsewhere in this file:
    ``tokenizer``, ``STOP_WORDS``, ``vectorizer``, ``label``, ``model``
    and ``demojize`` — TODO confirm their definitions match this usage.
    """
    texts = str(texts)
    texts = texts.lower()
    # Strip URLs and @mentions.
    texts = re.sub(r"(http|@)\S+", " ", texts)
    # Turn emoji into ":emoji_name:" tokens.
    texts = demojize(texts)
    # Normalise curly apostrophes, then expand common contractions so the
    # negation word "not" survives stop-word filtering ("can't" -> "can not").
    texts = re.sub(r"’", "'", texts)
    texts = re.sub("n't", "n not", texts)
    texts = re.sub("'ll", " will", texts)
    texts = re.sub("'ve", " have", texts)
    # Keep only lowercase letters, apostrophes, and the ':'/'_' characters
    # that appear inside demojized tokens; digits are removed here too,
    # so the following numeric pass is belt-and-braces.
    texts = re.sub(r"[^a-z\':_]", " ", texts)
    texts = re.sub(r"[0-9]+", " ", texts)
    texts = re.sub("re-[a-z]+", " ", texts)
    # Collapse runs of 3+ identical characters to one ("soooo" -> "so").
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = re.sub(pattern, r"\1", texts)
    tokens = tokenizer(texts)
    # Keep negation words — they carry sentiment. Attempt each removal
    # independently (the original wrapped all three in one bare
    # ``except: pass``, so a failure on 'not' silently skipped 'nor' and
    # 'no', and the bare except masked any real error). Catch only what
    # ``remove`` can raise: KeyError for a set, ValueError for a list.
    for negation in ('not', 'nor', 'no'):
        try:
            STOP_WORDS.remove(negation)
        except (KeyError, ValueError):
            pass
    # NOTE(review): tokens are presumably spaCy Token objects (they expose
    # ``.lemma_``); membership against string stop words relies on Token
    # string comparison — verify against the tokenizer's definition.
    lemma_list = [token.lemma_ for token in tokens if token not in STOP_WORDS]
    texts = ' '.join(map(str, lemma_list))
    # Vectorize the cleaned text and map the model's prediction back to
    # its human-readable class label(s).
    pred_vect = vectorizer.transform([texts])
    texts = label.classes_[model.predict(pred_vect)]
    texts = ' '.join(map(str, texts))
    return texts
# Inspect the token lists computed in a previous cell (``notStopWords``
# and ``sentence`` are defined earlier in this notebook/file).
print(notStopWords)
# Collect the tokens spaCy flags as stop words.
stopWords = [stopWords.text for stopWords in sentence if stopWords.is_stop]
print(stopWords)
# Add & Remove a new Stop Word
import nltk
# NLTK's stop-word collection is a plain Python list, hence append()/remove().
STOP_WORDS = nltk.corpus.stopwords.words('english')
STOP_WORDS.append('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)
import nltk
STOP_WORDS.remove('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)
import spacy
# NOTE(review): this rebinds STOP_WORDS from NLTK's list to spaCy's set —
# any later code using STOP_WORDS sees the spaCy set, not the NLTK list.
from spacy.lang.en.stop_words import STOP_WORDS
# spaCy's stop words are a set, hence add()/remove() instead of append().
STOP_WORDS.add("Test")
print(len(STOP_WORDS))
print(STOP_WORDS)
# NOTE(review): duplicate imports below are redundant (imports are
# idempotent) — likely a leftover notebook-cell boundary.
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
#filtering the stopwords ex1 = nlp("How do I keep looping through until the len(new_list) = len(data_list) (i.e. all the numbers are in the new list) with everything sorted without using the built in max, min, sort functions? I'm not sure if it's necessary to create a new list either.") for word in ex1: if word.is_stop: print(word) #another way mylist = [word for word in ex1 if word.is_stop] #adding/removing stopwords print(nlp.vocab['lamao'].is_stop) STOP_WORDS.add('lol') print(nlp.vocab['lol'].is_stop) STOP_WORDS.remove('lol') print(nlp.vocab['lol'].is_stop) ######################################################## docs = nlp('Aditya went to the Tajmahal in the Agra and ate icecream there') for token in docs.noun_chunks: print(token.text) #it wll print 'the' for token in docs.noun_chunks: print(token.root.text) #it will print with the for token in docs.noun_chunks: