Code example #1
0
def preprocess(texts):
    """Clean raw text, drop stopwords, and return the model's predicted label(s).

    Pipeline: lowercase -> strip URLs/@mentions -> demojize -> expand
    contractions -> keep only letters/':'/'_' -> collapse repeated chars ->
    tokenize -> lemmatize non-stopwords -> vectorize -> predict.

    Relies on module-level globals: `demojize`, `tokenizer` (spaCy-style:
    tokens expose `.text` and `.lemma_`), `STOP_WORDS` (a set),
    `vectorizer`, `model`, and `label` (a fitted LabelEncoder).

    Parameters
    ----------
    texts : any
        Raw input; coerced to str.

    Returns
    -------
    str
        Space-joined predicted class label(s).
    """
    texts = str(texts)
    texts = texts.lower()
    # Remove URLs and @mentions.
    texts = re.sub(r"(http|@)\S+", " ", texts)
    texts = demojize(texts)
    # Normalize the curly apostrophe so contraction rules below match.
    texts = re.sub(r"’", "'", texts)
    # Expand common contractions ("can't" -> "can not", "we'll" -> "we will").
    texts = re.sub("n't", "n not", texts)
    texts = re.sub("'ll", " will", texts)
    texts = re.sub("'ve", " have", texts)
    # Drop everything except lowercase letters, apostrophe, ':' and '_'
    # (':'/'_' survive so demojize tokens like :smile: stay intact).
    texts = re.sub(r"[^a-z\':_]", " ", texts)
    # NOTE(review): digits are already removed by the rule above; kept for safety.
    texts = re.sub(r"[0-9]+", " ", texts)
    texts = re.sub("re-[a-z]+", " ", texts)
    # Collapse 3+ repeats of any character to a single one ("soooo" -> "so").
    texts = re.sub(r"(.)\1{2,}", r"\1", texts, flags=re.DOTALL)

    tokens = tokenizer(texts)
    # Keep negation words — they carry sentiment. discard() is a no-op when
    # the word is already absent, so no try/except is needed and all three
    # are always processed (the old bare `except: pass` stopped at the
    # first missing word and could leave 'nor'/'no' in the stopword set).
    STOP_WORDS.discard('not')
    STOP_WORDS.discard('nor')
    STOP_WORDS.discard('no')

    # BUGFIX: compare the token's text, not the Token object itself —
    # a spaCy Token never equals a plain string, so the original filter
    # removed nothing.
    lemma_list = [token.lemma_ for token in tokens
                  if token.text not in STOP_WORDS]
    texts = ' '.join(map(str, lemma_list))
    pred_vect = vectorizer.transform([texts])
    # Map the numeric prediction back to the human-readable class label(s).
    texts = label.classes_[model.predict(pred_vect)]
    return ' '.join(map(str, texts))
Code example #2
0
print(notStopWords)  # NOTE(review): `notStopWords` is defined outside this snippet

# Collect the text of every stop-word token in `sentence` (a spaCy Doc,
# defined outside this snippet).
stopWords = [stopWords.text for stopWords in sentence if stopWords.is_stop]
print(stopWords)

# --- Add and remove a stop word using NLTK's English stop-word LIST ---
import nltk
STOP_WORDS = nltk.corpus.stopwords.words('english')  # a plain Python list
STOP_WORDS.append('Test')

print(len(STOP_WORDS))  # length grew by one
print(STOP_WORDS)

import nltk  # NOTE(review): duplicate import — redundant but harmless

STOP_WORDS.remove('Test')  # undo the append above; list is back to normal

print(len(STOP_WORDS))
print(STOP_WORDS)

# --- Rebind STOP_WORDS to spaCy's stop-word SET and add a word there ---
# (from here on, STOP_WORDS is a set, not the NLTK list)
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add("Test")  # sets use .add(), not .append()

print(len(STOP_WORDS))
print(STOP_WORDS)

import spacy  # NOTE(review): duplicate of the two imports just above
from spacy.lang.en.stop_words import STOP_WORDS
Code example #3
0
# Print each stop-word token found in the parsed sentence.
ex1 = nlp("How do I keep looping through until the len(new_list) = len(data_list) (i.e. all the numbers are in the new list) with everything sorted without using the built in max, min, sort functions? I'm not sure if it's necessary to create a new list either.")
for tok in ex1:
    if not tok.is_stop:
        continue
    print(tok)


# Same filtering, but collected into a list with an explicit loop.
mylist = []
for tok in ex1:
    if tok.is_stop:
        mylist.append(tok)

# Mutate spaCy's stop-word set and observe the effect via the vocab.
print(nlp.vocab['lamao'].is_stop)
STOP_WORDS.add('lol')
print(nlp.vocab['lol'].is_stop)
STOP_WORDS.remove('lol')
print(nlp.vocab['lol'].is_stop)




########################################################
# Noun-chunk inspection: first the full chunk text, then only each
# chunk's syntactic root word.
docs = nlp('Aditya went to the Tajmahal in the Agra and ate icecream there')
for chunk in docs.noun_chunks:
    print(chunk.text)  # includes determiners such as 'the'

for chunk in docs.noun_chunks:
    print(chunk.root.text)

for token in docs.noun_chunks: