# Demo: list the stop / non-stop tokens of a spaCy-parsed sentence, then show
# how to add and remove a custom stop word in the NLTK and spaCy vocabularies.
# NOTE(review): assumes `nlp` (a loaded spaCy pipeline) is defined earlier in
# the file — confirm before running this section on its own.
sentence = nlp("We will go to movie after the dinner")
print(sentence)

# Tokens that are NOT stop words (original shadowed the result list name with
# the loop variable; renamed the loop variable for clarity).
notStopWords = [token.text for token in sentence if not token.is_stop]
print(notStopWords)

# Tokens that ARE stop words.
stopWords = [token.text for token in sentence if token.is_stop]
print(stopWords)

# Add & remove a custom stop word in the NLTK vocabulary.
import nltk

# nltk.corpus.stopwords.words() returns a plain list, so it is freely mutable.
STOP_WORDS = nltk.corpus.stopwords.words('english')
STOP_WORDS.append('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)

# Removing the word restores the original list (duplicate `import nltk` that
# appeared here in the original was dropped — the module is already imported).
STOP_WORDS.remove('Test')
print(len(STOP_WORDS))
print(STOP_WORDS)

# spaCy keeps its stop words in a set; this rebinds STOP_WORDS to that set
# and adds a custom entry with set.add().
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add("Test")
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
from textacy.preprocess import preprocess_text, replace_numbers, replace_phone_numbers, replace_urls
from gensim.utils import to_utf8, tokenize
from gensim.models.phrases import Phrases, Phraser

# spaCy's STOP_WORDS is a set; copy into a list so we can append custom tokens.
STOP_WORDS = list(STOP_WORDS)
STOP_WORDS.append('http')
STOP_WORDS.append('www')


def strip_html(text):
    """Remove HTML markup, if any, and return the plain text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def clean_text(text):
    """Normalize raw text for tokenization.

    Steps: replace newlines and common domain suffixes with spaces, strip
    HTML, then run textacy preprocessing (unicode fixes, accent removal,
    contraction expansion, lowercasing, punctuation/currency removal) and
    finally blank out URLs and numbers.
    """
    # BUG FIX: the original line had an unbalanced ')' after the first
    # replace() and used '/n' where the newline escape '\n' was intended.
    text = text.replace('\n', ' ').replace('.com', ' ').replace('.org', ' ').replace('.net', ' ')
    text = strip_html(text)
    # Remove contractions, accents, punctuation, currency symbols; lowercase.
    # BUG FIX: a stray "), replace_with=' ')" followed this call in the
    # original, breaking the parse — preprocess_text takes no replace_with
    # argument, so it was removed.
    text = preprocess_text(
        text,
        fix_unicode=True,
        no_accents=True,
        no_contractions=True,
        lowercase=True,
        no_punct=True,
        no_currency_symbols=True,
    )
    text = replace_urls(text, replace_with='')
    text = replace_numbers(text, replace_with='')
    return text


def tokenize_text(text):
    """Clean *text* via clean_text() and return its gensim token list."""
    text = clean_text(text)
    return list(tokenize(text))