Example #1
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer


def text_process(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [
        word for word in nopunc.split()
        if word not in stopwords.words('english')
    ]


def freq_words(x, terms):
    # Join every document into one string, split into words, and count them
    all_words = ' '.join([text for text in x]).split()
    fdist = nltk.FreqDist(all_words)

    words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})

    # Plot the `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)

    plt.figure(figsize=(12, 15))
    ax = sns.barplot(data=d, x="count", y="word")
    ax.set(ylabel='Word')
    plt.show()


freq_words(movies_new['clean_plot'],100)

stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)


movies_new['clean_plot'] = movies_new['clean_plot'].apply(lambda x: remove_stopwords(x))


multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(movies_new['genre_new'])

y = multilabel_binarizer.transform(movies_new['genre_new'])
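# A follow-up sketch, not part of the original snippet: one common way to use the
# binarized labels above is a TF-IDF + one-vs-rest logistic regression baseline.
# The vectorizer and split settings here are illustrative assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

tfidf = TfidfVectorizer(max_df=0.8, max_features=10000)
X = tfidf.fit_transform(movies_new['clean_plot'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

# Map a few predictions back to genre names
y_pred = clf.predict(X_test)
print(multilabel_binarizer.inverse_transform(y_pred)[:5])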
Example #3
from pathlib import Path
from textblob import TextBlob
from textblob import Word

blob = TextBlob(Path('RomeoAndJuliet.txt').read_text())

blob.words.count('Joy')

happy = Word('happy')

# print(happy.definitions)

print(happy.synsets)

synonyms = set()

for synset in happy.synsets:
    for lemma in synset.lemmas():
        synonyms.add(lemma.name())

print(synonyms)

from nltk.corpus import stopwords
stops = stopwords.words("english")
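
# A follow-up sketch, not in the original snippet: drop the stop words and look
# at the most frequent remaining words in the play (word_counts is TextBlob's
# built-in lower-cased frequency dictionary).
from operator import itemgetter

items = [item for item in blob.word_counts.items() if item[0] not in stops]
top20 = sorted(items, key=itemgetter(1), reverse=True)[:20]
print(top20)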

Example #4
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from matplotlib.colors import ListedColormap
# pip install wordcloud (in the Anaconda prompt)
from wordcloud import WordCloud

# creating the corpus
corpus = PlaintextCorpusReader("C:/Users/Alex/Desktop/Projetos/Python/dados/mineracao_texto", ".*")
# listing the file ids
arquivos = corpus.fileids()
# viewing the first ten files
arquivos[0:10]
# accessing the text of a specific file
texto_de_um_arquivo = corpus.raw("1.txt")
texto_do_corpus = corpus.raw()
# viewing the words
palavras = corpus.words()
# words with no semantic value (stop words)
stops = stopwords.words('english')
# creating the word cloud
nuvem = WordCloud(background_color='white',
                  colormap=ListedColormap(['orange', 'green', 'red', 'magenta']),
                  stopwords=stops,
                  max_words=100)
nuvem.generate(texto_do_corpus)
plt.imshow(nuvem)
plt.show()
# viewing the most frequent terms
palavras_stop_word = [p for p in palavras if p not in stops]
frequencia = nltk.FreqDist(palavras_stop_word)
mais_comuns = frequencia.most_common(100)
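
# A follow-up sketch, not in the original snippet: a quick horizontal bar chart
# of the 20 most common terms using pandas.
import pandas as pd

freq_df = pd.DataFrame(mais_comuns[:20], columns=['word', 'count'])
freq_df.plot.barh(x='word', y='count', figsize=(10, 8))
plt.show()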


Example #5
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


def tokenize_words(input):
    # Lower-case the text, keep only word characters, and drop English stop words
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    filtered = filter(lambda token: token not in stopwords.words("english"), tokens)
    return " ".join(filtered)
Example #6
from pathlib import Path
from textblob import TextBlob
from textblob import Word

blob = TextBlob(Path('RomeoAndJuliet.txt').read_text())

print(blob.words.count('Joy'))  # counts how many times 'joy' appears in the play

happy = Word('happy')
print(happy.definitions)  # looks the word up in WordNet (a lexical database built at Princeton) and prints its definitions

print(happy.synsets)  # gets the synsets -- each one groups synonymous Lemma objects

synonyms = set()  # by using a set we eliminate duplicates
for synset in happy.synsets:
    for lemma in synset.lemmas():  # each Lemma object represents one synonym in the synset
        synonyms.add(lemma.name())  # Lemma.name() returns the synonym as a plain string
print(synonyms)  # just the words, without the Synset/Lemma wrappers

#########################################################################################################
# pip install nltk
from nltk.corpus import stopwords

stops = stopwords.words("english")  # all the stop words in the English language
Example #7
import email
from nltk import word_tokenize
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))


def feature_extraction(email_text):
    # Parse the raw email text and return its payload (the message body)
    b = email.message_from_string(email_text)
    return b.get_payload()
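

# A minimal usage sketch (the sample message is made up):
raw_email = "Subject: hello\n\nFree prize inside, claim now!"
body = feature_extraction(raw_email)
tokens = [w for w in word_tokenize(body.lower()) if w.isalpha() and w not in stop]
print(tokens)  # -> ['free', 'prize', 'inside', 'claim']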
    
Example #8
messages.hist(column='length', by='label', bins=60, figsize=(12, 4))

mess = 'sample message ! notice that: it contains punctuations'

nopunc = [c for c in mess if c not in string.punctuation]
print(nopunc)

# stopwords.words('english')

nopunc = ''.join(nopunc)

nopunc.split()
clean_mess = [
    word for word in nopunc.split()
    if word.lower() not in stopwords.words('english')
]


# Text processing part
def text_process(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [
        word for word in nopunc.split()
        if word not in stopwords.words('english')
    ]


messages.head().apply(text_process)
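
# A follow-up sketch, not in the original snippet: use text_process as a custom
# analyzer for a bag-of-words model. The 'message' column name is a guess about
# the DataFrame used above.
from sklearn.feature_extraction.text import CountVectorizer

bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])
print(len(bow_transformer.vocabulary_))
messages_bow = bow_transformer.transform(messages['message'])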
Example #9
import re

import pandas as pd
from os.path import dirname
from os.path import join
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from pyspark import SparkConf, SparkContext

# Read the input file
app_path = dirname(__file__)
raw_data = pd.read_csv(join(app_path, 'proposistions.csv'),
                       skip_blank_lines=True)
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize((" ".join(raw_data['text'])).lower())

# Exclude stop words
fr_stops = set(stopwords.words('french'))
tokens = [
    re.sub(r'[\W_]+', ' ', token) for token in tokens if token not in fr_stops
]

# Initialize Spark context
conf = SparkConf().setMaster("local").setAppName("Wordcount")
sc = SparkContext(conf=conf)
spark_rdd = sc.parallelize(tokens)

# Spark Map Reduce
Wordcounts = spark_rdd.flatMap(lambda line: line.split(" ")) \
                               .map(lambda word: (word, 1)) \
                               .reduceByKey(lambda a, b: a + b) \
                               .map(lambda x:(x[1],x[0])) \
                               .sortByKey(False)
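
# A follow-up sketch, not in the original snippet: print the 20 most frequent
# words from the (count, word) pairs produced above.
for count, word in Wordcounts.take(20):
    print(word, count)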
Example #10
def stopwords_removal(obj):
    # Union of the English and French stop-word lists
    e_f_stop_words = set(stopwords.words('english')) | set(stopwords.words('french'))
    return set(word.lower() for word in obj if word not in e_f_stop_words)
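

# A minimal usage sketch (the sample tokens are made up):
tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'et', 'le', 'chien']
print(stopwords_removal(tokens))  # -> {'cat', 'sat', 'mat', 'chien'}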
Example #11
import nltk
from bs4 import BeautifulSoup
import urllib.request
from nltk.corpus import stopwords

response = urllib.request.urlopen('https://en.wikipedia.org/wiki/SpaceX')
html = response.read()
print(html)

soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(strip=True)

print(text)

tokens = [t for t in text.split()]
print(tokens)

sr = stopwords.words('english')

clean_tokens = tokens[:]
for token in tokens:
    if token in sr:
        clean_tokens.remove(token)

freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

freq.plot(20, cumulative=False)
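
# A follow-up sketch, not in the original snippet: keep only alphabetic tokens
# so punctuation and markup fragments do not dominate the frequency counts.
alpha_tokens = [t for t in clean_tokens if t.isalpha()]
freq_alpha = nltk.FreqDist(alpha_tokens)
print(freq_alpha.most_common(20))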