def prepare_dictionary(self, questions):
    # NLTK stop words for Portuguese
    stop_words = nltk.corpus.stopwords.words('portuguese')
    # Strip accented characters from the stop words.
    # Two dots ('..') are not split off from a word during tokenization,
    # but a single dot ('.') and three dots ('...') are; hence the replace
    # below swaps '..' for '...'.
    stop_words = [self.__remove_special_chars(w.replace('..', '...'))
                  for w in stop_words]
    # Tokenize each question
    questions = questions.str.lower()
    phrases = [self.__tokenize_phrase(question) for question in questions]
    stemmer = nltk.RSLPStemmer()
    dicionario = set()
    # For each word in each phrase, strip the suffix (when the word is not
    # a stop word) and add the stem to the dictionary
    for phrase in phrases:
        valid_words = [stemmer.stem(word) for word in phrase
                       if word not in stop_words and len(word) > 1]
        dicionario.update(valid_words)
    # Return the set of stems built from the non-stop-words
    return dicionario
def _get_text_radicals(self, text):
    # 'tb' is assumed to be an alias for textblob.TextBlob
    stemmer = nltk.RSLPStemmer()
    blob = tb(text)
    radicals = [stemmer.stem(word) for word in blob.words]
    text_radicals = " ".join(radicals)
    return text_radicals
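# A hedged usage sketch (an assumption, not from the source): stems a short
# Portuguese phrase through TextBlob's tokenizer. Requires textblob and the
# NLTK 'rslp' resource.
# from textblob import TextBlob as tb
# obj._get_text_radicals('comprei livros interessantes')
# -> e.g. 'compr livr interess'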
def apply_preprocessors(self, column):
    stemmer = nltk.RSLPStemmer()
    whitespace_tokenizer = tokenize.WhitespaceTokenizer()
    processed_sentences = list()
    for each in tqdm(column, desc="StemmerPreprocessor"):
        # Split on whitespace and stem every token
        filtered_sentence = list()
        wordish = whitespace_tokenizer.tokenize(each)
        for item in wordish:
            filtered_sentence.append(stemmer.stem(item))
        processed_sentences.append(' '.join(filtered_sentence))
    return processed_sentences
def encode_text(self, text, words_encoded):
    # Bag-of-words vector with one slot per known stem
    vector = [0] * len(words_encoded)
    words = self.__tokenize_phrase(text.replace('..', '...'))
    stemmer = nltk.RSLPStemmer()
    for word in words:
        if len(word) > 0:
            stemmed_word = stemmer.stem(word)
            if stemmed_word in words_encoded:
                index = words_encoded[stemmed_word]
                vector[index] += 1
    return vector
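# A minimal standalone sketch (an assumption, not the project's own code) of
# the build-dictionary-then-encode flow above, using nltk's word_tokenize in
# place of the private __tokenize_phrase helper.
import nltk
from nltk.tokenize import word_tokenize

def sketch_encode(texts, query):
    stemmer = nltk.RSLPStemmer()
    # Collect the stems of all words longer than one character
    stems = set()
    for text in texts:
        stems.update(stemmer.stem(w)
                     for w in word_tokenize(text.lower(), language='portuguese')
                     if len(w) > 1)
    # Fix an index per stem, then count the query's stems into a vector
    words_encoded = {stem: i for i, stem in enumerate(sorted(stems))}
    vector = [0] * len(words_encoded)
    for w in word_tokenize(query.lower(), language='portuguese'):
        s = stemmer.stem(w)
        if s in words_encoded:
            vector[words_encoded[s]] += 1
    return vector

# sketch_encode(['Quero cancelar o pedido'], 'cancelar pedido')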
def stemming(df):
    # 'punctuation_token' (a punctuation-aware tokenizer) and
    # 'without_accents_stop_words' (accent-stripped Portuguese stop words)
    # are globals defined elsewhere in the module
    stemmer = nltk.RSLPStemmer()
    processed_sentence = list()
    for sentence in df.sentence:
        new_sentence = list()
        words = punctuation_token.tokenize(sentence)
        for word in words:
            if word not in without_accents_stop_words:
                new_sentence.append(stemmer.stem(word))
        processed_sentence.append(' '.join(new_sentence))
    return processed_sentence
def remove_stop_words(text):
    # Normalize: strip accents and lowercase
    text = unidecode.unidecode(text).lower()
    token_punct = tokenize.WordPunctTokenizer()
    tokens = token_punct.tokenize(text)
    stemmer = nltk.RSLPStemmer()
    # Match both accented and accent-stripped stop words, plus punctuation
    words = nltk.corpus.stopwords.words('portuguese')
    words_without_accent = [unidecode.unidecode(item) for item in words]
    stopwords = words + words_without_accent + list(punctuation)
    without_stop_words = [stemmer.stem(item)
                          for item in tokens
                          if item not in stopwords]
    return " ".join(without_stop_words)
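# Hedged usage sketch (assumption): requires unidecode, nltk.tokenize as
# 'tokenize', string.punctuation as 'punctuation', and the NLTK 'stopwords'
# and 'rslp' resources.
# remove_stop_words('Não gostei do filme, é muito ruim!')
# -> e.g. 'gost film ruim'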
def generate_punct_token_stemmed(self):
    self.punct_token = nltk.tokenize.WordPunctTokenizer()
    stemmer = nltk.RSLPStemmer()
    # Treat punctuation marks and the irrelevant-word list as stop words
    punctuation_list = list(punctuation)
    self.punctuation_stopwords = punctuation_list + self.irelevant_words
    processed_phrase = list()
    for opinion in self.review.text_pt:
        new_phrase = list()
        opinion = opinion.lower()
        text_words = self.punct_token.tokenize(opinion)
        for word in text_words:
            if word not in self.punctuation_stopwords:
                new_phrase.append(stemmer.stem(word))
        processed_phrase.append(' '.join(new_phrase))
    self.review["treatment_5"] = processed_phrase
    return self.review, self.punct_token
def calculate_entropy(df, column):
    # 'pre_process_data' is a helper defined elsewhere; 'ss' is scipy.stats
    df = pre_process_data(df, column, language='portuguese')
    st = nltk.RSLPStemmer()
    # Tokenize each document, stem every token, and flatten into one list
    tweet_list_tokenized = [nltk.tokenize.word_tokenize(text, 'portuguese')
                            for text in df[column]]
    tweet_list = [st.stem(token)
                  for tweet in tweet_list_tokenized
                  for token in tweet]
    total_terms = len(tweet_list)
    # Count each distinct stem, then turn counts into probabilities
    count_termos = dict()
    for term in set(tweet_list):
        count_termos[term] = tweet_list.count(term)
    probabilities = [count / total_terms for count in count_termos.values()]
    # Shannon entropy (base 2) of the stem distribution
    entropy_df = ss.entropy(probabilities, base=2)
    return entropy_df
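# A self-contained sketch (assumption: it skips the project's
# pre_process_data helper) of the same computation: the Shannon entropy of
# the RSLP stem distribution over a list of texts.
import nltk
import scipy.stats as ss
from collections import Counter

def stem_entropy(texts):
    st = nltk.RSLPStemmer()
    stems = [st.stem(tok)
             for text in texts
             for tok in nltk.tokenize.word_tokenize(text, 'portuguese')]
    counts = Counter(stems)
    total = sum(counts.values())
    probabilities = [c / total for c in counts.values()]
    return ss.entropy(probabilities, base=2)  # entropy in bits

# stem_entropy(['adorei o filme', 'odiei o filme'])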
def remove_palavras_irrelevante():
    # 'resenha' (the reviews DataFrame) and 'separador_frase' (a tokenizer)
    # are globals defined elsewhere
    palavras_irrelevantes = nltk.corpus.stopwords.words("portuguese")
    frase_processada = list()
    for opiniao in resenha.text_pt:
        nova_frase = list()
        opiniao = opiniao.lower()
        palavras_texto = separador_frase(opiniao)
        for palavra in palavras_texto:
            if palavra not in palavras_irrelevantes:
                nova_frase.append(palavra)
        frase_processada.append(' '.join(nova_frase))
    resenha["Tratamento"] = frase_processada
    # Above: removes irrelevant (stop) words
    # Below: removes punctuation
    pontuacao = list(punctuation)
    pontuacao_stopwords = pontuacao + palavras_irrelevantes
    frase_processada = list()
    for opiniao in resenha["Tratamento"]:
        nova_frase = list()
        palavras_texto = separador_frase(opiniao)
        for palavra in palavras_texto:
            if palavra not in pontuacao_stopwords:
                nova_frase.append(palavra)
        frase_processada.append(' '.join(nova_frase))
    resenha["Tratamento_2"] = frase_processada
    # Collapse words with similar semantics by stemming them
    stemmer = nltk.RSLPStemmer()
    frase_processada = list()
    for opiniao in resenha["Tratamento_2"]:
        nova_frase = list()
        palavras_texto = separador_frase(opiniao)
        for palavra in palavras_texto:
            if palavra not in pontuacao_stopwords:
                nova_frase.append(stemmer.stem(palavra))
        frase_processada.append(' '.join(nova_frase))
    resenha["Tratamento_3"] = frase_processada
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

df = pd.read_csv('train/portuguese/input_portuguese.csv')
df = df.dropna()
X = df.drop('label', axis=1)
y = df['label'].map(lambda x: 1 if x != 'fake' else 0).tolist()
X.reset_index(inplace=True)
nltk.download('stopwords')
nltk.download('rslp')
ps = nltk.RSLPStemmer()
# Build the corpus: strip non-letters, lowercase, drop stop words, stem
stop_words = set(stopwords.words('portuguese'))
corpus = []
for i in range(0, len(X)):
    review = re.sub('[^a-zA-Z]', ' ', 'os ' + X['text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
    print(f'Processing {i} of {len(X)}')
vocabulary_size = 5000
def _get_radical(self, palavra):
    radicalizador = nltk.RSLPStemmer()
    radical = radicalizador.stem(palavra)
    return radical
def summary_keywords(self):
    # Lazily yield the RSLP stem of every token in the tokenized summary
    stemmer = nltk.RSLPStemmer()
    for sent in self.summary_tokens:
        for token in sent:
            yield stemmer.stem(token)
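# Hedged usage note (assumption): summary_tokens holds one token list per
# sentence, so the generator flattens the summary; a caller can collect the
# distinct stems with, e.g., set(obj.summary_keywords()).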
def predict_pt(body):
    # 'predict' is the project's own inference helper, defined elsewhere
    prediction = predict(body['content'],
                         '/app/neural_net/train/portuguese/output_portuguese',
                         nltk.RSLPStemmer(),
                         stopwords.words('portuguese'))
    return {'status': 'ok', 'prediction': prediction}, 200
# Select the paragraphs at indexes 2 and 3
paragraph_analysis = paragraph_list[2:4]
# Tokenize the selected paragraphs
tokens_second = word_tokenize(' '.join(paragraph_analysis))
# Porter stemmer
porter = PorterStemmer()
stems_porter = []
for i in tokens_second:
    stems_porter.append(porter.stem(i))
# Lancaster stemmer
lancaster = LancasterStemmer()
stems_lancaster = []
for i in tokens_second:
    stems_lancaster.append(lancaster.stem(i))
# RSLP stemmer (designed for Portuguese)
rslp = nltk.RSLPStemmer()
stems_rslp = []
for i in tokens_second:
    stems_rslp.append(rslp.stem(i))
# Print the Porter, Lancaster, and RSLP stem lists side by side
print('Printing porter, lancaster and rslp list to compare')
print(stems_porter)
print(stems_lancaster)
print(stems_rslp)
def Stemming(sentence):
    # Lowercase each word and reduce it to its RSLP stem
    stemmer = nltk.RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase
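# Hedged usage sketch (assumption): Stemming expects a tokenized sentence,
# i.e. a list of words rather than a raw string.
# Stemming(['Gostei', 'muito', 'do', 'filme'])
# -> e.g. ['gost', 'muit', 'do', 'film']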
    print(df_frequencia15)
    return df_frequencia15

"""Plots a Pareto chart of the received frequencies."""
def escreve_pareto(df_frequencia):
    import seaborn as sns
    plt.figure(figsize=(20, 10))
    ax = sns.barplot(data=df_frequencia, x="Palavra", y="Frequencia")
    ax.set(ylabel="Contagem")
    plt.show()

"""Removes the stop words from the sentences."""
import string
stemmer = nltk.RSLPStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def remove_stopwords(tokenizador, textos):
    stop_words = nltk.corpus.stopwords.words("portuguese")
    # Keep the unaccented negation as a stop word too
    stop_words.append('nao')
    print(stop_words)
    ementas_processadas = list()
    for ementa in textos:
        nova_ementa = list()
        palavras_ementa = tokenizador.tokenize(ementa)
        # remove punctuation from each word
# Natural language processing
import nltk
# All the punctuation marks, which are irrelevant for matching
from string import punctuation
# Regular expressions
import re

# Download every NLTK resource needed for the NLP pipeline
nltk.download('all')

# Initialize the NLTK input transformers
tokenizador = nltk.WhitespaceTokenizer()
radicalizador = nltk.RSLPStemmer()
stopwords = set(nltk.corpus.stopwords.words('portuguese') + list(punctuation))

# Takes the user's input and chooses the best answer
def pegarResposta(bot, frase, dic, fun, respostas_bocasus, default):
    frase = tokenizador.tokenize(re.sub('[.,:;]+', '', frase))
    resp_final = default
    countfun = {'conversa': 0, 'marcar': 0, 'desmarcar': 0, 'agenda': 0}
    funcao = 'aprender'