# Common imports assumed by the snippets below. Project-specific helpers
# (filtra_palavras, clean_string, clean_word, remove_stopwords, get_doc_list,
# get_indice_invertido, forward_transformer, tokenizer_with_numeric, constantes,
# remove_punct_dict, host, id_user, password, data_base, ...) come from the
# original repositories. The NLTK data ships separately:
# nltk.download('rslp'); nltk.download('stopwords'); nltk.download('punkt').
import os
import re
import sqlite3
import string
import unicodedata

import gensim
import nltk
import pymysql
import spacy
from gensim.models.doc2vec import TaggedDocument
from nltk import sent_tokenize, word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, RSLPStemmer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from unidecode import unidecode


def preparingSetToTrain(self, documents, N=2000):
    """Builds (features, label) pairs using the N most frequent stems.

    Each item in `documents` is a (label, text) pair.
    """
    self.all_words = set()
    wordsFreq = {}
    # Portuguese stopwords and tokenizer to match the RSLP stemmer
    # (the original mixed 'english' here)
    stopWords = stopwords.words('portuguese')
    stemmer = RSLPStemmer()

    # First pass: count stem frequencies over the whole corpus
    for document in documents:
        clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', document[1])
        for word in word_tokenize(clean_text, 'portuguese'):
            if word.lower() not in stopWords:
                stemmed_word = stemmer.stem(word.lower())
                wordsFreq[stemmed_word] = wordsFreq.get(stemmed_word, 0) + 1

    # Keep the N most frequent stems (the original sorted ascending,
    # which kept the N *least* frequent)
    for item in sorted(wordsFreq, key=wordsFreq.get, reverse=True)[:N]:
        self.all_words.add(item)

    # Second pass: one boolean feature dict per document
    t = []
    for document in documents:
        clean_text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', document[1])
        aux = {}
        for word in word_tokenize(clean_text, 'portuguese'):
            if word.lower() not in stopWords:
                stemmed_word = stemmer.stem(word.lower())
                if stemmed_word in self.all_words:
                    aux[stemmed_word] = True
        for word in self.all_words:
            if word not in aux:
                aux[word] = False
        t.append((aux, document[0]))
    return t
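# A minimal usage sketch for preparingSetToTrain, assuming it lives on a
# hypothetical classifier class: the returned (features, label) pairs are the
# format nltk.NaiveBayesClassifier.train expects.
#
# labeled_docs = [('positivo', 'Adorei o produto, chegou rápido'),
#                 ('negativo', 'Péssimo atendimento, não recomendo')]
# train_set = MyClassifier().preparingSetToTrain(labeled_docs, N=2000)
# model = nltk.NaiveBayesClassifier.train(train_set)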
def stemming(text, language='english'):
    if language == 'english':
        s = PorterStemmer()
    elif language == 'portuguese':
        s = RSLPStemmer()
    else:
        # The original fell through with `s` undefined for other languages
        raise ValueError('unsupported language: %s' % language)
    return ' '.join(s.stem(word) for word in text.split())
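# Quick sketch of stemming() for both supported languages (NLTK data assumed
# downloaded); each whitespace-separated token is stemmed independently:
#
# print(stemming('the runners were running quickly'))
# print(stemming('as meninas correram rapidamente', language='portuguese'))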
def normalize_text(text):
    '''An example of different ways to normalize a text using NLTK tools.

    :param text: a string with the text to be processed
    '''
    # (the original called text.decode('utf8'); in Python 3, str is already unicode)
    stemmer = RSLPStemmer()  # Load a stemmer for Brazilian Portuguese
    print(text)
    for sent in sent_tokenize(text):
        # Trying out different tokenizers
        tokens = wordpunct_tokenize(sent)
        print(sent)
        print('   wordpunct: \t%s' % ' '.join(tokens))
        tokens = word_tokenize(sent)
        print('        word: \t%s' % ' '.join(tokens))
        # Removing stopwords (remove_stopwords is a project helper)
        tokens = remove_stopwords(tokens)
        print('  -stopwords: \t%s' % ' '.join(tokens))
        # Stemming the remaining words
        tokens = [stemmer.stem(t) for t in tokens]
        print('     stemmed: \t%s' % ' '.join(tokens))
        print('')
def clean_words(words, remove_stopwords=False, language='portuguese'):
    """Stems and removes stopwords from a set of word-level tokens using the RSLPStemmer.

    Args:
        words (list): Tokens to be stemmed.
        remove_stopwords (bool): Whether stopwords should be removed or not.
        language (str): Identifier of stopwords' language.

    Returns:
        List of stemmed tokens.
    """
    # Creates the RSLP stemmer
    stemmer = RSLPStemmer()

    # Checks if stopwords are supposed to be removed
    if remove_stopwords:
        # Gathers the stopwords
        stop_words = stopwords.words(language)

        # Stems and removes the stopwords
        stemmed_words = [
            stemmer.stem(word) for word in words if word.lower() not in stop_words
        ]

    # If stopwords are not supposed to be removed
    else:
        # Just stems the words
        stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words
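# A quick check of clean_words(), assuming nltk.download('stopwords') and
# nltk.download('rslp') have been run:
#
# tokens = ['as', 'meninas', 'estudam', 'gramática']
# print(clean_words(tokens))                         # stems everything
# print(clean_words(tokens, remove_stopwords=True))  # drops 'as' first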
def Stemming(palavras):
    stemmer = RSLPStemmer()
    palavras_base = []
    for palavra in palavras:
        palavras_base.append(stemmer.stem(palavra))
    return palavras_base
def __init__(self):
    self.nlp = spacy.load('pt_core_news_md')
    self.vogais = [
        'a', 'á', 'à', 'â', 'ã', 'e', 'é', 'ê', 'i', 'í', 'î',
        'o', 'ó', 'ô', 'õ', 'u', 'ú', 'û'
    ]
    self.stemmer = RSLPStemmer()
def __init__(self, punctuation, stoppers, stopwords, accentuation=True, stemmer="Snowball"):
    """
    :param punctuation: list of punctuation to exclude from the text.
    :param stoppers: list of punctuation that defines the end of an excerpt.
    :param stopwords: list of stopwords to exclude.
    :param accentuation: whether or not to strip word accentuation.
    :param stemmer: "RSLP" applies RSLPStemmer(), whereas "Snowball" applies
        SnowballStemmer('portuguese') (default). A None value doesn't apply
        any stemmer.
    """
    self._punctuation = punctuation
    self._stoppers = stoppers
    self._stopwords = stopwords
    if stemmer:
        if stemmer == "Snowball":
            self._stemmer = SnowballStemmer('portuguese')
        elif stemmer == "RSLP":
            # RSLPStemmer is Portuguese-only and takes no language argument
            self._stemmer = RSLPStemmer()
        else:
            print("Invalid value for stemmer parameter. Default value will be used.")
            self._stemmer = SnowballStemmer('portuguese')
    else:
        self._stemmer = stemmer
    self._accentuation = accentuation
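# Illustrative construction, assuming a hypothetical Preprocessor class wraps
# this __init__; the stemmer argument selects SnowballStemmer('portuguese')
# (default), RSLPStemmer(), or no stemming at all (None):
#
# pre_rslp = Preprocessor(punctuation=[',', ';'], stoppers=['.', '!', '?'],
#                         stopwords=stopwords.words('portuguese'), stemmer="RSLP")
# pre_raw = Preprocessor(punctuation=[',', ';'], stoppers=['.', '!', '?'],
#                        stopwords=stopwords.words('portuguese'), stemmer=None)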
def Stemming(sentence):
    stemmer = RSLPStemmer()
    # Join stems with single spaces (the original appended a trailing separator
    # after every word, leaving a dangling space at the end)
    return ' '.join(stemmer.stem(word.lower()) for word in sentence)
def gera_indice_invertido(docs, base_dir):
    # Required utilities
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    st = RSLPStemmer()
    # ------------------------------------------------
    cont_arquivo = 0
    for file in docs:
        cont_arquivo += 1
        dict_arquivos[cont_arquivo] = file
        caminho_arquivo = os.path.join(base_dir, file)
        with open(caminho_arquivo, 'r') as f:
            txt_arquivo = f.read()
        palavras = tokenizer.tokenize(txt_arquivo)
        palavras = filtra_palavras(palavras)
        # Keep only the stem of each word
        radical_palavras = [st.stem(palavra) for palavra in palavras]
        for palavra in radical_palavras:
            if palavra not in dict_indice_invertido:
                dict_indice_invertido[palavra] = {cont_arquivo: 1}
            elif cont_arquivo not in dict_indice_invertido[palavra]:
                dict_indice_invertido[palavra][cont_arquivo] = 1
            else:
                dict_indice_invertido[palavra][cont_arquivo] += 1
    # The 'with' block already closes each file; no explicit f.close() needed
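# gera_indice_invertido() fills two module-level maps: dict_indice_invertido
# ({stem: {file_id: count}}) and dict_arquivos ({file_id: filename}). A sketch
# of a driver, with filtra_palavras coming from the original project:
#
# dict_indice_invertido, dict_arquivos = {}, {}
# gera_indice_invertido(os.listdir('corpus/'), 'corpus/')
# print(dict_indice_invertido.get(RSLPStemmer().stem('casas'), {}))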
def stemmer_obj_options(self, lang):
    if lang == 'portuguese':
        return [RSLPStemmer(), SnowballStemmer('portuguese')]
    elif lang == 'english':
        return [PorterStemmer(), SnowballStemmer('english')]
    else:
        print('language not supported')
        return None
def stemmer(self, processed_text):
    '''Input: processed text
       Output: set of unique tokens after stemming
    '''
    st = RSLPStemmer()
    # st = SnowballStemmer("english")
    stemmed_set = set(st.stem(token) for token in processed_text)
    return stemmed_set
def get_idword(conn, word):
    result = -1
    stemmer = RSLPStemmer()
    # DB-API parameters are passed as a sequence, not as a bare value
    cursor = conn.execute('SELECT idword FROM words WHERE word = %s',
                          (stemmer.stem(word),))
    if cursor.rowcount > 0:
        result = cursor.fetchone()[0]
    return result
def stemming_(text):
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text:
        preprocess = [
            str(stemmer.stem(p)) for p in phrase.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
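# stemming_() expects a module-level `stopwords` *list* (shadowing the nltk
# corpus import above) and a list of (phrase, emotion) pairs; a minimal sketch:
#
# stopwords = ['eu', 'de', 'com', 'o']
# frases = [('eu estou muito feliz com o resultado', 'alegria'),
#           ('estou triste de novo', 'tristeza')]
# print(stemming_(frases))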
def word_stemm(word):
    stemmer = RSLPStemmer()
    try:
        word = stemmer.stem(word)
    except Exception:
        # Log the word that could not be stemmed and return it unchanged
        print(word)
    return word
def stemming(tokens):
    stemmer = RSLPStemmer()
    phrase = []
    for word in tokens:
        phrase.append(stemmer.stem(word))
    return phrase
def __init__(self, url_list=None, vocab=None):
    self.stemmer = RSLPStemmer()
    self.vectorizer = CountVectorizer(preprocessor=self.stemmer.stem,
                                      tokenizer=tokenizer_with_numeric,
                                      ngram_range=(1, 2))
    if url_list is not None:
        self.fit_vocab(url_list)
    else:
        self.vectorizer.vocabulary_ = vocab
    self.vocab_size = len(self.vectorizer.vocabulary_)
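# Sketch of how this constructor is used, assuming a hypothetical UrlVectorizer
# class and that tokenizer_with_numeric / fit_vocab come from the same project:
# fit a fresh vocabulary from URLs, or restore a previously learned one.
#
# vec = UrlVectorizer(url_list=['g1.globo.com/noticias', 'loja.exemplo.com/ofertas'])
# vec2 = UrlVectorizer(vocab=vec.vectorizer.vocabulary_)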
def stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        # Decode bytes if needed (the original Python 2 code used unicode())
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = unicodedata.normalize("NFKD", word)
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def tokenize(self, t):
    if self.stopwords and t in self.stopwords:
        return []
    stemmer = RSLPStemmer()  # create the stemmer once, not once per token
    sentence = word_tokenize(t.lower())
    phrase = []
    for word in sentence:
        if word in string.punctuation:
            continue
        if self.stopwords and word in self.stopwords:
            continue
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def __init__(self, db=None):
    if db is None:
        self.db = constantes.ARQ_BANCO
    else:
        # Without this branch, self.db was undefined whenever a path was given
        self.db = db
    self.stem_count = dict()
    self.word_count = dict()
    self.unidas_count = dict()
    self.connection = sqlite3.connect(self.db)
    self.cursor = self.connection.cursor()
    self.stemmer = RSLPStemmer()
    self.tokenizer = RegexpTokenizer(r'\w+')
def get_doc(folder_name):
    doc_list = get_doc_list(folder_name)
    tokenizer = RegexpTokenizer(r'\w+')
    # en_stop = get_stop_words('en')
    # p_stemmer = PorterStemmer()
    p_stemmer = RSLPStemmer()
    pt_stop = stopwords.words('portuguese')  # load once, not once per token

    taggeddoc = []
    texts = []
    for index, doc in enumerate(doc_list):
        # clean and tokenize the document string
        raw = gensim.utils.to_unicode(doc, 'latin1').lower()
        print(index, ' - ', raw, '\n')
        tokens = tokenizer.tokenize(raw)

        # remove stopwords from tokens
        stopped_tokens = [word for word in tokens if word not in pt_stop]

        # remove numbers
        number_tokens = [re.sub(r'[\d]', ' ', t) for t in stopped_tokens]
        number_tokens = ' '.join(number_tokens).split()

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(t) for t in number_tokens]
        print(stemmed_tokens, '\n')

        # remove tokens of length 1
        length_tokens = [t for t in stemmed_tokens if len(t) > 1]
        texts.append(length_tokens)

        td = TaggedDocument(forward_transformer(stemmed_tokens), str(index))
        taggeddoc.append(td)
    return taggeddoc
def separaPalavras(texto):
    stop = nltk.corpus.stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile(r'\W+')
    lista_palavras = []
    lista = [p for p in splitter.split(texto) if p != '']
    for p in lista:
        if p.lower() not in stop and len(p) > 1:
            lista_palavras.append(stemmer.stem(p).lower())
    return lista_palavras
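# separaPalavras() splits on non-word characters, drops Portuguese stopwords
# and single letters, then stems and lowercases; a quick sketch:
#
# print(separaPalavras('As meninas estudam gramática!'))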
def getIdPalavra(palavra):
    retorno = -1
    stemmer = RSLPStemmer()
    conexao = pymysql.connect(host=host, user=id_user, passwd=password, db=data_base)
    cursor = conexao.cursor()
    # DB-API parameters are passed as a sequence
    cursor.execute('select idpalavra from palavras where palavra = %s',
                   (stemmer.stem(palavra),))
    if cursor.rowcount > 0:
        retorno = cursor.fetchone()[0]
    cursor.close()
    conexao.close()
    return retorno
def stemmer(data):
    """Reduces single-valued strings to their stems."""
    rslp = RSLPStemmer().stem  # Portuguese stemmer
    if isinstance(data, list):
        for i in range(len(data)):
            data[i] = rslp(data[i])  # stem the word (reduce it to its root)
    else:
        data = rslp(data)
    return data
def separates_words(text):
    stop_words = stopwords.words('portuguese')
    stemmer = RSLPStemmer()
    splitter = re.compile(r'\W+')
    list_words = []
    words = [p for p in splitter.split(text) if p != '']
    for word in words:
        if word.lower() not in stop_words and len(word) > 1:
            list_words.append(stemmer.stem(word).lower())
    return list_words
def _stem(text):
    """
    Convert words to their stems.

    :param text: list of words
    :return: list of stemmed words
    """
    stemmer = RSLPStemmer()
    phrase = []
    for word in text:
        phrase.append(stemmer.stem(word.lower()))
    return phrase
def stemming_(text):
    punc = string.punctuation
    stemmer = RSLPStemmer()
    stemming = []
    for phrase, emotion in text.items():
        # Strip punctuation character by character (the original stemmed each
        # single character, which only lowercased it)
        nopunc = ''.join(p.lower() for p in phrase if p not in punc)
        preprocess = [
            str(stemmer.stem(p)) for p in nopunc.split() if p not in stopwords
        ]
        stemming.append((preprocess, emotion))
    return stemming
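# This variant takes a dict mapping phrases to emotions and, like stemming_()
# earlier in this file, assumes a module-level `stopwords` list; a sketch:
#
# stopwords = ['eu', 'de', 'com', 'o']
# frases = {'eu adorei o filme': 'alegria', 'que dia horrível': 'raiva'}
# print(stemming_(frases))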
def string_steem(text):
    string_steem = []
    stemmer = RSLPStemmer()
    for i in text.split():
        try:
            string_steem.append(
                stemmer.stem(i.lower().translate(remove_punct_dict)))
        except Exception:
            string_steem.append('')
    return ' '.join(string_steem)
def preprocess_text(self, text):
    tokens = []
    stemmer = RSLPStemmer()
    for t in text.split():
        # Need a better set of stopwords
        # if t in stopwords.words('portuguese'):
        #     continue
        t = unidecode(t)
        t = t.lower()
        t = re.sub(r'\W+', '', t)
        t = stemmer.stem(t)
        tokens.append(t)
    return ' '.join(tokens)
def stem_string(string, minimal_length):
    st = RSLPStemmer()
    string = clean_string(string)
    string = string.replace('\n', ' ')
    text = []
    for token in string.split(' '):
        if token != '' and len(token) > minimal_length:
            try:
                text.append(st.stem(clean_word(token)))
            except Exception:
                # Fall back to the cleaned word when stemming fails
                # (the original decoded it from utf8, a Python 2 idiom)
                text.append(clean_word(token))
    return ' '.join(text)
def search(query, document_base):
    st = RSLPStemmer()
    dict_indice_invertido, dict_arquivos = get_indice_invertido(document_base)

    # Run the query: first, split it on the OR operators
    consultas = query.split('|')
    conjunto_final = set()
    conjunto = set()
    for consulta in consultas:
        # Run a separate query for each OR-separated part
        consulta = consulta.split('&')
        count = 0
        for palavra in consulta:
            palavra = st.stem(palavra.strip())
            if count == 0:
                if '!' in palavra:
                    conjunto = set(dict_arquivos.keys())
                    conjunto = conjunto.difference(
                        dict_indice_invertido[st.stem(palavra.lstrip('!'))].keys())
                else:
                    try:
                        conjunto = set(dict_indice_invertido[palavra].keys())
                    except KeyError:
                        conjunto = set()  # empty set, not an empty dict
            else:
                if '!' in palavra:
                    conjunto = conjunto.intersection(
                        set(dict_arquivos.keys()).difference(
                            dict_indice_invertido[st.stem(palavra.lstrip('!'))].keys()))
                else:
                    try:
                        conjunto = conjunto.intersection(
                            set(dict_indice_invertido[palavra].keys()))
                    except KeyError:
                        conjunto = set()
            count += 1
        conjunto_final = conjunto_final.union(conjunto)

    txt_arquivos = ''
    for file in conjunto_final:
        txt_arquivos += dict_arquivos[file] + '\n'
    with open("answer.txt", 'w+') as resposta:
        resposta.write(str(len(conjunto_final)) + '\n' + txt_arquivos)
    # the 'with' block closes the file automatically
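# Query grammar used by search(): '&' is AND, '|' is OR, and a leading '!'
# negates a term; every term is stemmed before the inverted-index lookup.
# A sketch, assuming get_indice_invertido() rebuilds the index for a folder:
#
# search('casa & !apartamento | terreno', 'corpus/')
# # writes the match count plus the matching file names to answer.txt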