def remove_stopwords(data):
    """Filter, clean, stem, and lowercase the word tokens in *data*.

    Keeps only purely alphabetic tokens longer than 2 characters that are
    not marked as stopwords in the module-level ``stp`` mapping (a word is
    treated as a stopword when ``stp[word] == 1``).  Each surviving word is
    stemmed with the module-level ``Stemmer`` and lowercased.

    Parameters:
        data: iterable of word tokens (strings).

    Returns:
        list[str]: stemmed, lowercased, non-stopword tokens.
    """
    table = str.maketrans('', '', string.punctuation)
    result = []
    for word in data:
        if not word.isalpha():
            continue
        # NOTE(review): a word that passed isalpha() contains no whitespace
        # or punctuation, so strip()/translate() are effectively no-ops;
        # kept in case the isalpha() guard is ever relaxed upstream.
        word = word.strip().translate(table).strip()
        if len(word) <= 2:
            continue
        # The original appended in BOTH the KeyError branch and the
        # stp[word] != 1 branch; dict.get collapses the two into one
        # condition (a missing key yields None, and None != 1).
        if stp.get(word) != 1:
            result.append(str(Stemmer.stem(word)).lower())
    return result
def porterStemmer(string):
    """Return *string* stemmed by the module-level ``Stemmer``.

    Parameters:
        string: the text to stem.  (NOTE: the parameter name shadows the
            stdlib ``string`` module inside this function; kept unchanged
            so keyword callers are not broken.)

    Returns:
        The result of ``Stemmer.stem(string)``.

    Note:
        The original docstring claimed an optional per-word stemmer
        function parameter; no such parameter exists in the signature,
        so that claim has been removed.
    """
    return Stemmer.stem(string)
s = re.sub(r'[.,!?;:{}[]()-_]', '', word) # с помощью регулярных выражений удаляем знаки препинания unsymboled.append(s) listed = [s.split(" ") for s in unsymboled] # разделяем предложения на отдельные слова new = [] for sentence in listed: s = [i for i in sentence if i not in stop_words_list] # удаляем стоп-символы new.append(s) result = [] for sentence in new: s = [_stemmer.stem(i) for i in sentence] # производится стемминг result.append(s) print(result) # преобразование массива result в строку для удаления уникальных вхождений text = [" ".join(i) for i in result] text = " ".join(text) words = text.split(" ") #print(words) # Все слова по отдельности #print(text) # Сам текст newtext = '' for word in words: i = text.count(word)
def stem_words(tokens):
    """Stem every token in *tokens*.

    Instantiates the module-level ``Stemmer`` class and applies its
    ``stem`` method to each token in order.

    Parameters:
        tokens: iterable of token strings.

    Returns:
        list: one stemmed value per input token.
    """
    porter = Stemmer()
    return [porter.stem(word) for word in tokens]
def mainfunctioncodestem(String):
    """Tokenize a code string and return its stemmed token structure.

    Trims surrounding whitespace from *String*, splits it into tokens
    via the project tokenizer, then passes the token list to
    ``Stemmer.stem``.

    Parameters:
        String: raw code text.  (Name kept as-is for keyword callers,
            despite shadowing the conventional ``str`` naming.)

    Returns:
        Whatever ``Stemmer.stem`` produces for the token list.
    """
    trimmed_text = String.strip()
    tokens = Tokenizer.ClassTokenizer.code_tokenizer(trimmed_text)
    return Stemmer.stem(tokens)