Beispiel #1
0
def remove_stopwords(data):
    """Clean, filter and stem an iterable of word tokens.

    Each token has ASCII punctuation (``string.punctuation``) removed and
    surrounding whitespace stripped. A cleaned token is kept only if it is
    purely alphabetic, longer than two characters, and not marked with the
    value ``1`` in the module-level ``stp`` lookup. Kept tokens are passed
    through ``Stemmer.stem`` and lower-cased.

    Punctuation is stripped *before* the alphabetic check: testing
    ``isalpha()`` first would reject every token containing punctuation,
    which would make the punctuation cleanup a no-op.

    Args:
        data: Iterable of string tokens.

    Returns:
        A list of lower-cased stems, in input order.
    """
    table = str.maketrans('', '', string.punctuation)
    result = []
    for word in data:
        word = word.translate(table).strip()
        # An empty string is not alphabetic, so fully-stripped tokens are
        # rejected here as well.
        if not word.isalpha() or len(word) <= 2:
            continue
        # Keep the try body to the lookup only, so a KeyError raised by the
        # stemmer itself is not mistaken for "not a stop word".
        try:
            is_stopword = stp[word] == 1
        except KeyError:
            is_stopword = False
        if not is_stopword:
            result.append(str(Stemmer.stem(word)).lower())

    return result
Beispiel #2
0
def porterStemmer(string):

    """
    Stem the given value by delegating to ``Stemmer.stem``.

    The argument is forwarded unchanged; no tokenizing or other
    preprocessing happens here.

    Returns whatever ``Stemmer.stem`` produces for the argument.
    """

    stemmed = Stemmer.stem(string)
    return stemmed
Beispiel #3
0
    s = re.sub(r'[.,!?;:{}[]()-_]', '',
               word)  # с помощью регулярных выражений удаляем знаки препинания
    unsymboled.append(s)

listed = [s.split(" ")
          for s in unsymboled]  # разделяем предложения на отдельные слова

new = []
for sentence in listed:
    s = [i for i in sentence
         if i not in stop_words_list]  # удаляем стоп-символы
    new.append(s)

result = []
for sentence in new:
    s = [_stemmer.stem(i) for i in sentence]  # производится стемминг
    result.append(s)

print(result)

# преобразование массива result в строку для удаления уникальных вхождений
text = [" ".join(i) for i in result]
text = " ".join(text)
words = text.split(" ")

#print(words)                                                # Все слова по отдельности
#print(text)                                                 # Сам текст

newtext = ''
for word in words:
    i = text.count(word)
def stem_words(tokens):
    """Return a list with the stem of every item in *tokens*.

    A single ``Stemmer`` instance is created per call and its ``stem``
    method is applied to each token in order.
    """
    stemmer = Stemmer()
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems
Beispiel #5
0
def mainfunctioncodestem(String):
    """Tokenize *String* and run the tokens through the stemmer.

    Surrounding whitespace is stripped from *String*, the result is
    tokenized with ``Tokenizer.ClassTokenizer.code_tokenizer``, and the
    tokenizer output is passed as a whole to ``Stemmer.stem``.

    Returns whatever ``Stemmer.stem`` produces for the tokenizer output.
    """
    tokens = Tokenizer.ClassTokenizer.code_tokenizer(String.strip())
    return Stemmer.stem(tokens)