Exemple #1
0
def get_stem(message):
    """Send the author of *message* the Russian stem of the message text."""
    stemmed_text = RussianStemmer().stemWord(message.text)
    recipient_id = message.from_user.id
    bot.send_message(recipient_id, stemmed_text)
 def __init__(self):
     """Set up the per-language stemmers, stop-word lists and label blacklist."""
     # Stemmers, one per supported language.
     self._english_stemmer = EnglishStemmer()
     self._russian_stemmer = RussianStemmer()

     # Stop-word lists, one per supported language.
     self._english_stops = stopwords.words("english")
     self._russian_stops = stopwords.words("russian")

     # Labels that are blacklisted.
     self._label_bl = {
         "блог компании",
         "черная дыра",
         "я пиарюсь",
     }
 def __init__(self):
     """Set up the per-language stemmers, stop-word lists and label blacklist."""
     # Stemmers, one per supported language.
     self._english_stemmer = EnglishStemmer()
     self._russian_stemmer = RussianStemmer()

     # Stop-word lists, one per supported language.
     self._english_stops = stopwords.words('english')
     self._russian_stops = stopwords.words('russian')

     # Labels that are blacklisted.
     self._label_bl = {
         'блог компании',
         'черная дыра',
         'я пиарюсь',
     }
class TopicCleaner:
    """Turn a topic's text and labels into stemmed, filtered tokens.

    Text is lower-cased, stripped of non-word characters, tokenised, cleared
    of stop words, stemmed and passed through a few heuristic filters.
    Labels are blacklist-filtered, lower-cased and stemmed.
    """

    def __init__(self):
        self._russian_stemmer = RussianStemmer()
        self._english_stemmer = EnglishStemmer()
        # Frozen sets make the per-token membership test in filter_stopwords
        # O(1) instead of a linear scan over a list.
        self._russian_stops = frozenset(stopwords.words("russian"))
        self._english_stops = frozenset(stopwords.words("english"))
        # Blacklisted labels, written in lower case.
        self._label_bl = {"блог компании", "черная дыра", "я пиарюсь"}

    def clean(self, topic: "Topic") -> "CleanTopic":
        """Build a CleanTopic from the text, tags and hubs of *topic*."""
        text = self.clean_text(topic.text)
        labels = self.clean_labels(topic.tags + topic.hubs)

        return CleanTopic(labels=labels, words=text)

    def clean_text(self, text: str) -> list:
        """Return the list of cleaned, stemmed tokens extracted from *text*."""
        text = text.lower()
        text = TopicCleaner.delete_non_word_chars(text)
        tokens = TopicCleaner.tokenize_text(text)
        tokens = TopicCleaner.filter_variable_names(tokens)
        # Stop words are removed before stemming, i.e. compared unstemmed.
        tokens = self.filter_stopwords(tokens)
        tokens = self.stemm_text(tokens)
        tokens = TopicCleaner.filter_words_with_repeatable_letters(tokens)
        tokens = TopicCleaner.filter_words_with_unusual_for_language_length(tokens)

        return tokens

    def clean_labels(self, labels: list) -> list:
        """Return the cleaned form of every non-blacklisted label."""
        return [self.clean_label(label) for label in self.filter_bl_labels(labels)]

    def clean_label(self, label: str) -> str:
        """Lower-case, normalise "ё", tokenise and stem a single label."""
        label = label.lower()
        label = label.replace("ё", "е")
        label_words = TopicCleaner.tokenize_text(label)
        label_words = self.stemm_text(label_words)
        return " ".join(label_words)

    def filter_bl_labels(self, labels: list) -> list:
        """Drop blacklisted labels and duplicates, keeping first-seen order.

        A label is compared with the blacklist after the same normalisation
        clean_label applies (lower case, "ё" -> "е"), so differently cased
        spellings of a blacklisted label are rejected too. The labels that
        are returned are left exactly as they were passed in.
        """
        unique_labels = dict.fromkeys(labels)  # de-duplicates, keeps order
        return [
            label
            for label in unique_labels
            if label.lower().replace("ё", "е") not in self._label_bl
        ]

    @staticmethod
    def tokenize_text(text: str) -> list:
        """Split *text* into runs of word characters and apostrophes."""
        return regexp_tokenize(text, "[\\w']+")

    def stemm_text(self, text: list) -> list:
        """Run every token through the English and then the Russian stemmer."""
        stemmed = self._english_stemmer.stemWords(text)
        return self._russian_stemmer.stemWords(stemmed)

    def filter_stopwords(self, text: list) -> list:
        """Return the tokens that are in neither stop-word collection."""
        return [
            word
            for word in text
            if word not in self._russian_stops and word not in self._english_stops
        ]

    @staticmethod
    def filter_words_with_repeatable_letters(text: list) -> list:
        """Drop tokens containing the same character three times in a row."""
        # re.search rather than re.match: match is anchored at the start of
        # the token and would miss a repeated run later in the word.
        return [word for word in text if not re.search("(.)\\1{2}", word)]

    @staticmethod
    def filter_words_with_unusual_for_language_length(text: list) -> list:
        """Keep only the tokens accepted by is_language_usual_word."""
        return [word for word in text if TopicCleaner.is_language_usual_word(word)]

    @staticmethod
    def is_language_usual_word(word: str) -> bool:
        """Tell whether the length of *word* is within the accepted range.

        Words of one or two characters are rejected. A word whose first
        character is in a-z must be shorter than 15 characters; any other
        word must be shorter than 25. Always returns a real bool.
        """
        length = len(word)
        if length <= 2:
            return False
        starts_with_latin = re.match("[a-z]", word) is not None
        max_length = 15 if starts_with_latin else 25
        return length < max_length

    @staticmethod
    def filter_variable_names(text: list) -> list:
        """Drop tokens that contain an underscore."""
        return [word for word in text if "_" not in word]

    @staticmethod
    def delete_non_word_chars(text: str) -> str:
        """Replace entities, non-word characters and digits with single spaces."""
        temp = text.replace("ё", "е")
        temp = re.sub(r"(&[a-z0-9]*;)", " ", temp)  # & encoded symbols
        temp = re.sub(r"(\W|\d)+", " ", temp)  # non word or digit
        temp = re.sub(r"\s+", " ", temp)  # 2+ spaces
        return temp.strip()
class TopicCleaner:
    '''Turn a topic's text and labels into stemmed, filtered tokens.

    Text is lower-cased, stripped of non-word characters, tokenised, cleared
    of stop words, stemmed and passed through a few heuristic filters.
    Labels are blacklist-filtered, lower-cased and stemmed.
    '''

    def __init__(self):
        self._russian_stemmer = RussianStemmer()
        self._english_stemmer = EnglishStemmer()
        # Frozen sets make the per-token membership test in filter_stopwords
        # O(1) instead of a linear scan over a list.
        self._russian_stops = frozenset(stopwords.words('russian'))
        self._english_stops = frozenset(stopwords.words('english'))
        # Blacklisted labels, written in lower case.
        self._label_bl = {'блог компании', 'черная дыра', 'я пиарюсь'}

    def clean(self, topic: 'Topic') -> 'CleanTopic':
        '''Build a CleanTopic from the text, tags and hubs of *topic*.'''
        text = self.clean_text(topic.text)
        labels = self.clean_labels(topic.tags + topic.hubs)

        return CleanTopic(labels=labels, words=text)

    def clean_text(self, text: str) -> list:
        '''Return the list of cleaned, stemmed tokens extracted from *text*.'''
        text = text.lower()
        text = TopicCleaner.delete_non_word_chars(text)
        tokens = TopicCleaner.tokenize_text(text)
        tokens = TopicCleaner.filter_variable_names(tokens)
        # Stop words are removed before stemming, i.e. compared unstemmed.
        tokens = self.filter_stopwords(tokens)
        tokens = self.stemm_text(tokens)
        tokens = TopicCleaner.filter_words_with_repeatable_letters(tokens)
        tokens = TopicCleaner.filter_words_with_unusual_for_language_length(
            tokens)

        return tokens

    def clean_labels(self, labels: list) -> list:
        '''Return the cleaned form of every non-blacklisted label.'''
        return [
            self.clean_label(label) for label in self.filter_bl_labels(labels)
        ]

    def clean_label(self, label: str) -> str:
        '''Lower-case, normalise 'ё', tokenise and stem a single label.'''
        label = label.lower()
        label = label.replace('ё', 'е')
        label_words = TopicCleaner.tokenize_text(label)
        label_words = self.stemm_text(label_words)
        return ' '.join(label_words)

    def filter_bl_labels(self, labels: list) -> list:
        '''Drop blacklisted labels and duplicates, keeping first-seen order.

        A label is compared with the blacklist after the same normalisation
        clean_label applies (lower case, 'ё' -> 'е'), so differently cased
        spellings of a blacklisted label are rejected too. The labels that
        are returned are left exactly as they were passed in.
        '''
        unique_labels = dict.fromkeys(labels)  # de-duplicates, keeps order
        return [
            label for label in unique_labels
            if label.lower().replace('ё', 'е') not in self._label_bl
        ]

    @staticmethod
    def tokenize_text(text: str) -> list:
        '''Split *text* into runs of word characters and apostrophes.'''
        return regexp_tokenize(text, '[\\w\']+')

    def stemm_text(self, text: list) -> list:
        '''Run every token through the English and then the Russian stemmer.'''
        stemmed = self._english_stemmer.stemWords(text)
        return self._russian_stemmer.stemWords(stemmed)

    def filter_stopwords(self, text: list) -> list:
        '''Return the tokens that are in neither stop-word collection.'''
        return [
            word for word in text if word not in self._russian_stops
            and word not in self._english_stops
        ]

    @staticmethod
    def filter_words_with_repeatable_letters(text: list) -> list:
        '''Drop tokens containing the same character three times in a row.'''
        # re.search rather than re.match: match is anchored at the start of
        # the token and would miss a repeated run later in the word.
        return [word for word in text if not re.search('(.)\\1{2}', word)]

    @staticmethod
    def filter_words_with_unusual_for_language_length(text: list) -> list:
        '''Keep only the tokens accepted by is_language_usual_word.'''
        return [
            word for word in text if TopicCleaner.is_language_usual_word(word)
        ]

    @staticmethod
    def is_language_usual_word(word: str) -> bool:
        '''Tell whether the length of *word* is within the accepted range.

        Words of one or two characters are rejected. A word whose first
        character is in a-z must be shorter than 15 characters; any other
        word must be shorter than 25. Always returns a real bool.
        '''
        length = len(word)
        if length <= 2:
            return False
        starts_with_latin = re.match('[a-z]', word) is not None
        max_length = 15 if starts_with_latin else 25
        return length < max_length

    @staticmethod
    def filter_variable_names(text: list) -> list:
        '''Drop tokens that contain an underscore.'''
        return [word for word in text if '_' not in word]

    @staticmethod
    def delete_non_word_chars(text: str) -> str:
        '''Replace entities, non-word characters and digits with single spaces.'''
        temp = text.replace('ё', 'е')
        temp = re.sub(r'(&[a-z0-9]*;)', ' ', temp)  # & encoded symbols
        temp = re.sub(r'(\W|\d)+', ' ', temp)  # non word or digit
        temp = re.sub(r'\s+', ' ', temp)  # 2+ spaces
        return temp.strip()