class snowball:
    """Snowball stemmer."""

    from snowballstemmer import EnglishStemmer

    snow = EnglishStemmer()
    stem = snow.stemWord  # bound method of the shared stemmer instance
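A minimal usage sketch (my addition, not part of the original snippet): because `stem` is the bound `stemWord` method, it can be called straight through the class.

snowball.stem("running")  # -> "run"
snowball.stem("cats")     # -> "cat"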
import re

import pdftotext
import streamlit as st
import pandas as pd
from scipy.spatial.distance import cosine
from snowballstemmer import EnglishStemmer  # Snowball stemmer (English)
from sklearn.feature_extraction.text import CountVectorizer

engStem = EnglishStemmer()
all_stopwords = []  # Add stopwords if needed.


@st.cache
def clean_pdf_page(page):
    """Take a long string representing a pdftotext page and return cleaned sentences.

    Returns:
        list -- list of sentences
    """
    return [re.sub(r"\s+", " ", i.strip()) for i in page.split("\n")]


def read_pdf_file(file):
    """Convert a file to a pdftotext object.

    Arguments:
        file {file} -- PDF file

    Returns:
        pdftotext.PDF -- pdftotext representation of the file
    """
    return pdftotext.PDF(file)
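A hedged usage sketch (the file name is a placeholder): a pdftotext.PDF object iterates over page strings, so each page can be cleaned in turn.

with open("example.pdf", "rb") as f:  # hypothetical path
    pdf = read_pdf_file(f)

sentences_per_page = [clean_pdf_page(page) for page in pdf]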
"""Compute text similarity between Pages.""" import logging import re from typing import Dict, List import numpy as np from bs4 import BeautifulSoup from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer from snowballstemmer import EnglishStemmer from scrivo.page import Page from scrivo.utils import logtime STEMMER = EnglishStemmer() logger = logging.getLogger(__name__) # Get a plaintext rendering of a page def get_page_plaintext(p: Page) -> str: """Return a plaintext page rendering for tokenization.""" # Bit of a hack, but throw in any tags and the page title title_text = p.meta["title"] tags_text = " ".join(p.meta["tags"]) page_text = BeautifulSoup(p.html, features="html.parser").get_text(separator=" ", strip=True) return " ".join([title_text, tags_text, page_text])
from __future__ import annotations

import re

from nltk.corpus import stopwords  # requires: nltk.download("stopwords")
from nltk.tokenize import regexp_tokenize
from snowballstemmer import EnglishStemmer, RussianStemmer

# Topic and CleanTopic are project-local types, not shown in this snippet.


class TopicCleaner:
    def __init__(self):
        self._russian_stemmer = RussianStemmer()
        self._english_stemmer = EnglishStemmer()
        self._russian_stops = stopwords.words("russian")
        self._english_stops = stopwords.words("english")
        # Blacklisted labels: "company blog", "black hole", "I'm self-promoting"
        self._label_bl = {"блог компании", "черная дыра", "я пиарюсь"}

    def clean(self, topic: Topic) -> CleanTopic:
        text = self.clean_text(topic.text)
        labels = self.clean_labels(topic.tags + topic.hubs)
        return CleanTopic(labels=labels, words=text)

    def clean_text(self, text: str) -> list:
        text = text.lower()
        text = TopicCleaner.delete_non_word_chars(text)
        tokens = TopicCleaner.tokenize_text(text)
        tokens = TopicCleaner.filter_variable_names(tokens)
        tokens = self.filter_stopwords(tokens)
        tokens = self.stemm_text(tokens)
        tokens = TopicCleaner.filter_words_with_repeatable_letters(tokens)
        tokens = TopicCleaner.filter_words_with_unusual_for_language_length(tokens)
        return tokens

    def clean_labels(self, labels: list) -> list:
        return [self.clean_label(label) for label in self.filter_bl_labels(labels)]

    def clean_label(self, label: str) -> str:
        label = label.lower()
        label = label.replace("ё", "е")
        label_words = TopicCleaner.tokenize_text(label)
        label_words = self.stemm_text(label_words)
        return " ".join(label_words)

    def filter_bl_labels(self, labels: list) -> list:
        return set(labels) - self._label_bl

    @staticmethod
    def tokenize_text(text: str) -> list:
        return regexp_tokenize(text, "[\\w']+")

    def stemm_text(self, text: list) -> list:
        stemmed = self._english_stemmer.stemWords(text)
        return self._russian_stemmer.stemWords(stemmed)

    def filter_stopwords(self, text: list) -> list:
        return [word for word in text
                if word not in self._russian_stops and word not in self._english_stops]

    @staticmethod
    def filter_words_with_repeatable_letters(text: list) -> list:
        # re.search (not re.match) so a character repeated 3+ times is caught anywhere in the word
        return [word for word in text if not re.search(r"(.)\1{2}", word)]

    @staticmethod
    def filter_words_with_unusual_for_language_length(text: list) -> list:
        return [word for word in text if TopicCleaner.is_language_usual_word(word)]

    @staticmethod
    def is_language_usual_word(word: str) -> bool:
        length = len(word)
        is_eng = re.match("[a-z]", word)
        return length > 2 and ((not is_eng and length < 25) or (is_eng and length < 15))

    @staticmethod
    def filter_variable_names(text: list) -> list:
        return [word for word in text if "_" not in word]

    @staticmethod
    def delete_non_word_chars(text: str):
        temp = text.replace("ё", "е")
        temp = re.sub(r"(&[a-z0-9]*;)", " ", temp)  # HTML-encoded entities (&amp; etc.)
        temp = re.sub(r"(\W|\d)+", " ", temp)  # non-word characters and digits
        temp = re.sub(r"\s+", " ", temp)  # collapse runs of whitespace
        return temp.strip()
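A hedged usage sketch of the text pipeline alone (the sample string and expected output are illustrative, and require the NLTK stopwords corpus):

cleaner = TopicCleaner()
tokens = cleaner.clean_text("Мы тестируем running examples &amp; 123 snake_case")
# Roughly ['тестиру', 'run', 'exampl']: the stop word, entity, digits,
# and underscore-bearing token are filtered, the rest are stemmed.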