Example #1
import re

import pdftotext
import streamlit as st
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from snowballstemmer import EnglishStemmer  # Snowball stemming (English)

engStem = EnglishStemmer()
all_stopwords = []  # Add stopwords if needed.


class snowball:
    """Snowball stemmer wrapper exposing `stem` as a class-level callable."""

    from snowballstemmer import EnglishStemmer

    snow = EnglishStemmer()
    stem = snow.stemWord  # bound method, callable as snowball.stem(word)
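
A quick sanity check of the wrapper (hypothetical usage, not part of the original example):

print(snowball.stem("stemming"))  # -> "stem"; `stem` is bound to the class-level EnglishStemmer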

@st.cache
def clean_pdf_page(page):  # Cleans a pdftotext page
    """Take a long string representing a page and return the cleaned sentences.

    Returns:
        list -- list of sentences
    """
    return [re.sub(r"\s+", " ", i.strip()) for i in page.split("\n")]


def read_pdf_file(file):
    """Converts a file to a pdftotext object
    
    Arguments:
        file {file} -- PDF File
    
    Returns:
        pdftotext.PDF -- pdftotext representation of the file
    """
    return pdftotext.PDF(file)
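
A minimal usage sketch (not from the original example; "paper.pdf" is a hypothetical file name):

with open("paper.pdf", "rb") as f:  # pdftotext.PDF expects a binary file object
    pdf = read_pdf_file(f)

for page in pdf:  # iterating a pdftotext.PDF yields one string per page
    sentences = clean_pdf_page(page)
    stems = [engStem.stemWord(w) for w in " ".join(sentences).split()]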
Example #3
"""Compute text similarity between Pages."""

import logging
import re
from typing import Dict, List

import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from snowballstemmer import EnglishStemmer

from scrivo.page import Page
from scrivo.utils import logtime

STEMMER = EnglishStemmer()

logger = logging.getLogger(__name__)


# Get a plaintext rendering of a page
def get_page_plaintext(p: Page) -> str:
    """Return a plaintext page rendering for tokenization."""
    # Bit of a hack, but throw in any tags and the page title
    title_text = p.meta["title"]
    tags_text = " ".join(p.meta["tags"])
    page_text = BeautifulSoup(p.html,
                              features="html.parser").get_text(separator=" ",
                                                               strip=True)
    return " ".join([title_text, tags_text, page_text])

import re

from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize
from snowballstemmer import EnglishStemmer, RussianStemmer

# Topic and CleanTopic are project-specific types; their imports are omitted here.

class TopicCleaner:
    """Normalize, tokenize, and stem topic text in Russian and English."""

    def __init__(self):
        self._russian_stemmer = RussianStemmer()
        self._english_stemmer = EnglishStemmer()
        self._russian_stops = stopwords.words("russian")
        self._english_stops = stopwords.words("english")
        # Label blacklist: "company blog", "black hole", "I'm self-promoting"
        self._label_bl = {"блог компании", "черная дыра", "я пиарюсь"}

    def clean(self, topic: Topic) -> CleanTopic:
        text = self.clean_text(topic.text)
        labels = self.clean_labels(topic.tags + topic.hubs)

        return CleanTopic(labels=labels, words=text)

    def clean_text(self, text: str) -> list:
        text = text.lower()
        text = TopicCleaner.delete_non_word_chars(text)
        tokens = TopicCleaner.tokenize_text(text)
        tokens = TopicCleaner.filter_variable_names(tokens)
        tokens = self.filter_stopwords(tokens)
        tokens = self.stemm_text(tokens)
        tokens = TopicCleaner.filter_words_with_repeatable_letters(tokens)
        tokens = TopicCleaner.filter_words_with_unusual_for_language_length(tokens)

        return tokens

    def clean_labels(self, labels: list) -> list:
        return [self.clean_label(label) for label in self.filter_bl_labels(labels)]

    def clean_label(self, label: str) -> str:
        label = label.lower()
        label = label.replace("ё", "е")
        label_words = TopicCleaner.tokenize_text(label)
        label_words = self.stemm_text(label_words)
        return " ".join(label_words)

    def filter_bl_labels(self, labels: list) -> set:
        return set(labels) - self._label_bl

    @staticmethod
    def tokenize_text(text: str) -> list:
        return regexp_tokenize(text, "[\\w']+")

    def stemm_text(self, text: list) -> list:
        # Apply both stemmers in sequence: English first, then Russian.
        stemmed = self._english_stemmer.stemWords(text)
        return self._russian_stemmer.stemWords(stemmed)

    def filter_stopwords(self, text: list) -> list:
        return [word for word in text if word not in self._russian_stops and word not in self._english_stops]

    @staticmethod
    def filter_words_with_repeatable_letters(text: list) -> list:
        # Use re.search (not re.match) so a letter repeated 3+ times anywhere drops the word.
        return [word for word in text if not re.search(r"(.)\1{2}", word)]

    @staticmethod
    def filter_words_with_unusual_for_language_length(text: list) -> list:
        return [word for word in text if TopicCleaner.is_language_usual_word(word)]

    @staticmethod
    def is_language_usual_word(word: str) -> bool:
        # Keep words of 3+ characters, capped at 14 for English words and 24 otherwise.
        length = len(word)
        is_eng = re.match("[a-z]", word)
        return length > 2 and ((not is_eng and length < 25) or (is_eng and length < 15))

    @staticmethod
    def filter_variable_names(text: list) -> list:
        return [word for word in text if "_" not in word]

    @staticmethod
    def delete_non_word_chars(text: str) -> str:
        temp = text.replace("ё", "е")  # normalize Cyrillic "ё" to "е"
        temp = re.sub(r"(&[a-z0-9]*;)", " ", temp)  # HTML-encoded entities
        temp = re.sub(r"(\W|\d)+", " ", temp)  # non-word characters and digits
        temp = re.sub(r"\s+", " ", temp)  # collapse runs of whitespace
        return temp.strip()
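
A hypothetical usage sketch (requires the NLTK stopword lists via nltk.download("stopwords")):

cleaner = TopicCleaner()
tokens = cleaner.clean_text("Running the tests: привет, мир!")
# -> short stemmed tokens, e.g. ["run", "test", "привет", "мир"]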