Beispiel #1
0
 def __init__(self, corpus="polish"):
     """
     Initialise the word statistics and the helpers derived from them.

     :param corpus: either a ``Counter`` of word counts, which is used
         as-is, or a value handed to ``read_stats(corpus, 1)`` whose
         result is wrapped in a ``Counter``.
     """
     super().__init__()

     # Reuse an already-built Counter instead of loading the statistics.
     if isinstance(corpus, Counter):
         word_counts = corpus
     else:
         word_counts = Counter(read_stats(corpus, 1))
     self.WORDS: Counter = word_counts

     # Built after WORDS is assigned, in case the helper reads it.
     self.NOACCENT: Dict[str, str] = self.__prepare_noaccent()

     # Sum of all counts stored in WORDS.
     self.N: int = sum(self.WORDS.values())

     self.badword_detector = BadwordDetector(self.WORDS)
    def __init__(self, corpus="english"):
        """
        Load the unigram statistics used for spell correction.

        :param corpus: identifies which corpus statistics to load; it is
            passed to ``read_stats(corpus, 1)``.
        """
        super().__init__()

        unigram_stats = read_stats(corpus, 1)
        self.WORDS = Counter(unigram_stats)

        # Sum of all counts stored in WORDS.
        self.N = sum(self.WORDS.values())
Beispiel #3
0
    def __init__(self, corpus="english", max_split_length=20):
        """
        Load the n-gram statistics and build the probability tables.

        Args:
            corpus (str): the corpus whose statistics are used; passed to
                ``read_stats`` for both the 1-gram and the 2-gram tables.
            max_split_length (int): the maximum length that a word can
                have when looking for splits; stored as ``self.L``.
        """
        unigram_stats = read_stats(corpus, 1)
        self.unigrams = unigram_stats
        bigram_stats = read_stats(corpus, 2)
        self.bigrams = bigram_stats

        # Sum of the unigram counts; used as the total for both tables.
        total = sum(unigram_stats.values())
        self.N = total
        self.L = max_split_length

        self.Pw = Pdist(unigram_stats, total, self.unk_probability)
        self.P2w = Pdist(bigram_stats, total)

        compiled_expressions = ExManager().get_compiled()
        self.case_split = compiled_expressions["camel_split"]
Beispiel #4
0
 def __init__(self, corpus="polish"):
     """
     Initialise the frequency table, the badword set and the censored map.

     :param corpus: either a ``Counter`` of word counts, which is used
         as-is, or a value handed to ``read_stats(corpus, 1)`` whose
         result is wrapped in a ``Counter``.
     """
     # Reuse an already-built Counter instead of loading the statistics.
     if isinstance(corpus, Counter):
         word_counts = corpus
     else:
         word_counts = Counter(read_stats(corpus, 1))
     self.frequencies = word_counts

     self.badwords: Set[str] = set(BADWORD_LIST)

     # Built last, in case the helper reads the attributes set above.
     self.censored: Dict[str, str] = self.__build_censored()
Beispiel #5
0
from collections import Counter

from ekphrasis.classes.polish.badwords import BadwordDetector
from ekphrasis.classes.polish.polishcorrect import PolishCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.utils.helpers import read_stats

# Load the Polish 1-gram statistics into a Counter once; the same object is
# passed to both BadwordDetector and PolishCorrector below.
polish_unigrams = Counter(read_stats("polish", 1))

# Text pre-processing pipeline configured with the Polish components.
# NOTE(review): the exact meaning of each keyword is defined by
# TextPreProcessor, which is not visible here -- confirm against its docs.
text_processor = TextPreProcessor(
    # Bound method of a BadwordDetector built from the shared counts.
    onstart=[BadwordDetector(corpus=polish_unigrams).correct_text],
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date',
        'number'
    ],
    # Note: this is a set literal, not a list.
    annotate={
        "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'
    },
    fix_html=True,
    segmenter="polish",
    corrector="polish",
    unpack_hashtags=True,
    unpack_contractions=False,
    spell_correct_elong=False,
    mode="slow",
    # Bound method of a SocialTokenizer created with lowercase=True.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Bound method of a PolishCorrector built from the shared counts.
    correction_method=PolishCorrector(corpus=polish_unigrams).correct_text,
    dicts=[emoticons])

sentences = [