def __init__(self, corpus="polish"):
    """Initialize the Polish corrector from word statistics.

    :param corpus: either a ready-made ``Counter`` of word frequencies,
        or the name of a corpus whose unigram statistics will be loaded.
    """
    super().__init__()
    # Reuse a caller-supplied frequency table when given one; otherwise
    # load the unigram statistics for the named corpus.
    if isinstance(corpus, Counter):
        word_counts = corpus
    else:
        word_counts = Counter(read_stats(corpus, 1))
    self.WORDS: Counter = word_counts
    # Mapping from accent-stripped forms back to accented words.
    self.NOACCENT: Dict[str, str] = self.__prepare_noaccent()
    # Total token count — normalizes raw counts into probabilities.
    self.N: int = sum(self.WORDS.values())
    self.badword_detector = BadwordDetector(self.WORDS)
def __init__(self, corpus="english"):
    """Initialize the spell corrector.

    :param corpus: either a pre-built ``Counter`` of word frequencies,
        or the name of the corpus whose unigram statistics to load.
        Accepting a ``Counter`` keeps this constructor consistent with
        the other correctors in the package and avoids re-reading the
        statistics file when the caller already holds them.
    """
    super().__init__()
    # Backward-compatible generalization: a plain corpus name still
    # loads unigram stats exactly as before.
    self.WORDS: Counter = corpus if isinstance(
        corpus, Counter) else Counter(read_stats(corpus, 1))
    # Total token count — used to turn raw counts into probabilities.
    self.N: int = sum(self.WORDS.values())
def __init__(self, corpus="english", max_split_length=20):
    """Initialize the word segmenter.

    Args:
        corpus (str): the statistics from which corpus to use for
            the spell correction.
        max_split_length (int): the maximum length that a word can
            have for looking for splits.
    """
    # Raw unigram/bigram frequency tables for the chosen corpus.
    self.unigrams = read_stats(corpus, 1)
    self.bigrams = read_stats(corpus, 2)
    # Total unigram count — normalizes counts into probabilities.
    self.N = sum(self.unigrams.values())
    self.L = max_split_length
    # Unigram distribution falls back to self.unk_probability for
    # unseen words; the bigram distribution has no such fallback.
    self.Pw = Pdist(self.unigrams, self.N, self.unk_probability)
    self.P2w = Pdist(self.bigrams, self.N)
    # Precompiled regex used to split camelCase tokens.
    self.case_split = ExManager().get_compiled()["camel_split"]
def __init__(self, corpus="polish"):
    """Set up the bad-word detector.

    :param corpus: a ``Counter`` of word frequencies, or the name of a
        corpus whose unigram statistics will be loaded.
    """
    # Take the frequency table as-is when one is supplied; otherwise
    # load the unigram statistics for the named corpus.
    if isinstance(corpus, Counter):
        freq_table = corpus
    else:
        freq_table = Counter(read_stats(corpus, 1))
    self.frequencies = freq_table
    # Known bad words, kept as a set for O(1) membership tests.
    self.badwords: Set[str] = set(BADWORD_LIST)
    # Mapping from each bad word to its censored replacement.
    self.censored: Dict[str, str] = self.__build_censored()
from collections import Counter from ekphrasis.classes.polish.badwords import BadwordDetector from ekphrasis.classes.polish.polishcorrect import PolishCorrector from ekphrasis.classes.preprocessor import TextPreProcessor from ekphrasis.classes.tokenizer import SocialTokenizer from ekphrasis.dicts.emoticons import emoticons from ekphrasis.utils.helpers import read_stats polish_unigrams = Counter(read_stats("polish", 1)) text_processor = TextPreProcessor( onstart=[BadwordDetector(corpus=polish_unigrams).correct_text], normalize=[ 'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number' ], annotate={ "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored' }, fix_html=True, segmenter="polish", corrector="polish", unpack_hashtags=True, unpack_contractions=False, spell_correct_elong=False, mode="slow", tokenizer=SocialTokenizer(lowercase=True).tokenize, correction_method=PolishCorrector(corpus=polish_unigrams).correct_text, dicts=[emoticons]) sentences = [