def __init__(self, corpus="polish"):
    """Initialize the Polish corrector from word statistics.

    :param corpus: either a ready-made ``Counter`` of word frequencies,
        or the name of a corpus whose unigram statistics will be loaded.
    """
    super().__init__()
    # Reuse a caller-supplied frequency table when given one; otherwise
    # load the unigram statistics for the named corpus.
    if isinstance(corpus, Counter):
        word_counts = corpus
    else:
        word_counts = Counter(read_stats(corpus, 1))
    self.WORDS: Counter = word_counts
    # Mapping from accent-stripped forms back to accented words.
    self.NOACCENT: Dict[str, str] = self.__prepare_noaccent()
    # Total token count — normalizes raw counts into probabilities.
    self.N: int = sum(self.WORDS.values())
    self.badword_detector = BadwordDetector(self.WORDS)
def __init__(self, corpus="english"):
    """Initialize the spell corrector.

    :param corpus: either a pre-built ``Counter`` of word frequencies,
        or the name of the corpus whose unigram statistics to load.
        Accepting a ``Counter`` keeps this constructor consistent with
        the other correctors in the package and avoids re-reading the
        statistics file when the caller already holds them.
    """
    super().__init__()
    # Backward-compatible generalization: a plain corpus name still
    # loads unigram stats exactly as before.
    self.WORDS: Counter = corpus if isinstance(
        corpus, Counter) else Counter(read_stats(corpus, 1))
    # Total token count — used to turn raw counts into probabilities.
    self.N: int = sum(self.WORDS.values())
def __init__(self, corpus="english", max_split_length=20):
    """Initialize the word segmenter.

    Args:
        corpus (str): the statistics from which corpus to use for
            the spell correction.
        max_split_length (int): the maximum length that a word can
            have for looking for splits.
    """
    # Raw unigram/bigram frequency tables for the chosen corpus.
    self.unigrams = read_stats(corpus, 1)
    self.bigrams = read_stats(corpus, 2)
    # Total unigram count — normalizes counts into probabilities.
    self.N = sum(self.unigrams.values())
    self.L = max_split_length
    # Unigram distribution falls back to self.unk_probability for
    # unseen words; the bigram distribution has no such fallback.
    self.Pw = Pdist(self.unigrams, self.N, self.unk_probability)
    self.P2w = Pdist(self.bigrams, self.N)
    # Precompiled regex used to split camelCase tokens.
    self.case_split = ExManager().get_compiled()["camel_split"]
def __init__(self, corpus="polish"):
    """Set up the bad-word detector.

    :param corpus: a ``Counter`` of word frequencies, or the name of a
        corpus whose unigram statistics will be loaded.
    """
    # Take the frequency table as-is when one is supplied; otherwise
    # load the unigram statistics for the named corpus.
    if isinstance(corpus, Counter):
        freq_table = corpus
    else:
        freq_table = Counter(read_stats(corpus, 1))
    self.frequencies = freq_table
    # Known bad words, kept as a set for O(1) membership tests.
    self.badwords: Set[str] = set(BADWORD_LIST)
    # Mapping from each bad word to its censored replacement.
    self.censored: Dict[str, str] = self.__build_censored()
from collections import Counter from ekphrasis.classes.polish.badwords import BadwordDetector from ekphrasis.classes.polish.polishcorrect import PolishCorrector from ekphrasis.classes.preprocessor import TextPreProcessor from ekphrasis.classes.tokenizer import SocialTokenizer from ekphrasis.dicts.emoticons import emoticons from ekphrasis.utils.helpers import read_stats polish_unigrams = Counter(read_stats("polish", 1)) text_processor = TextPreProcessor( onstart=[BadwordDetector(corpus=polish_unigrams).correct_text], normalize=[ 'url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number' ], annotate={ "hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored' }, fix_html=True, segmenter="polish", corrector="polish", unpack_hashtags=True, unpack_contractions=False, spell_correct_elong=False, mode="slow", tokenizer=SocialTokenizer(lowercase=True).tokenize, correction_method=PolishCorrector(corpus=polish_unigrams).correct_text, dicts=[emoticons]) sentences = [