Example 1
    def process_text(self, text):
        """Split a tweet into word tokens, removing stopwords, stray
        punctuation, possessive "'s" suffixes, and bare numbers."""
        stopwords = {word.lower() for word in self.stopwords}
        regexp = re.compile(r'(' + '|'.join(TWEET_REGEXES) + ')',
                            re.VERBOSE | re.IGNORECASE)

        words = re.findall(regexp, text)
        # remove stopwords
        words = [word for word in words if word.lower() not in stopwords]

        words = [
            word for word in words
            if word not in FRONT_PUNCTUATION + USELESS_PUNCTUATION
        ]
        # remove 's
        words = [
            word[:-2] if word.lower().endswith("'s") else word
            for word in words
        ]
        # remove numbers
        words = [word for word in words if not word.isdigit()]

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
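The filtering chain is easy to trace end to end. A minimal standalone sketch, using hypothetical stand-ins for the module-level constants `TWEET_REGEXES`, `FRONT_PUNCTUATION`, and `USELESS_PUNCTUATION` (their real definitions live elsewhere in the module):

    import re

    # Hypothetical stand-ins for the module-level constants used above.
    TWEET_REGEXES = [r"\w[\w']+", r"[#@]\w+"]   # plain words, hashtags, mentions
    FRONT_PUNCTUATION = ["'", '"']
    USELESS_PUNCTUATION = [",", ".", "!", "?"]
    stopwords = {"is", "the"}

    regexp = re.compile(r'(' + '|'.join(TWEET_REGEXES) + ')',
                        re.VERBOSE | re.IGNORECASE)

    text = "Python's wordcloud is the best! 2024 #nlp @user"
    words = re.findall(regexp, text)
    words = [w for w in words if w.lower() not in stopwords]
    words = [w for w in words if w not in FRONT_PUNCTUATION + USELESS_PUNCTUATION]
    words = [w[:-2] if w.lower().endswith("'s") else w for w in words]  # strip 's
    words = [w for w in words if not w.isdigit()]                       # drop numbers
    print(words)  # ['Python', 'wordcloud', 'best', '#nlp', '@user']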
Example 2
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Splits a long text into words.
        If the `persian_normalize` attribute is True, normalizes `text` with the Hazm Normalizer.
        If the `include_numbers` attribute is False, removes all Persian, English, and Arabic numbers from
        `text`.
        :param text: The text to process.
        :return: A dictionary mapping words to their frequencies.
        """
        flags = (
            re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
            else 0)

        if self.persian_normalize:
            normalizer = Normalizer()
            text = normalizer.normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        if self.regexp:
            words = re.findall(self.regexp, text, flags)
        else:
            words = word_tokenize(text)

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
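The number-removal substitution covers three digit scripts in a single character class: ASCII `0-9`, Extended Arabic-Indic (Persian) digits `\u06F0`-`\u06F9`, and Arabic-Indic digits `\u0660`-`\u0669`. A quick standalone check of just that step:

    import re

    text = "سال 2024 / ۱۴۰۳ / ٣٤"
    cleaned = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)
    print(cleaned)  # 'سال  /  / ' (Western, Persian, and Arabic digits all removed)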
Example 3
    def process_text(self, text):
        """Splits a long text into words, eliminates the stopwords.

        Parameters
        ----------
        text : string
            The text to be processed.

        Returns
        -------
        words : dict (string, int)
            Word tokens with associated frequency.

        .. versionchanged:: 1.2.2
            Changed return type from list of tuples to dict.

        Notes
        -----
        There are better ways to do word tokenization, but I don't want to
        include all those things.
        """

        stopwords = {word.lower() for word in self.stopwords}

        flags = (re.UNICODE
                 if sys.version < '3' and type(text) is unicode else 0)
        regexp = self.regexp if self.regexp is not None else r"\w[\w']+"

        words = re.findall(regexp, text, flags)
        # remove stopwords
        words = [word for word in words if word.lower() not in stopwords]
        # remove 's
        words = [
            word[:-2] if word.lower().endswith("'s") else word
            for word in words
        ]
        # remove numbers
        words = [word for word in words if not word.isdigit()]
        # remove arabic characters
        if self.only_persian:
            words = [self.remove_ar(word) for word in words]

        if self.collocations:
            word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
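The default pattern `\w[\w']+` requires at least two word characters, so single-letter tokens never make it into `words`. A minimal sketch of the default tokenization followed by the possessive and number filters:

    import re

    text = "Python's regex is a joy in 2024"
    words = re.findall(r"\w[\w']+", text)   # 'a' is dropped: too short to match
    words = [w[:-2] if w.lower().endswith("'s") else w for w in words]
    words = [w for w in words if not w.isdigit()]
    print(words)  # ['Python', 'regex', 'is', 'joy', 'in']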
Example 4
    def process_text(self, text: str) -> Dict[str, int]:
        """
        Splits a long text into words.
        If the `persian_normalize` attribute is True, normalizes `text` with the Hazm Normalizer.
        If the `include_numbers` attribute is False, removes all Persian, English, and Arabic numbers from
        `text`.
        Note that this method does not remove stopwords from the input.
        :param text: The text to process.
        :return: A dictionary mapping words to their frequencies.
        """
        flags = (
            re.UNICODE if version < '3' and type(text) is unicode  # noqa: F821
            else 0)

        if self.remove_unhandled_utf_characters:
            text = WordCloudFa.unhandled_characters_regex.sub(r'', text)

        if self.persian_normalize:
            normalizer = Normalizer()
            text = normalizer.normalize(text)
        if not self.include_numbers:
            text = re.sub(r"[0-9\u06F0-\u06F9\u0660-\u0669]", "", text)

        if self.regexp:
            words = re.findall(self.regexp, text, flags)
        else:
            words = word_tokenize(text)

        if self.collocations:
            # Stopwords are already removed in WordCloudFa, so there is no need to pass them to this function.
            word_counts = unigrams_and_bigrams(words, [],
                                               self.normalize_plurals,
                                               self.collocation_threshold)
        else:
            word_counts, _ = process_tokens(words, self.normalize_plurals)

        return word_counts
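When `collocations` is enabled, `unigrams_and_bigrams` counts two-word phrases alongside single tokens; the extra arguments here are an (empty) stopword list, the plural-normalization flag, and the collocation score threshold. A rough sketch of the underlying idea only, not the library's actual implementation, which also scores each bigram statistically and keeps only those above the threshold:

    from collections import Counter

    def naive_unigrams_and_bigrams(words):
        # Count single tokens plus adjacent pairs; the real function
        # keeps a bigram only when its collocation score beats the
        # configured threshold.
        counts = Counter(words)
        counts.update(' '.join(pair) for pair in zip(words, words[1:]))
        return dict(counts)

    print(naive_unigrams_and_bigrams(["word", "cloud", "word", "cloud"]))
    # {'word': 2, 'cloud': 2, 'word cloud': 2, 'cloud word': 1}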