Esempio n. 1
0
def remove_brackets(s: TextSeries) -> TextSeries:
    """
    Strip bracketed content, together with the brackets themselves.

    The series is passed, in order, through the round, curly, square and
    angle bracket removers, so content inside (), {}, [] and <> is dropped.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Texthero (round) [square] [curly] [angle]")
    >>> hero.remove_brackets(s)
    0    Texthero
    dtype: object

    See also
    --------
    :meth:`remove_round_brackets`
    :meth:`remove_curly_brackets`
    :meth:`remove_square_brackets`
    :meth:`remove_angle_brackets`

    """

    # Same order as the individual removers have always been chained in.
    removers = (
        remove_round_brackets,
        remove_curly_brackets,
        remove_square_brackets,
        remove_angle_brackets,
    )

    cleaned = s
    for strip_one_kind in removers:
        cleaned = cleaned.pipe(strip_one_kind)
    return cleaned
Esempio n. 2
0
def fillna(s: TextSeries, replace_string="") -> TextSeries:
    """
    Fill missing values with a string and cast the result to ``str``.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    replace_string : str, optional, default=""
        Text written in place of every missing value.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["I'm", np.nan, pd.NA, "You're"])
    >>> hero.fillna(s)
    0       I'm
    1          
    2          
    3    You're
    dtype: object
    >>> hero.fillna(s, "Missing")
    0        I'm
    1    Missing
    2    Missing
    3     You're
    dtype: object
    """

    filled = s.fillna(replace_string)
    # The cast also turns any remaining non-string value into its str form.
    return filled.astype("str")
Esempio n. 3
0
def replace_stopwords(s: TextSeries,
                      symbol: str,
                      stopwords: Optional[Set[str]] = None) -> TextSeries:
    """
    Replace every stopword in each text with `symbol`.

    When no stopword set is supplied, ``_stopwords.DEFAULT`` is used
    (NLTK's English stopwords, 179 words).

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    symbol: str
        Character(s) to replace words with.

    stopwords : Set[str], optional, default=None
        Set of stopword strings to replace. If not passed,
        by default uses NLTK English stopwords.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle")
    >>> hero.replace_stopwords(s, "X")
    0    X book X X jungle
    dtype: object

    """

    # Only ``None`` selects the default; an explicitly empty set is honoured.
    active_stopwords = _stopwords.DEFAULT if stopwords is None else stopwords
    extra_args = (active_stopwords, symbol)
    return s.apply(_replace_stopwords, args=extra_args)
Esempio n. 4
0
def replace_urls_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every URL in the series with its placeholder.

    A URL is any run of non-whitespace characters that starts with
    ``http``. The placeholder text for a URL is whatever
    ``_add_url_placeholder`` returns for it; that helper is called once per
    distinct URL, in order of first appearance.

    All substitutions happen in a single regex pass. Replacing the URLs one
    after another as literal substrings would corrupt any URL that contains
    another found URL (for example ``http://a.io`` inside
    ``http://a.io/page``).

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new Series; the input is not modified.
    """
    url_pattern = r"(http\S+)"

    distinct_urls = s.str.extractall(url_pattern)[0].unique()
    if len(distinct_urls) == 0:
        # Nothing to rewrite: hand back an untouched copy.
        return s.copy()

    placeholder_for = {url: _add_url_placeholder(url) for url in distinct_urls}

    # Every regex match was also seen by extractall, so the lookup is total.
    return s.str.replace(
        url_pattern,
        lambda match: placeholder_for[match.group(0)],
        regex=True,
    )
Esempio n. 5
0
def has_content(s: TextSeries) -> TextSeries:
    r"""
    Tell, row by row, whether a text actually contains something.

    A row counts as having content when it is not missing and does not
    compare equal to the empty string after :meth:`remove_whitespace`.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["content", np.nan, "\t\n", " "])
    >>> hero.has_content(s)
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    """
    not_blank = s.pipe(remove_whitespace) != ""
    not_missing = s.notna()
    return not_blank & not_missing
Esempio n. 6
0
def replace_mentions_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every ``@mention`` in the series with its placeholder.

    A mention is ``@`` followed by one or more ASCII letters or digits.
    The placeholder text for a mention is whatever
    ``_add_mention_placeholder`` returns for it; that helper is called once
    per distinct mention, in order of first appearance.

    All substitutions happen in a single regex pass. Replacing the mentions
    one after another as literal substrings would let a short handle such
    as ``@bob`` overwrite the beginning of ``@bobby``.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new Series; the input is not modified.
    """
    mention_pattern = r"(@[a-zA-Z0-9]+)"

    distinct_mentions = s.str.extractall(mention_pattern)[0].unique()
    if len(distinct_mentions) == 0:
        # Nothing to rewrite: hand back an untouched copy.
        return s.copy()

    placeholder_for = {
        mention: _add_mention_placeholder(mention)
        for mention in distinct_mentions
    }

    # Every regex match was also seen by extractall, so the lookup is total.
    return s.str.replace(
        mention_pattern,
        lambda match: placeholder_for[match.group(0)],
        regex=True,
    )
Esempio n. 7
0
def replace_hashtags_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every ``#hashtag`` in the series with its placeholder.

    A hashtag is ``#`` followed by one or more ASCII letters, digits or
    underscores. The placeholder text for a hashtag is whatever
    ``_add_hashtag_placeholder`` returns for it; that helper is called once
    per distinct hashtag, in order of first appearance.

    All substitutions happen in a single regex pass. Replacing the hashtags
    one after another as literal substrings would let a short tag such as
    ``#py`` overwrite the beginning of ``#python``.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new Series; the input is not modified.
    """
    hashtag_pattern = r"(#[a-zA-Z0-9_]+)"

    distinct_hashtags = s.str.extractall(hashtag_pattern)[0].unique()
    if len(distinct_hashtags) == 0:
        # Nothing to rewrite: hand back an untouched copy.
        return s.copy()

    placeholder_for = {
        hashtag: _add_hashtag_placeholder(hashtag)
        for hashtag in distinct_hashtags
    }

    # Every regex match was also seen by extractall, so the lookup is total.
    return s.str.replace(
        hashtag_pattern,
        lambda match: placeholder_for[match.group(0)],
        regex=True,
    )
Esempio n. 8
0
def too_many_uppercase(s: TextSeries) -> TextSeries:
    """
    Flag the texts that contain too many uppercase characters.

    The verdict for each text is produced by ``_too_many_uppercase``,
    which is applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer leTTers")
    >>> hero.too_many_uppercase(s)
    0    True
    dtype: object
    """
    flags = s.apply(_too_many_uppercase)
    return flags
Esempio n. 9
0
def count_whitespaces(s: TextSeries) -> TextSeries:
    """
    Count the whitespaces of every text in a series.

    The count for each text is produced by ``_count_whitespaces``,
    which is applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer letters")
    >>> hero.count_whitespaces(s)
    0    6
    dtype: object
    """
    counts = s.apply(_count_whitespaces)
    return counts
Esempio n. 10
0
def count_uppercase(s: TextSeries) -> TextSeries:
    """
    Count the uppercase characters of every text in a series.

    The count for each text is produced by ``_count_uppercase``,
    which is applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer letters")
    >>> hero.count_uppercase(s)
    0    5
    dtype: object
    """
    counts = s.apply(_count_uppercase)
    return counts
Esempio n. 11
0
def lowercase_restricted(s: TextSeries) -> TextSeries:
    """
    Lowercase the texts of a series, sparing those with too many uppercase
    characters.

    The per-text decision and conversion are done by
    ``_lowercase_restricted``, which is applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer leTTers")
    >>> hero.lowercase_restricted(s)
    0    this is new york with upper letters
    dtype: object
    """

    lowered = s.apply(_lowercase_restricted)
    return lowered
Esempio n. 12
0
def fillna(s: TextSeries) -> TextSeries:
    """
    Fill missing values with the empty string and cast the result to ``str``.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["I'm", np.nan, pd.NA, "You're"])
    >>> hero.fillna(s)
    0       I'm
    1
    2
    3    You're
    dtype: object
    """
    without_gaps = s.fillna("")
    # The cast also turns any remaining non-string value into its str form.
    return without_gaps.astype("str")
Esempio n. 13
0
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series by running it through a pipeline
    of cleaning functions, one after the other.

     Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_html_tags`
     5. :meth:`texthero.preprocessing.remove_punctuation`
     6. :meth:`texthero.preprocessing.remove_diacritics`
     7. :meth:`texthero.preprocessing.remove_stopwords`
     8. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Specific pipeline to clean the texts. Has to be a list
       of functions taking as input and returning as output
       a Pandas Series. If None (or empty), the default pipeline
       is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object
    """

    # Any falsy pipeline (None, empty list) falls back to the default one.
    steps = pipeline or get_default_pipeline()

    result = s
    for step in steps:
        result = result.pipe(step)
    return result
Esempio n. 14
0
def remove_diacritics(s: TextSeries) -> TextSeries:
    """
    Strip diacritics and accents from every text.

    The series is first cast with ``astype("unicode")`` and
    ``_remove_diacritics`` is then applied to each element.
    A new, cleaned Pandas Series is returned.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(
    ...     "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
    >>> hero.remove_diacritics(s)[0]
    'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس'

    """
    as_text = s.astype("unicode")
    return as_text.apply(_remove_diacritics)
Esempio n. 15
0
def place_emojis(s: TextSeries) -> TextSeries:
    """
    Put emojis back into the texts, swapping placeholders for emojis.

    The substitution for each text is done by ``_place_emojis``, which is
    applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle :smiling_face_with_horns:")
    >>> hero.place_emojis(s)
    0    the book of the jungle 😈
    dtype: object

    """

    with_emojis = s.apply(_place_emojis)
    return with_emojis
Esempio n. 16
0
def restore_tweets(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Post-process a text-based Pandas Series of tweets by running it through
    a pipeline of functions, one after the other.

    When no pipeline is given, the one returned by
    ``get_twitter_post_pipeline()`` is used. It is fetched at call time, so
    the default is never a list object shared between calls.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Specific pipeline to apply to the texts. Has to be a list
       of functions taking as input and returning as output
       a Pandas Series. If None (or empty), the default pipeline
       is used.

    Examples
    --------
    With a custom pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("The Book Of The Jungle")
    >>> hero.restore_tweets(s, pipeline=[lambda x: x.str.lower()])
    0    the book of the jungle
    dtype: object
    """

    # Resolve the default lazily instead of in the signature: a default
    # evaluated at definition time is built once at import and then shared
    # (and mutable) across every call.
    if not pipeline:
        pipeline = get_twitter_post_pipeline()

    for f in pipeline:
        s = s.pipe(f)
    return s
Esempio n. 17
0
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series by piping it through each
    function of a cleaning pipeline in turn.

     Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[[Pandas Series], Pandas Series]]
       Specific pipeline used to clean the texts. When it is None or
       empty, the default pipeline is used instead.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object
    """

    if pipeline:
        transforms = pipeline
    else:
        # None and empty pipelines both select the default one.
        transforms = get_default_pipeline()

    cleaned = s
    for transform in transforms:
        cleaned = cleaned.pipe(transform)
    return cleaned
Esempio n. 18
0
def check_spelling(s: TextSeries) -> TextSeries:
    """
    Run the spell checker on every text of a series.

    ``_check_spelling`` is applied element-wise; the returned series holds
    whatever that helper yields for each text.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`
    """
    checked = s.apply(_check_spelling)
    return checked