def remove_brackets(s: TextSeries) -> TextSeries:
    """
    Remove bracketed content together with the brackets themselves.

    The series is passed, in order, through the round ``()``, curly ``{}``,
    square ``[]`` and angle ``<>`` bracket removers.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Texthero (round) [square] [curly] [angle]")
    >>> hero.remove_brackets(s)
    0    Texthero
    dtype: object

    See also
    --------
    :meth:`remove_round_brackets`
    :meth:`remove_curly_brackets`
    :meth:`remove_square_brackets`
    :meth:`remove_angle_brackets`

    """
    removers = (
        remove_round_brackets,
        remove_curly_brackets,
        remove_square_brackets,
        remove_angle_brackets,
    )
    result = s
    # Order matters only in that it mirrors the documented sequence.
    for remover in removers:
        result = result.pipe(remover)
    return result
def fillna(s: TextSeries, replace_string="") -> TextSeries:
    """
    Replace missing values with an empty (or the given) string.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    replace_string : str, optional, default=""
        Value written in place of every missing entry.

    Returns
    -------
    The filled series, cast with ``astype("str")``.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["I'm", np.nan, pd.NA, "You're"])
    >>> hero.fillna(s)
    0       I'm
    1
    2
    3    You're
    dtype: object
    >>> hero.fillna(s, "Missing")
    0        I'm
    1    Missing
    2    Missing
    3     You're
    dtype: object

    """
    filled = s.fillna(replace_string)
    as_text = filled.astype("str")
    return as_text
def replace_stopwords(s: TextSeries,
                      symbol: str,
                      stopwords: Optional[Set[str]] = None) -> TextSeries:
    """
    Replace all instances of `stopwords` with `symbol`.

    When no stopword set is supplied, ``_stopwords.DEFAULT`` is used
    (documented as NLTK's english stopwords of 179 words).

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    symbol : str
        Character(s) to replace words with.

    stopwords : Set[str], optional, default=None
        Set of stopword strings to replace. If not passed, the default
        stopword set is used.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle")
    >>> hero.replace_stopwords(s, "X")
    0    X book X X jungle
    dtype: object

    """
    words = _stopwords.DEFAULT if stopwords is None else stopwords
    # The helper receives each text followed by (stopwords, symbol).
    extra_args = (words, symbol)
    return s.apply(_replace_stopwords, args=extra_args)
def replace_urls_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every URL-like token in the series with its placeholder.

    A URL here is any run of non-whitespace characters starting with
    "http". Each distinct URL is passed once, in order of first
    appearance, to ``_add_url_placeholder``; the string it returns is
    substituted for every occurrence of that URL.

    The substitution is done in a single regex pass. Replacing the URLs
    one after another as literal substrings would corrupt a URL that
    contains an earlier-seen URL (e.g. ``http://a.io`` inside
    ``http://a.io/page``), and could also rewrite text inside placeholders
    that were already inserted.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new series; the input is not modified.

    """
    url_pattern = r"(http\S+)"

    # One helper call per unique URL, in first-seen order.
    urls_found = s.str.extractall(url_pattern)[0].unique()
    placeholders = {url: _add_url_placeholder(url) for url in urls_found}

    # extractall and replace use the same pattern, so every match seen here
    # is guaranteed to be a key of ``placeholders``.
    return s.str.replace(
        url_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )
def has_content(s: TextSeries) -> TextSeries:
    r"""
    Return a Boolean Pandas Series indicating if the rows have content.

    A row has content when it is not missing and is not the empty string
    once :meth:`remove_whitespace` has been applied to it.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["content", np.nan, "\t\n", " "])
    >>> hero.has_content(s)
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    """
    stripped = s.pipe(remove_whitespace)
    is_non_empty = stripped != ""
    is_present = s.notna()
    return is_non_empty & is_present
def replace_mentions_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every ``@mention`` in the series with its placeholder.

    A mention is ``@`` followed by one or more ASCII letters or digits.
    Each distinct mention is passed once, in order of first appearance, to
    ``_add_mention_placeholder``; the string it returns is substituted for
    every occurrence of that mention.

    The substitution is done in a single regex pass. Replacing the
    mentions one after another as literal substrings would corrupt a
    mention that starts with an earlier-seen one (e.g. ``@bob`` inside
    ``@bobby``).

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new series; the input is not modified.

    """
    mention_pattern = r"(@[a-zA-Z0-9]+)"

    # One helper call per unique mention, in first-seen order.
    mentions_found = s.str.extractall(mention_pattern)[0].unique()
    placeholders = {
        mention: _add_mention_placeholder(mention)
        for mention in mentions_found
    }

    # extractall and replace use the same pattern, so every match seen here
    # is guaranteed to be a key of ``placeholders``.
    return s.str.replace(
        mention_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )
def replace_hashtags_w_placeholder(s: TextSeries) -> TextSeries:
    """
    Replace every ``#hashtag`` in the series with its placeholder.

    A hashtag is ``#`` followed by one or more ASCII letters, digits or
    underscores. Each distinct hashtag is passed once, in order of first
    appearance, to ``_add_hashtag_placeholder``; the string it returns is
    substituted for every occurrence of that hashtag.

    The substitution is done in a single regex pass. Replacing the
    hashtags one after another as literal substrings would corrupt a
    hashtag that starts with an earlier-seen one (e.g. ``#ai`` inside
    ``#aircraft``).

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A new series; the input is not modified.

    """
    hashtag_pattern = r"(#[a-zA-Z0-9_]+)"

    # One helper call per unique hashtag, in first-seen order.
    hashtags_found = s.str.extractall(hashtag_pattern)[0].unique()
    placeholders = {
        hashtag: _add_hashtag_placeholder(hashtag)
        for hashtag in hashtags_found
    }

    # extractall and replace use the same pattern, so every match seen here
    # is guaranteed to be a key of ``placeholders``.
    return s.str.replace(
        hashtag_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )
def too_many_uppercase(s: TextSeries) -> TextSeries:
    """
    Tell, for each text, whether it has too many uppercase characters.

    The decision for a single text is delegated to the
    ``_too_many_uppercase`` helper, which is applied element-wise.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer leTTers")
    >>> hero.too_many_uppercase(s)
    0    True
    dtype: object

    """
    flags = s.apply(_too_many_uppercase)
    return flags
def count_whitespaces(s: TextSeries) -> TextSeries:
    """
    Count the number of whitespaces in each text of a series.

    The count for a single text is delegated to the ``_count_whitespaces``
    helper, which is applied element-wise.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer letters")
    >>> hero.count_whitespaces(s)
    0    6
    dtype: object

    """
    counts = s.apply(_count_whitespaces)
    return counts
def count_uppercase(s: TextSeries) -> TextSeries:
    """
    Compute the uppercase count of each text in a series.

    The count for a single text is delegated to the ``_count_uppercase``
    helper, which is applied element-wise; the exact counting rule lives
    in that helper.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer letters")
    >>> hero.count_uppercase(s)
    0    5
    dtype: object

    """
    counts = s.apply(_count_uppercase)
    return counts
def lowercase_restricted(s: TextSeries) -> TextSeries:
    """
    Lowercase the texts in a series, except those with too many
    uppercase chars.

    Each text is handled by the ``_lowercase_restricted`` helper, which
    is applied element-wise.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("This is NeW YoRk wIth upPer leTTers")
    >>> hero.lowercase_restricted(s)
    0    this is new york with upper letters
    dtype: object

    """
    lowered = s.apply(_lowercase_restricted)
    return lowered
def fillna(s: TextSeries, replace_string="") -> TextSeries:
    """
    Replace missing values with an empty (or the given) string.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    replace_string : str, optional, default=""
        Value written in place of every missing entry. With the default,
        missing values become empty strings.

    Returns
    -------
    The filled series, cast with ``astype("str")``.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(["I'm", np.nan, pd.NA, "You're"])
    >>> hero.fillna(s)
    0       I'm
    1
    2
    3    You're
    dtype: object
    >>> hero.fillna(s, "Missing")
    0        I'm
    1    Missing
    2    Missing
    3     You're
    dtype: object

    """
    return s.fillna(replace_string).astype("str")
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series by running it through a
    pipeline of functions.

    When no pipeline is given (or it is empty), the one returned by
    ``get_default_pipeline()`` is used, documented as:

     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_html_tags`
     5. :meth:`texthero.preprocessing.remove_punctuation`
     6. :meth:`texthero.preprocessing.remove_diacritics`
     7. :meth:`texthero.preprocessing.remove_stopwords`
     8. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
        Specific pipeline to clean the texts. Has to be a list
        of functions taking as input and returning as output
        a Pandas Series. If None, the default pipeline
        is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object

    """
    steps = pipeline or get_default_pipeline()

    cleaned = s
    # Each step receives the output of the previous one.
    for step in steps:
        cleaned = cleaned.pipe(step)
    return cleaned
def remove_diacritics(s: TextSeries) -> TextSeries:
    """
    Remove all diacritics and accents.

    The series is first cast with ``astype("unicode")``; the
    ``_remove_diacritics`` helper is then applied to every element and
    the resulting series is returned.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(
    ...     "Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
    >>> hero.remove_diacritics(s)[0]
    'Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس'

    """
    as_unicode = s.astype("unicode")
    stripped = as_unicode.apply(_remove_diacritics)
    return stripped
def place_emojis(s: TextSeries) -> TextSeries:
    """
    Put emojis back into the texts, replacing placeholders with emojis.

    Each text is handled by the ``_place_emojis`` helper, which is
    applied element-wise.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle :smiling_face_with_horns:")
    >>> hero.place_emojis(s)
    0    the book of the jungle 😈
    dtype: object

    """
    restored = s.apply(_place_emojis)
    return restored
def restore_tweets(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Run a text-based Pandas Series of tweets through the Twitter
    post-processing pipeline.

    When no pipeline is given (or it is empty), the one returned by
    ``get_twitter_post_pipeline()`` is used. It is looked up at call
    time: a default evaluated in the signature would be computed once at
    import and the same object shared by every call.

    NOTE(review): the previous docstring listed ``fillna``,
    ``replace_emojis`` and ``replace_urls`` as the default steps and its
    example called ``hero.clean_tweets``; both look copied from
    ``clean_tweets``. The actual default steps are defined by
    ``get_twitter_post_pipeline()`` - confirm and document them here.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
        Specific pipeline to apply to the texts. Has to be a list
        of functions taking as input and returning as output
        a Pandas Series. If None, the default pipeline
        is used.

    Returns
    -------
    The series obtained after applying every pipeline step in order.

    Examples
    --------
    With an explicit pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle")
    >>> hero.restore_tweets(s, pipeline=[lambda x: x.str.upper()])
    0    THE BOOK OF THE JUNGLE
    dtype: object

    """
    if not pipeline:
        pipeline = get_twitter_post_pipeline()

    # Each step receives the output of the previous one.
    for f in pipeline:
        s = s.pipe(f)
    return s
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series by running it through a
    pipeline of functions.

    When no pipeline is given (or it is empty), the one returned by
    ``get_default_pipeline()`` is used, documented as:

     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[[Pandas Series], Pandas Series]],
               optional, default=None
        Specific pipeline to clean the texts: a list of functions, each
        taking and returning a Pandas Series. If None, the default
        pipeline is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object

    """
    steps = pipeline or get_default_pipeline()

    cleaned = s
    # Each step receives the output of the previous one.
    for step in steps:
        cleaned = cleaned.pipe(step)
    return cleaned
def check_spelling(s: TextSeries) -> TextSeries:
    """
    Apply the ``_check_spelling`` helper to every text in the series.

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    Returns
    -------
    A series holding, for each element, whatever ``_check_spelling``
    returns for it.

    """
    checked = s.apply(_check_spelling)
    return checked