Example #1
def term_frequency(s: pd.Series,
                   max_features: Optional[int] = None,
                   return_feature_names=False):
    """
    Represent a text-based Pandas Series using term frequency.

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    return_feature_names : bool, default=False
        If True, return a tuple (*term_frequency_series*, *feature_names*)


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s)
    0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object
    
    To return the *feature_names*:
    
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s, return_feature_names=True)
    (0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        lowercase=False,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    s = pd.Series(tf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
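The TODO at the top of this example notes that the function could be rewritten without sklearn. The following is a minimal, hypothetical sketch of what such a rewrite might look like, using only collections.Counter and plain pandas; term_frequency_no_sklearn is an illustrative name, not part of texthero.

from collections import Counter
from typing import List, Optional

import pandas as pd


def term_frequency_no_sklearn(s: pd.Series,
                              max_features: Optional[int] = None):
    """Return per-document count vectors plus the vocabulary, without sklearn."""
    # Count every token over the whole (already tokenized) Series.
    corpus_counts = Counter(token for doc in s for token in doc)
    # Keep the max_features most frequent tokens, sorted alphabetically
    # to mimic CountVectorizer's feature ordering.
    vocabulary = sorted(t for t, _ in corpus_counts.most_common(max_features))

    def vectorize(doc: List[str]) -> List[int]:
        doc_counts = Counter(doc)
        return [doc_counts[t] for t in vocabulary]

    return s.apply(vectorize), vocabulary


s = pd.Series([["Sentence", "one"], ["Sentence", "two"]])
vectors, features = term_frequency_no_sklearn(s)
print(features)  # ['Sentence', 'one', 'two']
print(vectors)
# 0    [1, 1, 0]
# 1    [1, 0, 1]
# dtype: object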
Example #2
    def test_idf_single_not_lowercase(self):
        tfidf_single_smooth = 0.7071067811865475  # TODO

        s = pd.Series("ONE one")
        s = preprocessing.tokenize(s)
        s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
        self.assertEqual(representation.tfidf(s), s_true)
Example #3
    def test_most_similar_simple(self):
        s = pd.Series(["one one one"])
        s = preprocessing.tokenize(s)
        df_embeddings = representation.word2vec(s, min_count=1, seed=1)

        to = "one"
        most_similar = representation.most_similar(df_embeddings, to)

        self.assertEqual(most_similar.shape, (1, ))
Example #4
    def test_tfidf_formula(self):
        s = pd.Series(["Hi Bye", "Test Bye Bye"])
        s = preprocessing.tokenize(s)
        s_true_index = pd.MultiIndex.from_tuples([(0, "Bye"), (0, "Hi"),
                                                  (1, "Bye"), (1, "Test")], )
        s_true = pd.Series([_tfidf(x[1], s, x[0]) for x in s_true_index],
                           index=s_true_index).astype("Sparse")

        self.assertEqual(representation.tfidf(s), s_true)
Example #5
    def test_word2vec(self):
        s = pd.Series(
            ["today is a beautiful day", "today is not that beautiful"])
        df_true = pd.DataFrame(
            [[0.0] * 300] * 7,
            index=["a", "beautiful", "day", "is", "not", "that", "today"],
        )

        s = preprocessing.tokenize(s)

        df_embedding = representation.word2vec(s, min_count=1, seed=1)

        self.assertEqual(type(df_embedding), pd.DataFrame)

        self.assertEqual(df_embedding.shape, df_true.shape)
Example #6
 def test_tfidf_formula(self):
     s = pd.Series(["Hi Bye", "Test Bye Bye"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([
         [
             1.0 * (math.log(3 / 3) + 1),
             1.0 * (math.log(3 / 2) + 1),
             0.0 * (math.log(3 / 2) + 1),
         ],
         [
             2.0 * (math.log(3 / 3) + 1),
             0.0 * (math.log(3 / 2) + 1),
             1.0 * (math.log(3 / 2) + 1),
         ],
     ])
     s_true.rename_axis("document", inplace=True)
     self.assertEqual(representation.tfidf(s), s_true)
Example #7
 def test_tfidf_single_document(self):
     s = pd.Series("a", index=["yo"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1]], index=["yo"])
     s_true.rename_axis("document", inplace=True)
     self.assertEqual(representation.tfidf(s), s_true)
Example #8
 def test_term_frequency_not_lowercase(self):
     s = pd.Series(["one ONE"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 1]])
     self.assertEqual(representation.term_frequency(s), s_true)
Example #9
 def test_term_frequency_punctuation_are_kept(self):
     s = pd.Series(["one !"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 1]])
     self.assertEqual(representation.term_frequency(s), s_true)
Example #10
 def test_tokenize_split_punctuation(self):
     s = pd.Series(["ready. set, go!"])
     s_true = pd.Series([["ready", ".", "set", ",", "go", "!"]])
     self.assertEqual(preprocessing.tokenize(s), s_true)
Example #11
 def test_count_not_lowercase(self):
     s = pd.Series(["one ONE"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 1]])
     self.assertEqual(representation.count(s), s_true)
Example #12
def term_frequency(
    s: pd.Series,
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    return_feature_names=False,
) -> pd.Series:
    """
    Represent a text-based Pandas Series using term frequency.

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    return_feature_names : bool, default=False
        If True, return a tuple (*term_frequency_series*, *feature_names*)


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s)
    0    [0.25, 0.25, 0.0]
    1    [0.25, 0.0, 0.25]
    dtype: object
    
    To return the *feature_names*:
    
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s, return_feature_names=True)
    (0    [0.25, 0.25, 0.0]
    1    [0.25, 0.0, 0.25]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )

    cv_fit_transform = tf.fit_transform(s).toarray()
    total_count = np.sum(cv_fit_transform)
    s = pd.Series(np.divide(cv_fit_transform, total_count).tolist(),
                  index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
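The 0.25 values in the doctest come from normalizing every count by the total number of tokens across the whole corpus (here 4), which is what the np.divide call above does. A quick check of that arithmetic:

import numpy as np

# Count matrix for ["Sentence one", "Sentence two"] with the vocabulary
# ['Sentence', 'one', 'two'], as CountVectorizer would produce it.
counts = np.array([[1, 1, 0],
                   [1, 0, 1]])

total = counts.sum()     # 4 tokens over the whole corpus
print(counts / total)    # [[0.25 0.25 0.  ]
                         #  [0.25 0.   0.25]]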
Example #13
 def test_tokenize(self):
     s = pd.Series("text to tokenize")
     s_true = pd.Series([["text", "to", "tokenize"]])
     self.assertEqual(preprocessing.tokenize(s), s_true)
Example #14
    ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text, )],
    ["remove_brackets", preprocessing.remove_brackets, (s_text, )],
    ["remove_html_tags", preprocessing.remove_html_tags, (s_text, )],
    ["tokenize", preprocessing.tokenize, (s_text, )],
    ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text, )],
    ["replace_urls", preprocessing.replace_urls, (s_text, "")],
    ["remove_urls", preprocessing.remove_urls, (s_text, )],
    ["replace_tags", preprocessing.replace_tags, (s_text, "")],
    ["remove_tags", preprocessing.remove_tags, (s_text, )],
]

test_cases_representation = [
    [
        "term_frequency",
        representation.term_frequency,
        (preprocessing.tokenize(s_text), ),
    ],
    ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text), )],
    ["pca", representation.pca, (s_numeric_lists, 0)],
    ["nmf", representation.nmf, (s_numeric_lists, )],
    ["tsne", representation.tsne, (s_numeric_lists, )],
    ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
    ["dbscan", representation.dbscan, (s_numeric_lists, )],
    ["meanshift", representation.meanshift, (s_numeric_lists, )],
]

test_cases_visualization = []

test_cases = (test_cases_nlp + test_cases_preprocessing +
              test_cases_representation + test_cases_visualization)
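The lists above only declare the cases; the harness that consumes test_cases is not shown in this example. A minimal, hypothetical consumer could look like the following, assuming each entry is a [name, function, args] triple:

import pandas as pd

# Hypothetical runner; run_cases is an illustrative name, not part of the
# texthero test suite shown here.
def run_cases(cases):
    for name, func, args in cases:
        result = func(*args)
        print(f"{name}: returned {type(result).__name__}")

# run_cases(test_cases_representation)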
Example #15
def count(
    s: Union[TokenSeries, TextSeries],
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    binary=False,
) -> pd.DataFrame:
    """
    Represent a text-based Pandas Series using count.

    Rows of the returned DataFrame represent documents whereas
    columns are terms. The value in the document-term cell is
    the number of times the term appears in that document. The output is sparse.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before count is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default=None
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, optional, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, optional, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    binary : bool, optional, default=False
        If True, all non zero counts are set to 1.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.count(s) # doctest: +SKIP
       Sentence  one  two
    0         1    1    0
    1         1    0    1
   
    See Also
    --------

    TODO add tutorial link
    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
        binary=binary,
    )

    tf_vectors_csr = tf.fit_transform(s)

    return pd.DataFrame.sparse.from_spmatrix(tf_vectors_csr, s.index,
                                             tf.get_feature_names())
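Because the returned document-term DataFrame is backed by sparse columns, it can be inspected or densified through the standard pandas sparse accessor. A small usage sketch, assuming hero.count behaves as in the doctest above:

import texthero as hero
import pandas as pd

s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
df = hero.count(s)

print(df.sparse.density)     # fraction of explicitly stored values
print(df.sparse.to_dense())  # plain dense document-term DataFrame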
Example #16
 def test_idf_single_document(self):
     s = pd.Series("a")
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1]])
     self.assertEqual(representation.tfidf(s), s_true)
Example #17
def word2vec(
    s: pd.Series,
    size=300,
    algorithm: str = "cbow",
    num_epochs: int = 30,
    min_count: int = 5,
    window_size: int = 5,
    alpha: float = 0.03,
    max_vocab_size: int = None,
    downsample_freq: float = 0.0001,
    min_alpha: float = 0.0001,
    negative_samples: int = 5,
    workers: int = None,
    seed: int = None,
):
    """Perform Word2vec on the given Pandas Series
    
    Return a Pandas Dataframe of shape (vocabulary_size, vectors_size).

    Word2vec is a two-layer neural network used to map each word to its vector representation. In general, its input is a text corpus and its output is a set of vectors: feature vectors that represent words in that corpus. In this specific case, the input is a Pandas Series containing in each cell a tokenized text and the output is a Pandas DataFrame where indexes are words and columns are the vector dimensions.

    Under the hood, this function makes use of the Gensim Word2Vec module.

    The input Series should already be tokenized. If not, it will
    be tokenized before word2vec is applied.

    
    Parameters
    ----------
    s : Pandas Series
    size : int, optional, default is 300
        Size of the returned vectors. A good value is anything between 100 and 300. For very large datasets, a smaller value requires less training time.
    algorithm : str, optional, default is "cbow".
        The training algorithm. It can be either "skipgram" or "cbow". 
        With CBOW (continuous bag-of-words) the model predicts the current word from a window of surrounding context words. 
        In the continuous skip-gram mode, the model uses the current word to predict the surrounding window of context words.
        According to the authors, CBOW is faster while skip-gram is slower but does a better job for infrequent words.
    num_epochs : int, optional, default is 30
        Number of epochs to train the model.
    min_count : int, optional, default is 5
        Keep only words with a frequency equal or higher than min_count.
    window_size : int, optional, default is 5
        Surrounding window size of context words.
    alpha : float, optional, default is 0.03
        Initial learning rate
    max_vocab_size : int, optional, default to None
        Maximum number of words to keep. This corresponds to the length of the returned DataFrame. 
    downsample_freq : float, optional, default to 0.0001 (10^-4)
        Threshold frequency to downsample very frequent words. The result is similar to removing stop-words. The random removal of tokens happens before word2vec is trained, reducing the distance between words.
    min_alpha : float, default to 0.0001 (10^-4)
        The learning rate will drop linearly to min_alpha during training.
    negative_samples : int, optional, 5 by default
        Number of negative samples to use. Negative sampling addresses
        the problem of avoiding updating all weights at each epoch. It does so by selecting and modifying during each epoch only a small percentage of the total weights.

        The authors of the paper suggest setting negative sampling to 5-20 words for smaller datasets, and 2-5 words for large datasets.
    workers : int, optional, None by default.
        For improved performance, use all available CPU cores by default. When set, use that number of workers instead.
    seed : int, optional, None by default.
        Seed for the random number generator. All vectors are initialized randomly using a hash function of the concatenation of the word itself and str(seed). Important: for a fully deterministically-reproducible run, you must set the model to run on a single worker thread (workers=1).

    See Also
    --------
    `Word2Vec Tutorial - The Skip-Gram Model <http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/>`_ and `Word2Vec Tutorial Part 2 - Negative Sampling <http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/>`_ for two great tutorials on Word2Vec

    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    if algorithm == "cbow":
        sg = 0
    elif algorithm == "skipgram":
        sg = 1
    else:
        raise ValueError("algorithm must be either 'cbow' or 'skipgram'")

    w2v_model = Word2Vec(
        size=size,
        min_count=min_count,
        window=window_size,
        alpha=alpha,
        max_vocab_size=max_vocab_size,
        sample=downsample_freq,
        seed=seed,
        min_alpha=min_alpha,
        negative=negative_samples,
        sg=sg,
    )

    w2v_model.build_vocab(s.values, progress_per=10000)

    if len(w2v_model.wv.vocab.keys()) == 0:
        print("Vocabulary ...")

    w2v_model.train(
        s.values,
        total_examples=w2v_model.corpus_count,
        epochs=num_epochs,
        report_delay=1,
    )

    all_vocabulary = sorted(list(set(w2v_model.wv.vocab.keys())))

    return pd.DataFrame(data=w2v_model.wv[all_vocabulary],
                        index=all_vocabulary)
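A short usage sketch mirroring the test_word2vec test earlier in this listing; with the default size of 300 and min_count=1, the corpus below yields a (7, 300) DataFrame indexed by the seven vocabulary words:

import pandas as pd
from texthero import preprocessing, representation

s = pd.Series(["today is a beautiful day",
               "today is not that beautiful"])
s = preprocessing.tokenize(s)

df_embeddings = representation.word2vec(s, min_count=1, seed=1)

print(df_embeddings.shape)          # (7, 300)
print(df_embeddings.loc["today"])   # the 300-dimensional vector for "today"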
Example #18
def tfidf(
    s: pd.Series,
    max_features=None,
    min_df=1,
    max_df=1.0,
) -> pd.Series:
    """
    Represent a text-based Pandas Series using TF-IDF.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of two
    parts:

    The *term frequency (tf)* tells us how frequently a term is present in a
    document, so tf(document d, term t) = number of times t appears in d.

    The *inverse document frequency (idf)* measures how _important_ or
    _characteristic_ a term is among the whole corpus (i.e. among all
    documents). Thus, idf(term t) = log((1 + number of documents) /
    (1 + number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of 
    tfidf <https://scikit-learn.org/stable/modules/generated/sklearn.feature_
    extraction.text.TfidfVectorizer.html>`, this function does *not* normalize
    the output in any way, so the result is exactly what you get applying the
    formula described above.

    Return a Document Representation Series with the
    tfidf of every word in the document.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    If working with big pandas Series, you might want to limit
    the number of features through the max_features parameter.

    Use :meth:`hero.representation.flatten` on the output to get
    a standard Pandas Series with the document vectors
    in every cell.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        If not None, only the max_features most frequent tokens are used.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        This argument basically permits removing corpus-specific stop words.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s)
    0  Bye     1.000000
       Hi      1.405465
    1  Bye     2.000000
       Test    1.405465
    dtype: Sparse[float64, nan]

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    Document Representation Series: TODO add tutorial link
    """

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    # Result from sklearn is in Compressed Sparse Row format.
    # Pandas Sparse Series can only be initialized from Coordinate format.
    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    # Map word index to word name and keep original index of documents.
    feature_names = tfidf.get_feature_names()
    s_out.index = s_out.index.map(lambda x:
                                  (s.index[x[0]], feature_names[x[1]]))

    return s_out
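The numbers in the doctest above can be reproduced directly from the formula in the docstring; with two documents, the smoothed idf uses (1 + 2) in both numerator and denominator:

import math

def idf(n_docs, n_docs_with_term):
    # Smoothed idf exactly as described in the docstring above.
    return math.log((1 + n_docs) / (1 + n_docs_with_term)) + 1

# Corpus: ["Hi Bye", "Test Bye Bye"] -> 2 documents.
print(1 * idf(2, 2))  # tfidf(doc 0, "Bye")  = 1 * (log(3/3) + 1) = 1.0
print(1 * idf(2, 1))  # tfidf(doc 0, "Hi")   = 1 * (log(3/2) + 1) ~ 1.405465
print(2 * idf(2, 2))  # tfidf(doc 1, "Bye")  = 2 * (log(3/3) + 1) = 2.0
print(1 * idf(2, 1))  # tfidf(doc 1, "Test") = 1 * (log(3/2) + 1) ~ 1.405465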
Example #19
def term_frequency(
    s: pd.Series,
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
) -> pd.Series:
    """
    Represent a text-based Pandas Series using term frequency.

    Return a Document Representation Series with the
    term frequencies of the terms for every
    document.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Use :meth:`hero.representation.flatten` on the output to get
    a standard Pandas Series with the document vectors
    in every cell.


    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s)
    0  Sentence    0.2
       hey         0.2
       one         0.2
    1  Sentence    0.2
       two         0.2
    dtype: Sparse[float64, nan]

    See Also
    --------
    Document Representation Series: TODO add tutorial link
    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )

    tf_vectors_csr = tf.fit_transform(s)
    tf_vectors_coo = coo_matrix(tf_vectors_csr)

    total_count_coo = np.sum(tf_vectors_coo)
    frequency_coo = np.divide(tf_vectors_coo, total_count_coo)

    s_out = pd.Series.sparse.from_coo(frequency_coo)

    features_names = tf.get_feature_names()

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x:
                                  (s.index[x[0]], features_names[x[1]]))

    return s_out
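As in the earlier term_frequency example, the 0.2 values in the doctest follow from dividing each count by the total number of tokens in the whole corpus (3 + 2 = 5):

import numpy as np

# Count matrix for ["Sentence one hey", "Sentence two"] with the vocabulary
# ['Sentence', 'hey', 'one', 'two'].
counts = np.array([[1, 1, 1, 0],
                   [1, 0, 0, 1]])

print(counts / counts.sum())  # every non-zero count becomes 1/5 = 0.2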
Example #20
def tfidf(s: pd.Series,
          max_features=None,
          min_df=1,
          return_feature_names=False):
    """
    Represent a text-based Pandas Series using TF-IDF.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    min_df : int, optional. Default to 1.
        When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    return_feature_names : bool, default=False
        If True, return a tuple (*tfidf_series*, *feature_names*)


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s)
    0    [0.5797386715376657, 0.8148024746671689, 0.0]
    1    [0.5797386715376657, 0.0, 0.8148024746671689]
    dtype: object
    
    To return the *feature_names*:
    
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s, return_feature_names=True)
    (0    [0.5797386715376657, 0.8148024746671689, 0.0]
    1    [0.5797386715376657, 0.0, 0.8148024746671689]
    dtype: object, ['Sentence', 'one', 'two'])
    """

    # TODO. In docstring show formula to compute TF-IDF and also avoid using sk-learn if possible.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        lowercase=False,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    s = pd.Series(tfidf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tfidf.get_feature_names())
    else:
        return s
Example #21
def tfidf(
    s: pd.Series,
    max_features=None,
    min_df=1,
    max_df=1.0,
) -> pd.DataFrame:
    """
    Represent a text-based Pandas Series using TF-IDF.

    Rows of the returned DataFrame represent documents whereas columns are
    terms. The value in the cell document-term is the tfidf-value of the
    term in this document. The output is sparse.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of
    two parts:

    The *term frequency (tf)* tells us how frequently a term is present
    in a document, so tf(document d, term t) = number of times t appears
    in d.

    The *inverse document frequency (idf)* measures how _important_ or
    _characteristic_ a term is among the whole corpus (i.e. among all
    documents). Thus, idf(term t) = log((1 + number of documents) /
    (1 + number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of tfidf
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_
    extraction.text.TfidfVectorizer.html>`, this function does *not* 
    normalize the output in any way, so the result is exactly what you 
    get applying the formula described above.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default=None
        If not None, only the max_features most frequent tokens are used.

    min_df : float in range [0.0, 1.0] or int, optional, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, 
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        This argument basically permits removing corpus-specific stop
        words. If float, the parameter represents a proportion of documents,
        integer absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s) # doctest: +SKIP                    
        Bye        Hi      Test
    0   1.0  1.405465  0.000000
    1   2.0  0.000000  1.405465

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    TODO add tutorial link
    """

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    return pd.DataFrame.sparse.from_spmatrix(tfidf_vectors_csr, s.index,
                                             tfidf.get_feature_names())
Example #22
 def test_tfidf_single_not_lowercase(self):
     s = pd.Series("ONE one")
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1.0, 1.0]])
     s_true.rename_axis("document", inplace=True)
     self.assertEqual(representation.tfidf(s), s_true)
Example #23
def count(
    s: pd.Series,
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    return_feature_names=False,
):
    """
    Represent a text-based Pandas Series using count.

    The input Series should already be tokenized. If not, it will
    be tokenized before count is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    min_df : int, optional, default to 1.
        When building the vocabulary, ignore terms that have a document 
        frequency (number of documents a term appears in) strictly lower than the given threshold.
    max_df : float in range [0.0, 1.0] or int, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents a term appears in) strictly higher than the given threshold. This argument basically permits removing corpus-specific stop words. When the argument is a float in [0.0, 1.0], the parameter represents a proportion of documents.

    return_feature_names : bool, default=False
        If True, return a tuple (*count_series*, *feature_names*)


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.count(s)
    0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object
    
    To return the *feature_names*:
    
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.count(s, return_feature_names=True)
    (0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )
    s = pd.Series(tf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
Example #24
 def test_tfidf_max_features(self):
     s = pd.Series("one one two")
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[2.0]])
     s_true.rename_axis("document", inplace=True)
     self.assertEqual(representation.tfidf(s, max_features=1), s_true)
Example #25
def tfidf(s: pd.Series,
          max_features=None,
          min_df=1,
          max_df=1.0,
          return_feature_names=False) -> pd.Series.sparse:
    """
    Represent a text-based Pandas Series using TF-IDF.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of two parts:

    The *term frequency (tf)* tells us how frequently a term is present in a document,
    so tf(document d, term t) = number of times t appears in d.

    The *inverse document frequency (idf)* measures how _important_ or _characteristic_
    a term is among the whole corpus (i.e. among all documents).
    Thus, idf(term t) = log((1 + number of documents) / (1 + number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of tfidf <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`,
    this function does *not* normalize the output in any way,
    so the result is exactly what you
    get applying the formula described above.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    If working with big pandas Series, you might want to limit
    the number of features through the max_features parameter.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency (number of documents they appear in) strictly 
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        This argument basically permits removing corpus-specific stop words.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.

    return_feature_names : bool, default=False
        If True, return a tuple (*tfidf_series*, *feature_names*)


    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s, return_feature_names=True)
    (document
    0    [1.0, 1.4054651081081644, 0.0]
    1    [2.0, 0.0, 1.4054651081081644]
    dtype: object, ['Bye', 'Hi', 'Test'])

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    """

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    # Result from sklearn is in Compressed Sparse Row format.
    # Pandas Sparse Series can only be initialized from Coordinate format.
    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    # Map word index to word name and keep original index of documents.
    feature_names = tfidf.get_feature_names()
    s_out.index = s_out.index.map(lambda x:
                                  (s.index[x[0]], feature_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)

    # NOTE: Currently: still convert to flat series instead of representation series.
    # Will change to return representation series directly in Version 2.
    s_out = representation_series_to_flat_series(s_out,
                                                 fill_missing_with=0.0,
                                                 index=s.index)

    if return_feature_names:
        return s_out, feature_names
    else:
        return s_out
Example #26
 def test_term_frequency_single_document(self):
     s = pd.Series("a b c c")
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 1, 2]])
     self.assertEqual(representation.term_frequency(s), s_true)
Example #27
 def test_tokenize_multirows(self):
     s = pd.Series(["first row", "second row"])
     s_true = pd.Series([["first", "row"], ["second", "row"]])
     self.assertEqual(preprocessing.tokenize(s), s_true)
Example #28
 def test_term_frequency_multiple_documents(self):
     s = pd.Series(["doc_one", "doc_two"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 1, 1, 0], [1, 1, 0, 1]])
     self.assertEqual(representation.term_frequency(s), s_true)
Example #29
 def test_tokenize_not_split_in_between_punctuation(self):
     s = pd.Series(["don't say hello-world"])
     s_true = pd.Series([["don't", "say", "hello-world"]])
     self.assertEqual(preprocessing.tokenize(s), s_true)
Example #30
 def test_count_multiple_documents(self):
     s = pd.Series(["doc_one", "doc_two"])
     s = preprocessing.tokenize(s)
     s_true = pd.Series([[1, 0], [0, 1]])
     self.assertEqual(representation.count(s), s_true)