def term_frequency(
    s: pd.Series, max_features: Optional[int] = None, return_feature_names=False
):
    """
    Represent a text-based Pandas Series using term_frequency.

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    return_feature_names : Boolean, False by default
        If True, return a tuple (*term_frequency_series*, *feature_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s)
    0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object

    To return the feature_names:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.term_frequency(s, return_feature_names=True)
    (0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        lowercase=False,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    s = pd.Series(tf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
def test_idf_single_not_lowercase(self):
    tfidf_single_smooth = 0.7071067811865475  # TODO

    s = pd.Series("ONE one")
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
    self.assertEqual(representation.tfidf(s), s_true)
def test_most_similar_simple(self):
    s = pd.Series(["one one one"])
    s = preprocessing.tokenize(s)
    df_embeddings = representation.word2vec(s, min_count=1, seed=1)

    to = "one"
    most_similar = representation.most_similar(df_embeddings, to)

    self.assertEqual(most_similar.shape, (1,))
def test_tfidf_formula(self):
    s = pd.Series(["Hi Bye", "Test Bye Bye"])
    s = preprocessing.tokenize(s)
    s_true_index = pd.MultiIndex.from_tuples(
        [(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")],
    )
    s_true = pd.Series(
        [_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index
    ).astype("Sparse")

    self.assertEqual(representation.tfidf(s), s_true)
def test_word2vec(self):
    s = pd.Series(["today is a beautiful day", "today is not that beautiful"])
    df_true = pd.DataFrame(
        [[0.0] * 300] * 7,
        index=["a", "beautiful", "day", "is", "not", "that", "today"],
    )

    s = preprocessing.tokenize(s)
    df_embedding = representation.word2vec(s, min_count=1, seed=1)

    self.assertEqual(type(df_embedding), pd.DataFrame)
    self.assertEqual(df_embedding.shape, df_true.shape)
def test_tfidf_formula(self):
    s = pd.Series(["Hi Bye", "Test Bye Bye"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series(
        [
            [
                1.0 * (math.log(3 / 3) + 1),
                1.0 * (math.log(3 / 2) + 1),
                0.0 * (math.log(3 / 2) + 1),
            ],
            [
                2.0 * (math.log(3 / 3) + 1),
                0.0 * (math.log(3 / 2) + 1),
                1.0 * (math.log(3 / 2) + 1),
            ],
        ]
    )
    s_true.rename_axis("document", inplace=True)

    self.assertEqual(representation.tfidf(s), s_true)
def test_tfidf_single_document(self):
    s = pd.Series("a", index=["yo"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1]], index=["yo"])
    s_true.rename_axis("document", inplace=True)
    self.assertEqual(representation.tfidf(s), s_true)
def test_term_frequency_not_lowercase(self):
    s = pd.Series(["one ONE"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 1]])
    self.assertEqual(representation.term_frequency(s), s_true)
def test_term_frequency_punctuation_are_kept(self):
    s = pd.Series(["one !"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 1]])
    self.assertEqual(representation.term_frequency(s), s_true)
def test_tokenize_split_punctuation(self):
    s = pd.Series(["ready. set, go!"])
    s_true = pd.Series([["ready", ".", "set", ",", "go", "!"]])
    self.assertEqual(preprocessing.tokenize(s), s_true)
def test_count_not_lowercase(self):
    s = pd.Series(["one ONE"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 1]])
    self.assertEqual(representation.count(s), s_true)
def term_frequency(
    s: pd.Series,
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    return_feature_names=False,
) -> pd.Series:
    """
    Represent a text-based Pandas Series using term frequency.

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    return_feature_names : Boolean, False by default
        If True, return a tuple (*term_frequency_series*, *feature_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s)
    0    [0.25, 0.25, 0.0]
    1    [0.25, 0.0, 0.25]
    dtype: object

    To return the feature_names:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s, return_feature_names=True)
    (0    [0.25, 0.25, 0.0]
    1    [0.25, 0.0, 0.25]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )

    cv_fit_transform = tf.fit_transform(s).toarray()
    total_count = np.sum(cv_fit_transform)
    s = pd.Series(np.divide(cv_fit_transform, total_count).tolist(), index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
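# --- Usage sketch (not part of texthero) -----------------------------------
# A minimal check of the normalization performed by term_frequency above:
# every raw count is divided by the *corpus-wide* total of counted tokens,
# not by each document's own length. The expected values come from the
# docstring example; treat this snippet as an illustrative assumption rather
# than additional API documentation.
import numpy as np
import pandas as pd
import texthero as hero

s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)

# Raw counts for the vocabulary ['Sentence', 'one', 'two'] are
# [[1, 1, 0], [1, 0, 1]]; the corpus-wide total is 4, so each entry is 1/4.
expected = [[0.25, 0.25, 0.0], [0.25, 0.0, 0.25]]
result = hero.term_frequency(s)

assert np.allclose(np.array(result.tolist()), np.array(expected))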
def test_tokenize(self):
    s = pd.Series("text to tokenize")
    s_true = pd.Series([["text", "to", "tokenize"]])
    self.assertEqual(preprocessing.tokenize(s), s_true)
["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text, )], ["remove_brackets", preprocessing.remove_brackets, (s_text, )], ["remove_html_tags", preprocessing.remove_html_tags, (s_text, )], ["tokenize", preprocessing.tokenize, (s_text, )], ["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text, )], ["replace_urls", preprocessing.replace_urls, (s_text, "")], ["remove_urls", preprocessing.remove_urls, (s_text, )], ["replace_tags", preprocessing.replace_tags, (s_text, "")], ["remove_tags", preprocessing.remove_tags, (s_text, )], ] test_cases_representation = [ [ "term_frequency", representation.term_frequency, (preprocessing.tokenize(s_text), ), ], ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text), )], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists, )], ["tsne", representation.tsne, (s_numeric_lists, )], ["kmeans", representation.kmeans, (s_numeric_lists, 1)], ["dbscan", representation.dbscan, (s_numeric_lists, )], ["meanshift", representation.meanshift, (s_numeric_lists, )], ] test_cases_visualization = [] test_cases = (test_cases_nlp + test_cases_preprocessing + test_cases_representation + test_cases_visualization)
def count(
    s: Union[TokenSeries, TextSeries],
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    binary=False,
) -> pd.DataFrame:
    """
    Represent a text-based Pandas Series using count.

    Rows of the returned DataFrame represent documents whereas
    columns are terms. The value in the cell document-term is
    the number of times the term appears in that document. The output
    is sparse.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before count is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default=None
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, optional, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, optional, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    binary : bool, optional, default=False
        If True, all non-zero counts are set to 1.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.count(s) # doctest: +SKIP
       Sentence  one  two
    0         1    1    0
    1         1    0    1

    See Also
    --------
    TODO add tutorial link
    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
        binary=binary,
    )

    tf_vectors_csr = tf.fit_transform(s)

    return pd.DataFrame.sparse.from_spmatrix(
        tf_vectors_csr, s.index, tf.get_feature_names()
    )
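# --- Usage sketch (not part of texthero) -----------------------------------
# A small illustration of the sparse DataFrame returned by count above.
# Column names and values follow the docstring example; the density check is
# an assumption about the pandas sparse accessor, not part of the texthero API.
import pandas as pd
import texthero as hero

s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
df = hero.count(s)

print(df)                 # columns 'Sentence', 'one', 'two' with raw counts
print(df.sparse.density)  # fraction of explicitly stored (non-zero) values

# binary=True records only presence/absence instead of counts.
print(hero.count(s, binary=True))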
def test_idf_single_document(self):
    s = pd.Series("a")
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1]])
    self.assertEqual(representation.tfidf(s), s_true)
def word2vec(
    s: pd.Series,
    size=300,
    algorithm: str = "cbow",
    num_epochs: int = 30,
    min_count: int = 5,
    window_size: int = 5,
    alpha: float = 0.03,
    max_vocab_size: int = None,
    downsample_freq: float = 0.0001,
    min_alpha: float = 0.0001,
    negative_samples: int = 5,
    workers: int = None,
    seed: int = None,
):
    """Perform Word2vec on the given Pandas Series.

    Return a Pandas DataFrame of shape (vocabulary_size, vectors_size).

    Word2vec is a two-layer neural network used to map each word to its
    vector representation. In general, its input is a text corpus and its
    output is a set of vectors: feature vectors that represent words in that
    corpus.

    In this specific case, the input is a Pandas Series containing in each
    cell a tokenized text and the output is a Pandas DataFrame where indexes
    are words and columns are the vector dimensions.

    Under the hood, this function makes use of Gensim's Word2Vec module.

    The input Series should already be tokenized. If not, it will
    be tokenized before word2vec is applied.

    Parameters
    ----------
    s : Pandas Series

    size : int, optional, default is 300
        Size of the returned vector. A good value is anything between
        100-300. For very large datasets, a smaller value requires less
        training time.

    algorithm : str, optional, default is "cbow".
        The training algorithm. It can be either "skipgram" or "cbow".
        With CBOW (continuous bag-of-words) the model predicts the current
        word from a window of surrounding context words. In the continuous
        skip-gram mode, the model uses the current word to predict the
        surrounding window of context words. According to the authors, CBOW
        is faster while skip-gram is slower but does a better job for
        infrequent words.

    num_epochs : int, optional, default is 30
        Number of epochs to train the model.

    min_count : int, optional, default is 5
        Keep only words with a frequency equal to or higher than min_count.

    window_size : int, optional, default is 5
        Surrounding window size of context words.

    alpha : float, optional, default is 0.03
        Initial learning rate.

    max_vocab_size : int, optional, default to None
        Maximum number of words to keep. This corresponds to the length of
        the returned DataFrame.

    downsample_freq : float, optional, default to 0.0001 (10^-4)
        Threshold frequency to downsample very frequent words. The result is
        similar to removing stop-words. The random removal of tokens is
        executed before word2vec is executed, reducing the distance between
        words.

    min_alpha : float, default to 0.0001 (10^-4)
        The learning rate will drop linearly to min_alpha during training.

    negative_samples : int, optional, 5 by default
        Number of negative samples to use. Negative sampling addresses the
        problem of avoiding updating all weights at each epoch. It does so
        by selecting and modifying during each epoch only a small percentage
        of the total weights. The authors of the paper suggest setting
        negative sampling to 5-20 words for smaller datasets, and 2-5 words
        for large datasets.

    workers : int, optional, None by default.
        For improved performance, by default use all available workers.
        When set, use that number of workers instead.

    seed : int, optional, None by default.
        Seed for the random number generator. All vectors are initialized
        randomly using a hash function formed by the concatenation of the
        word itself and str(seed). Important: for a fully
        deterministically-reproducible run, you must set the model to run on
        a single worker thread (workers=1).

    See Also
    --------
    `Word2Vec Tutorial - The Skip-Gram Model
    <http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/>`_
    and `Word2Vec Tutorial Part 2 - Negative Sampling
    <http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/>`_
    for two great tutorials on Word2Vec.

    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    if algorithm == "cbow":
        sg = 0
    elif algorithm == "skipgram":
        sg = 1
    else:
        raise ValueError("algorithm must be either 'cbow' or 'skipgram'")

    w2v_model = Word2Vec(
        size=size,
        min_count=min_count,
        window=window_size,
        alpha=alpha,
        max_vocab_size=max_vocab_size,
        sample=downsample_freq,
        seed=seed,
        min_alpha=min_alpha,
        negative=negative_samples,
        sg=sg,
    )

    w2v_model.build_vocab(s.values, progress_per=10000)

    if len(w2v_model.wv.vocab.keys()) == 0:
        print("Vocabulary ...")

    w2v_model.train(
        s.values,
        total_examples=w2v_model.corpus_count,
        epochs=num_epochs,
        report_delay=1,
    )

    all_vocabulary = sorted(list(set(w2v_model.wv.vocab.keys())))

    return pd.DataFrame(data=w2v_model.wv[all_vocabulary], index=all_vocabulary)
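# --- Usage sketch (not part of texthero) -----------------------------------
# A minimal end-to-end run of the word2vec function above on a tiny corpus,
# assuming Gensim is installed. min_count=1 keeps every token; workers=1 plus
# a fixed seed is what the docstring recommends for a reproducible run.
import pandas as pd
from texthero import preprocessing, representation

s = pd.Series(["today is a beautiful day", "today is not that beautiful"])
s = preprocessing.tokenize(s)

df_embeddings = representation.word2vec(s, size=50, min_count=1, seed=1, workers=1)

print(df_embeddings.shape)         # (vocabulary_size, 50)
print(df_embeddings.loc["today"])  # the 50-dimensional vector for "today"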
def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series:
    """
    Represent a text-based Pandas Series using TF-IDF.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of
    two parts:

    The *term frequency (tf)* tells us how frequently a term is present in a
    document, so tf(document d, term t) = number of times t appears in d.

    The *inverse document frequency (idf)* measures how _important_ or
    _characteristic_ a term is among the whole corpus (i.e. among all
    documents). Thus, idf(term t) = log((1 + number of documents) / (1 +
    number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of tfidf
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_
    extraction.text.TfidfVectorizer.html>`, this function does *not*
    normalize the output in any way, so the result is exactly what you get
    applying the formula described above.

    Return a Document Representation Series with the
    tfidf of every word in the document.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    If working with big pandas Series, you might want to limit
    the number of features through the max_features parameter.

    Use :meth:`hero.representation.flatten` on the output to get
    a standard Pandas Series with the document vectors
    in every cell.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        If not None, only the max_features most frequent tokens are used.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold. This argument
        basically permits removing corpus-specific stop words.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s)
    0  Bye     1.000000
       Hi      1.405465
    1  Bye     2.000000
       Test    1.405465
    dtype: Sparse[float64, nan]

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    Document Representation Series: TODO add tutorial link
    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    # Result from sklearn is in Compressed Sparse Row format.
    # Pandas Sparse Series can only be initialized from Coordinate format.
    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    # Map word index to word name and keep original index of documents.
    feature_names = tfidf.get_feature_names()
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]]))

    return s_out
def term_frequency(
    s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0,
) -> pd.Series:
    """
    Represent a text-based Pandas Series using term frequency.

    Return a Document Representation Series with the
    term frequencies of the terms for every document.
    TODO add tutorial link

    The input Series should already be tokenized. If not, it will
    be tokenized before term_frequency is calculated.

    Use :meth:`hero.representation.flatten` on the output to get
    a standard Pandas Series with the document vectors
    in every cell.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
    >>> hero.term_frequency(s)
    0  Sentence    0.2
       hey         0.2
       one         0.2
    1  Sentence    0.2
       two         0.2
    dtype: Sparse[float64, nan]

    See Also
    --------
    Document Representation Series: TODO add tutorial link
    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )

    tf_vectors_csr = tf.fit_transform(s)
    tf_vectors_coo = coo_matrix(tf_vectors_csr)

    total_count_coo = np.sum(tf_vectors_coo)
    frequency_coo = np.divide(tf_vectors_coo, total_count_coo)

    s_out = pd.Series.sparse.from_coo(frequency_coo)

    features_names = tf.get_feature_names()

    # Map word index to word name
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))

    return s_out
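# --- Usage sketch (not part of texthero) -----------------------------------
# The term_frequency above returns a Document Representation Series: a sparse
# Series whose MultiIndex pairs (document index, term). The values below come
# from the docstring example ("Sentence one hey" / "Sentence two" contain 5
# tokens in total, so every term frequency is 1/5 = 0.2).
import pandas as pd
import texthero as hero

s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
s_tf = hero.term_frequency(s)

print(s_tf)                       # sparse Series indexed by (document, term)
print(s_tf.loc[(0, "Sentence")])  # 0.2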
def tfidf(s: pd.Series, max_features=None, min_df=1, return_feature_names=False):
    """
    Represent a text-based Pandas Series using TF-IDF.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    min_df : int, optional. Default to 1.
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold.
    return_feature_names : Boolean. Default to False.
        If True, return a tuple (*tfidf_series*, *feature_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s)
    0    [0.5797386715376657, 0.8148024746671689, 0.0]
    1    [0.5797386715376657, 0.0, 0.8148024746671689]
    dtype: object

    To return the *feature_names*:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.tfidf(s, return_feature_names=True)
    (0    [0.5797386715376657, 0.8148024746671689, 0.0]
    1    [0.5797386715376657, 0.0, 0.8148024746671689]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # TODO. In docstring show formula to compute TF-IDF and also avoid using
    # sklearn if possible.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        print(_not_tokenized_warning_message)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        lowercase=False,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
    )
    s = pd.Series(tfidf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tfidf.get_feature_names())
    else:
        return s
def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame:
    """
    Represent a text-based Pandas Series using TF-IDF.

    Rows of the returned DataFrame represent documents whereas columns are
    terms. The value in the cell document-term is the tfidf-value of the
    term in this document. The output is sparse.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of
    two parts:

    The *term frequency (tf)* tells us how frequently a term is present in a
    document, so tf(document d, term t) = number of times t appears in d.

    The *inverse document frequency (idf)* measures how _important_ or
    _characteristic_ a term is among the whole corpus (i.e. among all
    documents). Thus, idf(term t) = log((1 + number of documents) / (1 +
    number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of tfidf
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_
    extraction.text.TfidfVectorizer.html>`, this function does *not*
    normalize the output in any way, so the result is exactly what you get
    applying the formula described above.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default=None
        If not None, only the max_features most frequent tokens are used.

    min_df : float in range [0.0, 1.0] or int, optional, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold. This argument
        basically permits removing corpus-specific stop words.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s) # doctest: +SKIP
       Bye        Hi      Test
    0  1.0  1.405465  0.000000
    1  2.0  0.000000  1.405465

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    TODO add tutorial link
    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    return pd.DataFrame.sparse.from_spmatrix(
        tfidf_vectors_csr, s.index, tfidf.get_feature_names()
    )
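# --- Worked example (not part of texthero) ----------------------------------
# Recomputing the docstring example by hand with the formula stated above:
# tf-idf(d, t) = tf(d, t) * (log((1 + n_documents) / (1 + df(t))) + 1).
# This is only a sanity check of the formula, not an alternative API.
import math

n_documents = 2  # ["Hi Bye", "Test Bye Bye"]


def idf(document_frequency):
    return math.log((1 + n_documents) / (1 + document_frequency)) + 1


# "Bye" appears in both documents; "Hi" and "Test" each appear in one.
print(1 * idf(2))  # tf-idf of "Bye"  in document 0 -> 1.0
print(1 * idf(1))  # tf-idf of "Hi"   in document 0 -> 1.405465...
print(2 * idf(2))  # tf-idf of "Bye"  in document 1 -> 2.0
print(1 * idf(1))  # tf-idf of "Test" in document 1 -> 1.405465...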
def test_tfidf_single_not_lowercase(self):
    s = pd.Series("ONE one")
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1.0, 1.0]])
    s_true.rename_axis("document", inplace=True)
    self.assertEqual(representation.tfidf(s), s_true)
def count(
    s: pd.Series,
    max_features: Optional[int] = None,
    min_df=1,
    max_df=1.0,
    return_feature_names=False,
):
    """
    Represent a text-based Pandas Series using count.

    The input Series should already be tokenized. If not, it will
    be tokenized before count is calculated.

    Parameters
    ----------
    s : Pandas Series
    max_features : int, optional
        Maximum number of features to keep.
    min_df : int, optional, default to 1.
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents a term appears in) strictly lower
        than the given threshold.
    max_df : int or float, optional, default to 1.0
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents a term appears in) strictly higher
        than the given threshold. This argument basically permits removing
        corpus-specific stop words. When the argument is a float in
        [0.0, 1.0], the parameter represents a proportion of documents.
    return_feature_names : Boolean, False by default
        If True, return a tuple (*count_series*, *feature_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.count(s)
    0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object

    To return the feature_names:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Sentence one", "Sentence two"])
    >>> s = hero.tokenize(s)
    >>> hero.count(s, return_feature_names=True)
    (0    [1, 1, 0]
    1    [1, 0, 1]
    dtype: object, ['Sentence', 'one', 'two'])

    """
    # TODO. Can be rewritten without sklearn.

    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tf = CountVectorizer(
        max_features=max_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        min_df=min_df,
        max_df=max_df,
    )
    s = pd.Series(tf.fit_transform(s).toarray().tolist(), index=s.index)

    if return_feature_names:
        return (s, tf.get_feature_names())
    else:
        return s
def test_tfidf_max_features(self):
    s = pd.Series("one one two")
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[2.0]])
    s_true.rename_axis("document", inplace=True)
    self.assertEqual(representation.tfidf(s, max_features=1), s_true)
def tfidf(
    s: pd.Series, max_features=None, min_df=1, max_df=1.0, return_feature_names=False
) -> pd.Series.sparse:
    """
    Represent a text-based Pandas Series using TF-IDF.

    *Term Frequency - Inverse Document Frequency (TF-IDF)* is a formula to
    calculate the _relative importance_ of the words in a document, taking
    into account the words' occurrences in other documents. It consists of
    two parts:

    The *term frequency (tf)* tells us how frequently a term is present in a
    document, so tf(document d, term t) = number of times t appears in d.

    The *inverse document frequency (idf)* measures how _important_ or
    _characteristic_ a term is among the whole corpus (i.e. among all
    documents). Thus, idf(term t) = log((1 + number of documents) / (1 +
    number of documents where t is present)) + 1.

    Finally, tf-idf(document d, term t) = tf(d, t) * idf(t).

    Different from the `sklearn-implementation of tfidf
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`,
    this function does *not* normalize the output in any way, so the result
    is exactly what you get applying the formula described above.

    The input Series should already be tokenized. If not, it will
    be tokenized before tfidf is calculated.

    If working with big pandas Series, you might want to limit
    the number of features through the max_features parameter.

    Parameters
    ----------
    s : Pandas Series (tokenized)

    max_features : int, optional, default to None.
        Maximum number of features to keep. Will keep all features if set to
        None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary, ignore terms that have a document
        frequency (number of documents they appear in) strictly lower than
        the given threshold.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore terms that have a document frequency (number of documents they
        appear in) strictly higher than the given threshold. This argument
        basically permits removing corpus-specific stop words.
        If float, the parameter represents a proportion of documents,
        integer absolute counts.

    return_feature_names : Boolean, False by default
        If True, return a tuple (*tfidf_series*, *feature_names*)

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
    >>> hero.tfidf(s, return_feature_names=True)
    (document
    0    [1.0, 1.4054651081081644, 0.0]
    1    [2.0, 0.0, 1.4054651081081644]
    dtype: object, ['Bye', 'Hi', 'Test'])

    See Also
    --------
    `TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_

    """
    # Check if input is tokenized. Else, print warning and tokenize.
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = preprocessing.tokenize(s)

    tfidf = TfidfVectorizer(
        use_idf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        norm=None,  # Disable l1/l2 normalization.
    )

    tfidf_vectors_csr = tfidf.fit_transform(s)

    # Result from sklearn is in Compressed Sparse Row format.
    # Pandas Sparse Series can only be initialized from Coordinate format.
    tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
    s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

    # Map word index to word name and keep original index of documents.
    feature_names = tfidf.get_feature_names()
    s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]]))

    s_out.rename_axis(["document", "word"], inplace=True)

    # NOTE: Currently: still convert to flat series instead of representation
    # series. Will change to return representation series directly in
    # Version 2.
    s_out = representation_series_to_flat_series(
        s_out, fill_missing_with=0.0, index=s.index
    )

    if return_feature_names:
        return s_out, feature_names
    else:
        return s_out
def test_term_frequency_single_document(self):
    s = pd.Series("a b c c")
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 1, 2]])
    self.assertEqual(representation.term_frequency(s), s_true)
def test_tokenize_multirows(self):
    s = pd.Series(["first row", "second row"])
    s_true = pd.Series([["first", "row"], ["second", "row"]])
    self.assertEqual(preprocessing.tokenize(s), s_true)
def test_term_frequency_multiple_documents(self):
    s = pd.Series(["doc_one", "doc_two"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 1, 1, 0], [1, 1, 0, 1]])
    self.assertEqual(representation.term_frequency(s), s_true)
def test_tokenize_not_split_in_between_punctuation(self):
    s = pd.Series(["don't say hello-world"])
    s_true = pd.Series([["don't", "say", "hello-world"]])
    self.assertEqual(preprocessing.tokenize(s), s_true)
def test_count_multiple_documents(self):
    s = pd.Series(["doc_one", "doc_two"])
    s = preprocessing.tokenize(s)
    s_true = pd.Series([[1, 0], [0, 1]])
    self.assertEqual(representation.count(s), s_true)