def phrases(
    s: TokenSeries, min_count: int = 5, threshold: int = 10, symbol: str = "_"
) -> TokenSeries:
    r"""Group collocation words together.

    Given a Pandas Series of tokenized strings, group together bigrams in
    which each token has a term frequency of at least `min_count` and whose
    score, computed with the formula below, exceeds `threshold`:

    :math:`\frac{(bigram\_a\_b\_count - min\_count) \cdot len\_vocab}{word\_a\_count \cdot word\_b\_count}`

    Parameters
    ----------
    s : :class:`texthero._types.TokenSeries`

    min_count : int, optional, default=5
        Ignore tokens with a frequency lower than this.

    threshold : int, optional, default=10
        Ignore bigrams with a score lower than this threshold.

    symbol : str, optional, default="_"
        Character used to join collocation words.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series([['New', 'York', 'is', 'a', 'beautiful', 'city'],
    ...                ['Look', ':', 'New', 'York', '!']])
    >>> hero.phrases(s, min_count=1, threshold=1)
    0    [New_York, is, a, beautiful, city]
    1                [Look, :, New_York, !]
    dtype: object

    References
    ----------
    `Mikolov et al.: "Distributed Representations of Words and Phrases and
    their Compositionality" <https://arxiv.org/abs/1310.4546>`_
    """
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = tokenize(s)

    delimiter = symbol.encode("utf-8")
    phrases = PhrasesTransformer(
        min_count=min_count, threshold=threshold, delimiter=delimiter
    )
    return pd.Series(phrases.fit_transform(s.values), index=s.index)
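# A minimal usage sketch for phrases() above, assuming texthero and pandas
# are installed and that the data below is purely illustrative. It shows the
# `symbol` parameter, which is passed to gensim's PhrasesTransformer as the
# token delimiter.
import pandas as pd
import texthero as hero

s = pd.Series(
    [["New", "York", "is", "a", "beautiful", "city"],
     ["Look", ":", "New", "York", "!"]]
)
# With symbol="~", the detected collocation is emitted as "New~York"
# instead of the default "New_York".
print(hero.phrases(s, min_count=1, threshold=1, symbol="~"))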
def tokenize_with_phrases(
    s: pd.Series, min_count: int = 5, threshold: int = 10
) -> pd.Series:
    r"""Tokenize and group collocation words together.

    Tokenize the given Pandas Series, then group together bigrams in which
    each token has a term frequency of at least `min_count` and whose score,
    computed with the formula below, exceeds `threshold`:

    :math:`\frac{(bigram\_a\_b\_count - min\_count) \cdot len\_vocab}{word\_a\_count \cdot word\_b\_count}`

    Parameters
    ----------
    s : Pandas Series

    min_count : int, optional, default=5
        Ignore tokens with a frequency lower than this.

    threshold : int, optional, default=10
        Ignore bigrams with a score lower than this threshold.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["New York is a beautiful city", "Look: New York!"])
    >>> hero.tokenize_with_phrases(s, min_count=1, threshold=1)
    0    [New_York, is, a, beautiful, city]
    1                [Look, :, New_York, !]
    dtype: object

    References
    ----------
    `Mikolov et al.: "Distributed Representations of Words and Phrases and
    their Compositionality" <https://arxiv.org/abs/1310.4546>`_
    """
    if not isinstance(s.iloc[0], str):
        raise ValueError("Input Series must contain strings.")

    s = tokenize(s)
    phrases = PhrasesTransformer(min_count=min_count, threshold=threshold)
    return pd.Series(phrases.fit_transform(s.values), index=s.index)
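# A minimal sketch, assuming texthero exposes both functions above: with the
# same parameters, tokenize_with_phrases() should match tokenize() followed
# by phrases(), since both delegate to gensim's PhrasesTransformer with the
# default "_" delimiter.
import pandas as pd
import texthero as hero

s = pd.Series(["New York is a beautiful city", "Look: New York!"])
combined = hero.tokenize_with_phrases(s, min_count=1, threshold=1)
stepwise = hero.phrases(hero.tokenize(s), min_count=1, threshold=1)
print(combined.equals(stepwise))  # expected: True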
# sort to view the most common terms
# the key (lambda x: x[1]) sorts by the count
sorted(window_count.items(), key=lambda x: x[1], reverse=True)[0:20]

# ### Phrase (collocation) Detection
#
# Phrase modeling is another approach to learning combinations of tokens that
# together represent meaningful multi-word concepts. We can develop phrase
# models by looping over the words in our reviews and looking for words that
# co-occur (i.e., appear one after another) much more frequently than we
# would expect them to by random chance. The formula our phrase models will
# use to determine whether two tokens $A$ and $B$ constitute a phrase is:
#
# $$\frac{(count(A, B) - count_{min}) \times N}{count(A) \times count(B)} > threshold$$
#
# where $N$ is the vocabulary size and $count_{min}$ corresponds to the
# `min_count` parameter.

# ##### Scikit-learn API for Gensim
from gensim.sklearn_api.phrases import PhrasesTransformer

sklearn_phrases = PhrasesTransformer(min_count=3, threshold=3)
sklearn_phrases.fit(matched_sents)
# sklearn_phrases.transform(matched_sents)

print(matched_sents)

# review phrase matches: keep only terms that join three or more tokens
phrases = []
for terms in sklearn_phrases.transform(matched_sents):
    for term in terms:
        if term.count('_') >= 2:
            phrases.append(term)

print(set(phrases))

# create a list of stop words
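# A minimal sketch that computes the phrase score described in the
# "Phrase (collocation) Detection" section above by hand on a toy corpus;
# the corpus and variable names here are illustrative.
from collections import Counter

toy_sents = [
    ["new", "york", "is", "a", "beautiful", "city"],
    ["look", "new", "york"],
    ["new", "york", "at", "night"],
]

unigrams = Counter(tok for sent in toy_sents for tok in sent)
bigrams = Counter(pair for sent in toy_sents for pair in zip(sent, sent[1:]))

min_count, threshold = 1, 1
vocab_size = len(unigrams)

# score(A, B) = (count(A, B) - min_count) * vocab_size / (count(A) * count(B))
score = (bigrams[("new", "york")] - min_count) * vocab_size / (
    unigrams["new"] * unigrams["york"]
)
print(score, score > threshold)  # 2.0 True -> "new york" would be merged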
def extract_bigrams(self, X, y=None):
    # run gensim phrase detection and return the full token lists,
    # with detected collocations joined by "_"
    return PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)
def transform(self, X, y=None):
    # keep only the tokens that phrase detection joined with "_",
    # i.e. the detected collocations, for each document
    return [
        list(filter(lambda x: '_' in x, ls))
        for ls in PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)
    ]
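# A minimal sketch of a transformer class the two methods above could belong
# to, assuming gensim 3.x (where gensim.sklearn_api is available); the class
# name, __init__, and fit() are assumptions for illustration only.
from gensim.sklearn_api.phrases import PhrasesTransformer
from sklearn.base import BaseEstimator, TransformerMixin


class BigramExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, phrases_kwargs):
        # keyword arguments forwarded to PhrasesTransformer,
        # e.g. {"min_count": 5, "threshold": 10}
        self.phrases_kwargs = phrases_kwargs

    def fit(self, X, y=None):
        return self

    def extract_bigrams(self, X, y=None):
        # full token lists with detected collocations joined by "_"
        return PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)

    def transform(self, X, y=None):
        # only the joined collocations per document
        return [
            list(filter(lambda x: '_' in x, ls))
            for ls in PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)
        ]


# usage sketch:
# extractor = BigramExtractor(phrases_kwargs={"min_count": 5, "threshold": 10})
# bigrams_only = extractor.fit_transform(tokenized_docs)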