Example #1
0
def extract_significant_terms_from_subset(data_frame: pandas.DataFrame, subset_data_frame: pandas.DataFrame,
                                          field_name: str,
                                          vectorizer: CountVectorizer = CountVectorizer(encoding="latin1",
                                                                                        lowercase=True,
                                                                                        max_features=500)) -> pandas.Series:
    """
    Returns interesting or unusual occurrences of terms in a subset.

    Based on the `elasticsearch significant_text aggregation
    <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted>`_

    :param data_frame: the full data set.
    :param subset_data_frame: the subset of the data over which the scoring will be calculated. Can be a filter by
           feature or other boolean criteria.
    :param field_name: the feature to parse.
    :param vectorizer: text count vectorizer which converts collection of text to a matrix of token counts. See more
                       info `here <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_ .
    :return: Series of terms with scoring over the subset.

    :author: `Eran Hirsch <https://github.com/eranhirs>`_
    """
    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    subset_X = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=vectorizer.get_feature_names_out())

    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()

    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)
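# Illustrative usage sketch (not part of the original snippet): the toy data and the
# column name "text" are assumptions, just to show how the function is called.
import pandas

toy_df = pandas.DataFrame({"text": [
    "python pandas dataframe groupby",
    "python lists and dicts",
    "pandas dataframe merge join",
    "generic python tutorial",
]})
toy_subset = toy_df[toy_df["text"].str.contains("pandas")]

scores = extract_significant_terms_from_subset(toy_df, toy_subset, "text")
print(scores.head())  # terms such as "pandas" and "dataframe" score highest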
Example #2
0
 def get_feature_names_out(self, n_labels=3, prefix=''):
     """
     Returns the labels that best summarize the learned components/topics.
     For each topic, labels with highest activations are selected.
     
     Parameters
     ----------
     
     n_labels : int, default=3
         The number of labels used to describe each topic.
     
     Returns
     -------
     
     topic_labels : list of strings
         The labels that best describe each topic.
     
     """
     vectorizer = CountVectorizer()
     vectorizer.fit(list(self.H_dict_.keys()))
     if LooseVersion(sklearn_version) < LooseVersion('1.0'):
         vocabulary = np.array(vectorizer.get_feature_names())
     else:
         vocabulary = np.array(vectorizer.get_feature_names_out())
     encoding = self.transform(np.array(vocabulary).reshape(-1))
     encoding = abs(encoding)
     encoding = encoding / np.sum(encoding, axis=1, keepdims=True)
     n_components = encoding.shape[1]
     topic_labels = []
     for i in range(n_components):
         x = encoding[:, i]
         labels = vocabulary[np.argsort(-x)[:n_labels]]
         topic_labels.append(labels)
     topic_labels = [prefix + ', '.join(label) for label in topic_labels]
     return topic_labels
Example #3
0
class CountVectorFeatureExtractor(BaseTransformer):
    def __init__(self, col):
        self.vec_count = CountVectorizer()
        self.col = col

    def fit(self, df):
        self.vec_count.fit(df[self.col].values)
        return self

    def transform(self, df):
        X = self.vec_count.transform(df[self.col].values)
        return pd.DataFrame(X.toarray(),
                            columns=self.vec_count.get_feature_names_out())

    def fit_transform(self, df, y=None, **fit_params):
        # Fit the vectorizer once and reuse transform instead of duplicating the logic.
        return self.fit(df).transform(df)
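# Illustrative usage sketch (assumption: pandas is imported as pd, as in the class above;
# the "comment" column and the toy rows are made up).
import pandas as pd

toy_df = pd.DataFrame({"comment": ["good product", "bad service", "good service"]})
extractor = CountVectorFeatureExtractor(col="comment")
counts = extractor.fit_transform(toy_df)
print(counts)  # one column per token: bad, good, product, service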
Example #4
0
def append_tags_to_frame(X_train: pandas.DataFrame, X_test: pandas.DataFrame, field_name: str,
                         prefix: Optional[str] = "", max_features: Optional[int] = 500, min_df: Union[int, float] = 1,
                         lowercase=False, tokenizer: Optional[Callable[[str], List[str]]] = _tokenize) -> Tuple[
    pandas.DataFrame, pandas.DataFrame]:
    """
    Extracts tags from a given field and appends them as dataframe columns.

    :param X_train: Pandas' dataframe with the train features.
    :param X_test: Pandas' dataframe with the test features.
    :param field_name: the feature to parse.
    :param prefix: the prefix for the new tag features.
    :param max_features: int or None, default=500.
           max number of tag names to consider.
    :param min_df: float in range [0.0, 1.0] or int, default=1.
           When building the tag name set, ignore tags that have a document frequency strictly lower than the given
           threshold. If float, the parameter represents a proportion of documents; if integer, absolute counts.
    :param lowercase: boolean, default=False.
           Convert all characters to lowercase before tokenizing the tag names.
    :param tokenizer: callable or None.
           Override the string tokenization step while preserving the preprocessing and n-grams generation steps.
           The default splits by "," and retains alphanumeric characters plus the special characters "_", "$" and "-".
    :return: the train and test with tags appended.
    """
    vectorizer = CountVectorizer(binary=True, tokenizer=tokenizer, encoding="latin1", lowercase=lowercase,
                                 min_df=min_df, max_features=max_features)
    x_train_count_matrix = vectorizer.fit_transform(X_train[field_name].dropna())
    x_train_tags = pandas.DataFrame(x_train_count_matrix.toarray(),
                                    columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
    x_train_tags.index = X_train.index

    x_test_count_matrix = vectorizer.transform(X_test[field_name].dropna())
    x_test_tags = pandas.DataFrame(x_test_count_matrix.toarray(),
                                   columns=[prefix + tag_name for tag_name in vectorizer.get_feature_names_out()])
    x_test_tags.index = X_test.index

    x_train_reduced = X_train.drop(columns=[field_name])
    x_test_reduced = X_test.drop(columns=[field_name])

    return pandas.merge(x_train_reduced, x_train_tags, left_index=True, right_index=True, how="left"), pandas.merge(
        x_test_reduced, x_test_tags, left_index=True, right_index=True, how="left")
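# Illustrative usage sketch (not from the original source): the "tags" column, the
# "tag_" prefix and the toy frames are assumptions; the default tokenizer splits
# on "," as documented above.
import pandas

X_train = pandas.DataFrame({"id": [1, 2], "tags": ["python,pandas", "python,numpy"]})
X_test = pandas.DataFrame({"id": [3], "tags": ["pandas,numpy"]})

train_tagged, test_tagged = append_tags_to_frame(X_train, X_test, "tags", prefix="tag_")
print(train_tagged.columns.tolist())  # e.g. ['id', 'tag_numpy', 'tag_pandas', 'tag_python']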
Example #5
0
def _hypergeom_clusters(
    cluster_labels: np.ndarray, keywords: List[List[str]],
    fdr_threshold: float, n_words: int
) -> Tuple[Dict[int, List[str]], np.ndarray, np.ndarray, np.ndarray]:
    keywords = [[w for w, _ in doc_keywords] for doc_keywords in keywords]

    clusters_keywords = {}
    for label in sorted(set(cluster_labels) - {-1}):
        indices = set(np.flatnonzero(cluster_labels == label))
        kwds = [k for i, k in enumerate(keywords) if i in indices]
        clusters_keywords[label] = kwds

    cv = CountVectorizer(tokenizer=lambda w: w, preprocessor=lambda w: w)
    X = cv.fit_transform(list(chain.from_iterable(clusters_keywords.values())))
    all_keywords = np.array(cv.get_feature_names_out())

    index = 0
    selected_clusters_keywords = {}
    all_scores, all_p_values = [], []
    for label, cls_kwds in clusters_keywords.items():
        # find words that should be specific for a group with hypergeom test
        n_docs = len(cls_kwds)
        p_values = hypergeom_p_values(X, X[index:index + n_docs])
        words = set(all_keywords[np.array(p_values) < fdr_threshold])

        # select only words with p-values less than threshold
        sel_words = [w for w in chain.from_iterable(cls_kwds)]
        sel_words = [w for w in sel_words if w in words]
        sel_words = [(w, c / n_docs)
                     for w, c in Counter(sel_words).most_common(n_words)]
        selected_clusters_keywords[label] = sel_words

        all_scores.append(X[index:index + n_docs].sum(axis=0) / n_docs)
        all_p_values.append(p_values)

        index += n_docs

    all_scores = np.vstack(all_scores)
    all_p_values = np.vstack(all_p_values)
    return selected_clusters_keywords, all_keywords, all_scores, all_p_values
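# The hypergeom_p_values helper is not shown above. As a rough sketch of the idea,
# a single term's enrichment p-value can be computed with scipy.stats.hypergeom
# (the counts below are made up for illustration).
from scipy.stats import hypergeom

# 1000 documents in total, 50 contain the word; the cluster has 40 documents,
# 12 of which contain the word. sf(k - 1) gives P(X >= k) under the null.
p_value = hypergeom(M=1000, n=50, N=40).sf(12 - 1)
print(p_value)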
Example #6
0
''' Text Feature Extraction '''
# Using CountVectorizer

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text = [
    "In Jigsaw's fourth Kaggle competition",
    "When we ask human judges to look at", "to decide which ones are toxic and"
]

counterVec = CountVectorizer()
counterVec.fit(text)
print("Get Feature Names")
print(counterVec.get_feature_names_out())
print("The number of feature is {}".format(
    len(counterVec.get_feature_names_out())))

extracted_features = counterVec.transform(text)
print(extracted_features.toarray().shape)
print(extracted_features.toarray())
Example #7
0
data_cleaned = []
for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word) for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english",
                               max_features=None,
                               max_df=0.5,
                               min_df=2)

data = count_vector.fit_transform(data_cleaned)

from sklearn.decomposition import LatentDirichletAllocation

t = 20
lda = LatentDirichletAllocation(n_components=t,
                                learning_method='batch',
                                random_state=42)

lda.fit(data)

print(lda.components_)

terms = count_vector.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic {}:".format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))
Example #8
0
 def _ngrams_split(self, text):
     
     vectorizer = CountVectorizer(ngram_range=self.ngram_range)
     _ = vectorizer.fit_transform([text])
     
     return list(vectorizer.get_feature_names_out())
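# Equivalent standalone sketch of what the method above returns, assuming
# ngram_range=(1, 2); the sample sentence is made up.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer.fit_transform(["the quick brown fox"])
print(list(vectorizer.get_feature_names_out()))
# ['brown', 'brown fox', 'fox', 'quick', 'quick brown', 'the', 'the quick']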
Example #9
0
        pickle.dump((result, names), fd)
    pass


os.makedirs(sys.argv[2], exist_ok=True)

# Generate train feature matrix
df_train = get_df(train_input)
train_words = np.array(df_train.text.str.lower().values.astype("U"))

bag_of_words = CountVectorizer(
    stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
)

bag_of_words.fit(train_words)
train_words_binary_matrix = bag_of_words.transform(train_words)
feature_names = bag_of_words.get_feature_names_out()
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(train_words_binary_matrix)
train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)

# Generate test feature matrix
df_test = get_df(test_input)
test_words = np.array(df_test.text.str.lower().values.astype("U"))
test_words_binary_matrix = bag_of_words.transform(test_words)
test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)

save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)
Example #10
0
print("Fitting the NMF model (generalized Kullback-Leibler divergence) "
      "with tf-idf features, n_samples=%d and n_features=%d..." %
      (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components,
          random_state=1,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
          alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words,
    'Topics in NMF model (generalized Kullback-Leibler divergence)')

print(
    '\n' * 2, "Fitting LDA models with tf features, "
    "n_samples=%d and n_features=%d..." % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
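# If plot_top_words is unavailable, the same per-topic summary can be printed
# directly (a sketch, reusing n_top_words and the fitted models from above).
import numpy as np

for topic_idx, topic in enumerate(lda.components_):
    top_indices = np.argsort(topic)[::-1][:n_top_words]
    print("Topic %d: %s" % (topic_idx, " ".join(tf_feature_names[top_indices])))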
Example #11
0
        1,
        len(abstracts),
        value=3)
    st.markdown("### NMF clusters")
    H1, W1 = run_model(vectors, "nmf", n_clusters)
    r = show_topics(H1, num_top_words)
    r
    st.markdown("Membership:")
    for i in range(n_clusters):
        m = get_members_for_cluster(W1, i)
        st.write(f"Cluster {i}: {m}")
    add_abstract_viewer(abstracts, 2)

    st.markdown("### TF-IDF Clusters")
    H1, W1 = run_model(vectors, "tfidf", n_clusters)
    r = show_topics(H1, num_top_words)
    r
    st.markdown("Membership:")
    for i in range(n_clusters):
        m = get_members_for_cluster(W1, i)
        st.write(f"Cluster {i}: {m}")
    add_abstract_viewer(abstracts, 3)


abstracts = get_data()
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(abstracts).todense()
vocab = np.array(vectorizer.get_feature_names_out())

main(abstracts=abstracts, vectors=vectors)
Example #12
0
class GapEncoderColumn(BaseEstimator, TransformerMixin):
    """See GapEncoder's docstring."""
    def __init__(self,
                 n_components=10,
                 batch_size=128,
                 gamma_shape_prior=1.1,
                 gamma_scale_prior=1.0,
                 rho=.95,
                 rescale_rho=False,
                 hashing=False,
                 hashing_n_features=2**12,
                 init='k-means++',
                 tol=1e-4,
                 min_iter=2,
                 max_iter=5,
                 ngram_range=(2, 4),
                 analyzer='char',
                 add_words=False,
                 random_state=None,
                 rescale_W=True,
                 max_iter_e_step=20):

        self.ngram_range = ngram_range
        self.n_components = n_components
        self.gamma_shape_prior = gamma_shape_prior  # 'a' parameter
        self.gamma_scale_prior = gamma_scale_prior  # 'b' parameter
        self.rho = rho
        self.rho_ = self.rho
        self.rescale_rho = rescale_rho
        self.batch_size = batch_size
        self.tol = tol
        self.hashing = hashing
        self.hashing_n_features = hashing_n_features
        self.max_iter = max_iter
        self.min_iter = min_iter
        self.init = init
        self.analyzer = analyzer
        self.add_words = add_words
        self.random_state = check_random_state(random_state)
        self.rescale_W = rescale_W
        self.max_iter_e_step = max_iter_e_step

    def _init_vars(self, X):
        """
        Build the bag-of-n-grams representation V of X and initialize
        the topics W.
        """
        # Init n-grams counts vectorizer
        if self.hashing:
            self.ngrams_count_ = HashingVectorizer(
                analyzer=self.analyzer,
                ngram_range=self.ngram_range,
                n_features=self.hashing_n_features,
                norm=None,
                alternate_sign=False)
            if self.add_words:  # Init a word counts vectorizer if needed
                self.word_count_ = HashingVectorizer(
                    analyzer='word',
                    n_features=self.hashing_n_features,
                    norm=None,
                    alternate_sign=False)
        else:
            self.ngrams_count_ = CountVectorizer(analyzer=self.analyzer,
                                                 ngram_range=self.ngram_range,
                                                 dtype=np.float64)
            if self.add_words:
                self.word_count_ = CountVectorizer(dtype=np.float64)

        # Init H_dict_ with empty dict to train from scratch
        self.H_dict_ = dict()
        # Build the n-grams counts matrix unq_V on unique elements of X
        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count_.fit_transform(unq_X)
        if self.add_words:  # Add word counts to unq_V
            unq_V2 = self.word_count_.fit_transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        if not self.hashing:  # Build n-grams/word vocabulary
            if LooseVersion(sklearn_version) < LooseVersion('1.0'):
                self.vocabulary = self.ngrams_count_.get_feature_names()
            else:
                self.vocabulary = self.ngrams_count_.get_feature_names_out()
            if self.add_words:
                if LooseVersion(sklearn_version) < LooseVersion('1.0'):
                    self.vocabulary = np.concatenate(
                        (self.vocabulary,
                         self.word_count_.get_feature_names()))
                else:
                    self.vocabulary = np.concatenate(
                        (self.vocabulary,
                         self.word_count_.get_feature_names_out()))
        _, self.n_vocab = unq_V.shape
        # Init the topics W given the n-grams counts V
        self.W_, self.A_, self.B_ = self._init_w(unq_V[lookup], X)
        # Init the activations unq_H of each unique input string
        unq_H = _rescale_h(unq_V, np.ones((len(unq_X), self.n_components)))
        # Update self.H_dict_ with unique input strings and their activations
        self.H_dict_.update(zip(unq_X, unq_H))
        if self.rescale_rho:
            # Make update rate per iteration independent of the batch_size
            self.rho_ = self.rho**(self.batch_size / len(X))
        return unq_X, unq_V, lookup

    def _get_H(self, X):
        """
        Return the bag-of-n-grams representation of X.
        """
        H_out = np.empty((len(X), self.n_components))
        for x, h_out in zip(X, H_out):
            h_out[:] = self.H_dict_[x]
        return H_out

    def _init_w(self, V, X):
        """
        Initialize the topics W.
        If self.init='k-means++', we use the init method of
        sklearn.cluster.KMeans.
        If self.init='random', topics are initialized with a Gamma
        distribution.
        If self.init='k-means', topics are initialized with a KMeans on the
        n-grams counts.
        """
        if self.init == 'k-means++':
            if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                W = _k_init(V,
                            self.n_components,
                            x_squared_norms=row_norms(V, squared=True),
                            random_state=self.random_state,
                            n_local_trials=None) + .1
            else:
                W, _ = kmeans_plusplus(V,
                                       self.n_components,
                                       x_squared_norms=row_norms(V,
                                                                 squared=True),
                                       random_state=self.random_state,
                                       n_local_trials=None)
                W = W + .1  # To avoid restricting topics to few n-grams only
        elif self.init == 'random':
            W = self.random_state.gamma(shape=self.gamma_shape_prior,
                                        scale=self.gamma_scale_prior,
                                        size=(self.n_components, self.n_vocab))
        elif self.init == 'k-means':
            prototypes = get_kmeans_prototypes(X,
                                               self.n_components,
                                               analyzer=self.analyzer,
                                               random_state=self.random_state)
            W = self.ngrams_count_.transform(prototypes).A + .1
            if self.add_words:
                W2 = self.word_count_.transform(prototypes).A + .1
                W = np.hstack((W, W2))
            # if k-means doesn't find the exact number of prototypes
            if W.shape[0] < self.n_components:
                if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                    W2 = _k_init(V,
                                 self.n_components - W.shape[0],
                                 x_squared_norms=row_norms(V, squared=True),
                                 random_state=self.random_state,
                                 n_local_trials=None) + .1
                else:
                    W2, _ = kmeans_plusplus(V,
                                            self.n_components - W.shape[0],
                                            x_squared_norms=row_norms(
                                                V, squared=True),
                                            random_state=self.random_state,
                                            n_local_trials=None)
                    W2 = W2 + .1
                W = np.concatenate((W, W2), axis=0)
        else:
            raise AttributeError('Initialization method %s does not exist.' %
                                 self.init)
        W /= W.sum(axis=1, keepdims=True)
        A = np.ones((self.n_components, self.n_vocab)) * 1e-10
        B = A.copy()
        return W, A, B

    def fit(self, X, y=None):
        """
        Fit the GapEncoder on batches of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, )
            The string data to fit the model on.
        
        Returns
        -------
        self
        """
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        # Make n-grams counts matrix unq_V
        unq_X, unq_V, lookup = self._init_vars(X)
        n_batch = (len(X) - 1) // self.batch_size + 1
        del X
        # Get activations unq_H
        unq_H = self._get_H(unq_X)

        for n_iter_ in range(self.max_iter):
            # Loop over batches
            for i, (unq_idx,
                    idx) in enumerate(batch_lookup(lookup, n=self.batch_size)):
                if i == n_batch - 1:
                    W_last = self.W_.copy()
                # Update the activations unq_H
                unq_H[unq_idx] = _multiplicative_update_h(
                    unq_V[unq_idx],
                    self.W_,
                    unq_H[unq_idx],
                    epsilon=1e-3,
                    max_iter=self.max_iter_e_step,
                    rescale_W=self.rescale_W,
                    gamma_shape_prior=self.gamma_shape_prior,
                    gamma_scale_prior=self.gamma_scale_prior)
                # Update the topics self.W_
                _multiplicative_update_w(unq_V[idx], self.W_, self.A_, self.B_,
                                         unq_H[idx], self.rescale_W, self.rho_)

                if i == n_batch - 1:
                    # Compute the norm of the update of W in the last batch
                    W_change = np.linalg.norm(self.W_ -
                                              W_last) / np.linalg.norm(W_last)

            if (W_change < self.tol) and (n_iter_ >= self.min_iter - 1):
                break  # Stop if the change in W is smaller than the tolerance

        # Update self.H_dict_ with the learned encoded vectors (activations)
        self.H_dict_.update(zip(unq_X, unq_H))
        return self

    def get_feature_names(self, n_labels=3, prefix=''):
        """ Deprecated, use "get_feature_names_out"
        """
        warnings.warn(
            "get_feature_names is deprecated in scikit-learn > 1.0. "
            "use get_feature_names_out instead",
            DeprecationWarning,
        )
        return self.get_feature_names_out(n_labels=n_labels, prefix=prefix)

    def get_feature_names_out(self, n_labels=3, prefix=''):
        """
        Returns the labels that best summarize the learned components/topics.
        For each topic, labels with highest activations are selected.
        
        Parameters
        ----------
        
        n_labels : int, default=3
            The number of labels used to describe each topic.
        
        Returns
        -------
        
        topic_labels : list of strings
            The labels that best describe each topic.
        
        """
        vectorizer = CountVectorizer()
        vectorizer.fit(list(self.H_dict_.keys()))
        if LooseVersion(sklearn_version) < LooseVersion('1.0'):
            vocabulary = np.array(vectorizer.get_feature_names())
        else:
            vocabulary = np.array(vectorizer.get_feature_names_out())
        encoding = self.transform(np.array(vocabulary).reshape(-1))
        encoding = abs(encoding)
        encoding = encoding / np.sum(encoding, axis=1, keepdims=True)
        n_components = encoding.shape[1]
        topic_labels = []
        for i in range(n_components):
            x = encoding[:, i]
            labels = vocabulary[np.argsort(-x)[:n_labels]]
            topic_labels.append(labels)
        topic_labels = [prefix + ', '.join(label) for label in topic_labels]
        return topic_labels

    def score(self, X):
        """
        Returns the Kullback-Leibler divergence between the n-grams counts
        matrix V of X, and its non-negative factorization HW.

        Parameters
        ----------
        X : array-like (str), shape (n_samples, )
            The data to encode.

        Returns
        -------
        kl_divergence : float.
            The Kullback-Leibler divergence.
        """
        # Build n-grams/word counts matrix
        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count_.transform(unq_X)
        if self.add_words:
            unq_V2 = self.word_count_.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        # Given the learnt topics W, optimize the activations H to fit V = HW
        for slice in gen_batches(n=unq_H.shape[0], batch_size=self.batch_size):
            unq_H[slice] = _multiplicative_update_h(
                unq_V[slice],
                self.W_,
                unq_H[slice],
                epsilon=1e-3,
                max_iter=self.max_iter_e_step,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        # Compute the KL divergence between V and HW
        kl_divergence = _beta_divergence(unq_V[lookup],
                                         unq_H[lookup],
                                         self.W_,
                                         'kullback-leibler',
                                         square_root=False)
        return kl_divergence

    def partial_fit(self, X, y=None):
        """
        Partial fit of the GapEncoder on X.
        To be used in a online learning procedure where batches of data are
        coming one by one.

        Parameters
        ----------
        X : array-like, shape (n_samples, )
            The string data to fit the model on.
        
        Returns
        -------
        self
        
        """

        # Init H_dict_ with empty dict if it's the first call of partial_fit
        if not hasattr(self, 'H_dict_'):
            self.H_dict_ = dict()
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        # Check if it is not the first batch
        if hasattr(self, 'vocabulary'):  # Update unq_X, unq_V with new batch
            unq_X, lookup = np.unique(X, return_inverse=True)
            unq_V = self.ngrams_count_.transform(unq_X)
            if self.add_words:
                unq_V2 = self.word_count_.transform(unq_X)
                unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

            unseen_X = np.setdiff1d(unq_X, np.array([*self.H_dict_]))
            unseen_V = self.ngrams_count_.transform(unseen_X)
            if self.add_words:
                unseen_V2 = self.word_count_.transform(unseen_X)
                unseen_V = sparse.hstack((unseen_V, unseen_V2), format='csr')

            if unseen_V.shape[0] != 0:
                unseen_H = _rescale_h(
                    unseen_V, np.ones((len(unseen_X), self.n_components)))
                for x, h in zip(unseen_X, unseen_H):
                    self.H_dict_[x] = h
                del unseen_H
            del unseen_X, unseen_V
        else:  # If it is the first batch, call _init_vars to init unq_X, unq_V
            unq_X, unq_V, lookup = self._init_vars(X)

        unq_H = self._get_H(unq_X)
        # Update the activations unq_H
        unq_H = _multiplicative_update_h(
            unq_V,
            self.W_,
            unq_H,
            epsilon=1e-3,
            max_iter=self.max_iter_e_step,
            rescale_W=self.rescale_W,
            gamma_shape_prior=self.gamma_shape_prior,
            gamma_scale_prior=self.gamma_scale_prior)
        # Update the topics self.W_
        _multiplicative_update_w(unq_V[lookup], self.W_, self.A_, self.B_,
                                 unq_H[lookup], self.rescale_W, self.rho_)
        # Update self.H_dict_ with the learned encoded vectors (activations)
        self.H_dict_.update(zip(unq_X, unq_H))
        return self

    def _add_unseen_keys_to_H_dict(self, X):
        """
        Add activations of unseen string categories from X to H_dict.
        """
        unseen_X = np.setdiff1d(X, np.array([*self.H_dict_]))
        if unseen_X.size > 0:
            unseen_V = self.ngrams_count_.transform(unseen_X)
            if self.add_words:
                unseen_V2 = self.word_count_.transform(unseen_X)
                unseen_V = sparse.hstack((unseen_V, unseen_V2), format='csr')

            unseen_H = _rescale_h(
                unseen_V, np.ones((unseen_V.shape[0], self.n_components)))
            self.H_dict_.update(zip(unseen_X, unseen_H))

    def transform(self, X):
        """
        Return the encoded vectors (activations) H of input strings in X.
        Given the learnt topics W, the activations H are tuned to fit V = HW.

        Parameters
        ----------
        X : array-like, shape (n_samples)
            The string data to encode.

        Returns
        -------
        H : 2-d array, shape (n_samples, n_topics)
            Transformed input.
        """
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        unq_X = np.unique(X)
        # Build the n-grams counts matrix V for the string data to encode
        unq_V = self.ngrams_count_.transform(unq_X)
        if self.add_words:  # Add words counts
            unq_V2 = self.word_count_.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')
        # Add unseen strings in X to H_dict
        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        # Loop over batches
        for slice in gen_batches(n=unq_H.shape[0], batch_size=self.batch_size):
            # Given the learnt topics W, optimize H to fit V = HW
            unq_H[slice] = _multiplicative_update_h(
                unq_V[slice],
                self.W_,
                unq_H[slice],
                epsilon=1e-3,
                max_iter=100,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        # Store and return the encoded vectors of X
        self.H_dict_.update(zip(unq_X, unq_H))
        return self._get_H(X)
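# Illustrative usage sketch (not from the original source): assumes the module's
# private helpers (_multiplicative_update_h, _rescale_h, batch_lookup, ...) are
# importable alongside the class; the strings are made-up categorical values.
import numpy as np

X = np.array(["london", "londres", "paris", "new york", "york"])
enc = GapEncoderColumn(n_components=2, random_state=0)
enc.fit(X)
activations = enc.transform(X)                 # shape: (5, 2)
print(enc.get_feature_names_out(n_labels=3))   # n-gram labels describing each topic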
Example #13
0
corpus = [
  'ke tla cheka harddrive at home pc crashed phone dead so you just hoping for miracles brother',
  'did i crash at pm yesterday and woke up just now',
  'its important to uphold the most righteous action not just something akin to islam or that doesnt contra',
  'minago earthquake has hit canas barrio ponce puerto rico mi am ast rspr',
  'earthquakes in puerto rico in the last days the earth keeps shaking families are sleeping on the st',
  'it isnt racism is a sad fact of life your difference will be exploited by ignora',
  'i learnt my trade here bl back then i can see at least of my old offices workshops in this photo sad to see it g',
  'hachisan leon and claire they fight bioterrorism and have busy days thats why they value the time they spend',
  'usmexico border portal for tb swine flu bioterrorism via',
  'this comment section is f*****g cancer this laila person is probably wrong vaushs point was misunderstoo',
]

# create a CountVectorizer and a sparse matrix representing the sentences
cv = CountVectorizer(stop_words='english')
array = cv.fit_transform(corpus).toarray()
pd.DataFrame(array, columns=cv.get_feature_names_out())

# function to build a dataframe with the similarity values, for better visualization
def create_dataframe(matrix, tokens):
    doc_names = [f'{tokens[i]}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens)
    return df

# compute the cosine similarity between the sentences in the array
cosine_similarity_matrix = cosine_similarity(array)
results = create_dataframe(cosine_similarity_matrix, corpus)
results

"""**a)** Representação vetorial TF-IDF com similaridade do cosseno

---
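# Sketch of the TF-IDF variant referenced in the heading above (reuses corpus,
# create_dataframe and cosine_similarity from the previous cell).
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_array = tfidf.fit_transform(corpus).toarray()

tfidf_similarity = cosine_similarity(tfidf_array)
create_dataframe(tfidf_similarity, corpus)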
Example #14
0
data_cleaned = []
for doc in groups.data:
    doc_cleaned = ' '.join(word for word in doc.split() if word.isalpha())
    data_cleaned.append(doc_cleaned)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

from nltk.corpus import names
all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []

for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word) for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

data_cleaned_count = count_vector_sw.fit_transform(data_cleaned)

print(count_vector_sw.get_feature_names_out())

Example #15
0
## print(Counter(toks))

#### Word Stemming
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in toks]
print(stemmed)


print(len(toks))
print(len(stemmed))


#### Count Vectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw)
print(vectorizer.get_feature_names_out())
X = X.toarray()

ingr_dict = {}
for i in range(0, X.shape[1]):
    for j, ingredient in enumerate(vectorizer.get_feature_names_out()):
        if ingredient not in ingr_dict:
            ingr_dict[ingredient] = 1
            
        print(ingredient)


# for i, ingredient in enumerate(vectorizer.get_feature_names_out()):
#     print(ingredient)
#     print(len(X.toarray()[i]))
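# Sketch combining the stemming step with the vectorizer via a custom tokenizer.
# Assumptions: raw is the same list of strings used above, and nltk's word_tokenize
# stands in for however toks was produced.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

porter = PorterStemmer()

def stem_tokenizer(text):
    return [porter.stem(token) for token in word_tokenize(text)]

stem_vectorizer = CountVectorizer(tokenizer=stem_tokenizer)
X_stemmed = stem_vectorizer.fit_transform(raw)
print(stem_vectorizer.get_feature_names_out())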