def extract_significant_terms_from_subset(data_frame: pandas.DataFrame, subset_data_frame: pandas.DataFrame,
                                          field_name: str,
                                          vectorizer: Optional[CountVectorizer] = None) -> pandas.Series:
    """
    Returns interesting or unusual occurrences of terms in a subset.

    Based on the `elasticsearch significant_text aggregation
    <https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html#_scripted>`_

    The score of a term is ``subset_count / (superset_count - subset_count + 1)``, where the counts are the
    term totals in the subset and in the full data set respectively.

    :param data_frame: the full data set.
    :param subset_data_frame: the subset partition of the data over which the scoring will be calculated.
            Can be a filter by feature or any other boolean criteria.
    :param field_name: the feature to parse.
    :param vectorizer: text count vectorizer which converts a collection of text to a matrix of token counts.
            It is (re)fitted on ``data_frame[field_name]`` by this call. When ``None`` (the default), a new
            ``CountVectorizer(encoding="latin1", lowercase=True, max_features=500)`` is created.
            See more info `here
            <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_ .
    :return: Series of terms with scoring over the subset, sorted from the highest score to the lowest.

    :author: `Eran Hirsch <https://github.com/eranhirs>`_
    """
    if vectorizer is None:
        # Build the default per call: a vectorizer created in the signature would be a single object that
        # every call refits and shares.
        vectorizer = CountVectorizer(encoding="latin1", lowercase=True, max_features=500)

    # Learn the vocabulary on the full data set, then count the same terms in the subset.
    count_matrix = vectorizer.fit_transform(data_frame[field_name].dropna())
    feature_names = vectorizer.get_feature_names_out()
    matrix_df = pandas.DataFrame(count_matrix.toarray(), columns=feature_names)

    subset_X = vectorizer.transform(subset_data_frame[field_name].dropna())
    subset_matrix_df = pandas.DataFrame(subset_X.toarray(), columns=feature_names)

    subset_freq = subset_matrix_df.sum()
    superset_freq = matrix_df.sum()

    # "+ 1" keeps the denominator non-zero for terms that appear only in the subset.
    return (subset_freq / (superset_freq - subset_freq + 1)).sort_values(ascending=False)
def get_feature_names_out(self, n_labels=3, prefix=''):
    """
    Returns the labels that best summarize the learned components/topics.
    For each topic, labels with highest activations are selected.

    Parameters
    ----------
    n_labels : int, default=3
        The number of labels used to describe each topic.
    prefix : str, default=''
        Text prepended to every returned topic label.

    Returns
    -------
    topic_labels : list of strings
        The labels that best describe each topic.
    """
    # Candidate labels are the words found in the strings seen so far (keys of H_dict_).
    word_vectorizer = CountVectorizer()
    word_vectorizer.fit(list(self.H_dict_.keys()))
    if LooseVersion(sklearn_version) < LooseVersion('1.0'):
        words = word_vectorizer.get_feature_names()
    else:
        words = word_vectorizer.get_feature_names_out()
    vocabulary = np.array(words)

    # Encode every candidate word and normalise each row so activations sum to one per word.
    activations = abs(self.transform(np.array(vocabulary).reshape(-1)))
    activations = activations / np.sum(activations, axis=1, keepdims=True)

    # For each topic (column), keep the n_labels words with the largest activation.
    best_words_per_topic = [
        vocabulary[np.argsort(-activations[:, topic])[:n_labels]]
        for topic in range(activations.shape[1])
    ]
    return [prefix + ', '.join(best_words) for best_words in best_words_per_topic]
class CountVectorFeatureExtractor(BaseTransformer):
    """Bag-of-words features for one text column of a DataFrame.

    Wraps a ``CountVectorizer`` and returns the token counts as a DataFrame with one
    column per vocabulary term.
    """

    def __init__(self, col):
        # col: name of the DataFrame column holding the raw text.
        self.vec_count = CountVectorizer()
        self.col = col

    def fit(self, df):
        """Learn the vocabulary from ``df[self.col]`` and return ``self`` so calls can be chained."""
        self.vec_count.fit(df[self.col].values)
        return self

    def transform(self, df):
        """Return the token counts of ``df[self.col]`` as a DataFrame.

        Columns are the fitted vocabulary terms. Rows follow the order of ``df`` but
        carry a fresh default index, not ``df.index``.
        """
        counts = self.vec_count.transform(df[self.col].values)
        return pd.DataFrame(counts.toarray(), columns=self.vec_count.get_feature_names_out())

    def fit_transform(self, df, y=None, **fit_params):
        """Fit on ``df`` and return its transformed counts; ``y`` and ``fit_params`` are ignored."""
        return self.fit(df).transform(df)
def append_tags_to_frame(X_train: pandas.DataFrame, X_test: pandas.DataFrame, field_name: str,
                         prefix: Optional[str] = "", max_features: Optional[int] = 500,
                         min_df: Union[int, float] = 1, lowercase=False,
                         tokenizer: Optional[Callable[[str], List[str]]] = _tokenize) -> Tuple[
    pandas.DataFrame, pandas.DataFrame]:
    """
    Extracts tags from a given field and append them as dataframe.

    The tag vocabulary is learned on the train set only and then applied to the test set. Rows whose
    ``field_name`` value is missing are not tokenized; their tag columns are ``NaN`` in the result.

    :param X_train: Pandas' dataframe with the train features.
    :param X_test: Pandas' dataframe with the test features.
    :param field_name: the feature to parse.
    :param prefix: the given prefix for new tag feature. ``None`` is treated as no prefix.
    :param max_features: int or None, default=500.
           max tags names to consider.
    :param min_df: float in range [0.0, 1.0] or int, default=1.
           When building the tag name set ignore tags that have a document frequency strictly lower than the
           given threshold. If float, the parameter represents a proportion of documents, integer absolute counts.
    :param lowercase: boolean, default=False.
           Convert all characters to lowercase before tokenizing the tag names.
    :param tokenizer: callable or None.
           Override the string tokenization step while preserving the preprocessing and n-grams generation steps.
           Default splits by ",", and retain alphanumeric characters with special characters "_", "$" and "-".
    :return: the train and test with tags appended (and the original ``field_name`` column removed).
    """
    prefix = prefix or ""
    vectorizer = CountVectorizer(binary=True, tokenizer=tokenizer, encoding="latin1", lowercase=lowercase,
                                 min_df=min_df, max_features=max_features)

    # Only non-missing values can be vectorized. Keep their index so each row of the count matrix is
    # attached to the row it came from; assigning the full frame's index fails when values are missing.
    train_text = X_train[field_name].dropna()
    test_text = X_test[field_name].dropna()

    x_train_count_matrix = vectorizer.fit_transform(train_text)
    tag_columns = [prefix + tag_name for tag_name in vectorizer.get_feature_names_out()]
    x_train_tags = pandas.DataFrame(x_train_count_matrix.toarray(), columns=tag_columns,
                                    index=train_text.index)

    x_test_count_matrix = vectorizer.transform(test_text)
    x_test_tags = pandas.DataFrame(x_test_count_matrix.toarray(), columns=tag_columns,
                                   index=test_text.index)

    x_train_reduced = X_train.drop(columns=[field_name])
    x_test_reduced = X_test.drop(columns=[field_name])

    # Left merge keeps every original row, including those without tags.
    return pandas.merge(x_train_reduced, x_train_tags, left_index=True, right_index=True, how="left"), pandas.merge(
        x_test_reduced, x_test_tags, left_index=True, right_index=True, how="left")
def _hypergeom_clusters(
        cluster_labels: np.ndarray, keywords: List[List[str]],
        fdr_threshold: float, n_words: int
) -> Tuple[Dict[int, List[str]], np.ndarray, np.ndarray, np.ndarray]:
    """
    Select, per cluster, the keywords that the hypergeometric test marks as specific to it.

    ``keywords`` holds, for every document, a sequence of ``(word, score)`` pairs; only the
    words are used. Documents labelled ``-1`` are left out.

    Returns a 4-tuple:
    - dict mapping each cluster label to up to ``n_words`` ``(word, share)`` pairs, where
      ``share`` is the word's occurrence count in the cluster divided by the cluster size,
    - array of all keywords (the vectorizer vocabulary),
    - stacked per-cluster scores (column sums of the cluster's rows divided by its size),
    - stacked per-cluster p-values.
    """
    # Drop the scores, keep only the words of every document.
    doc_words = [[word for word, _ in doc_keywords] for doc_keywords in keywords]

    # Group the documents' keyword lists by cluster, in ascending label order.
    clusters_keywords = {}
    for label in sorted(set(cluster_labels) - {-1}):
        members = set(np.flatnonzero(cluster_labels == label))
        clusters_keywords[label] = [
            words for position, words in enumerate(doc_words) if position in members
        ]

    # Documents are already token lists, so tokenizer and preprocessor are identities.
    cv = CountVectorizer(tokenizer=lambda w: w, preprocessor=lambda w: w)
    grouped_docs = list(chain.from_iterable(clusters_keywords.values()))
    X = cv.fit_transform(grouped_docs)
    all_keywords = np.array(cv.get_feature_names_out())

    selected_clusters_keywords = {}
    all_scores = []
    all_p_values = []
    start = 0
    for label, cls_kwds in clusters_keywords.items():
        # Rows of X are ordered cluster by cluster, so each cluster is a contiguous slice.
        n_docs = len(cls_kwds)
        rows = slice(start, start + n_docs)

        # find words that should be specific for a group with hypergeom test
        p_values = hypergeom_p_values(X, X[rows])
        significant = set(all_keywords[np.array(p_values) < fdr_threshold])

        # select only words with p-values less than threshold
        counts = Counter(
            word for word in chain.from_iterable(cls_kwds) if word in significant
        )
        selected_clusters_keywords[label] = [
            (word, count / n_docs) for word, count in counts.most_common(n_words)
        ]

        all_scores.append(X[rows].sum(axis=0) / n_docs)
        all_p_values.append(p_values)
        start += n_docs

    all_scores = np.vstack(all_scores)
    all_p_values = np.vstack(all_p_values)
    return selected_clusters_keywords, all_keywords, all_scores, all_p_values
''' Text Feature Extraction '''
# Using CountVectorizer: learn a vocabulary from a few sentences and print
# the resulting bag-of-words matrix.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text = [
    "In Jigsaw's fourth Kaggle competition",
    "When we ask human judges to look at",
    "to decide which ones are toxic and",
]

# Learn the vocabulary from the sample sentences.
counterVec = CountVectorizer()
counterVec.fit(text)

feature_names = counterVec.get_feature_names_out()
print("Get Feature Names")
print(feature_names)
print("The number of feature is {}".format(len(feature_names)))

# One row per sentence, one column per vocabulary term.
extracted_features = counterVec.transform(text)
dense_features = extracted_features.toarray()
print(dense_features.shape)
print(dense_features)
doc = doc.lower() doc_cleaned = ' '.join( lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in all_names) data_cleaned.append(doc_cleaned) from sklearn.feature_extraction.text import CountVectorizer count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2) data = count_vector.fit_transform(data_cleaned) from sklearn.decomposition import LatentDirichletAllocation t = 20 lda = LatentDirichletAllocation(n_components=t, learning_method='batch', random_state=42) lda.fit(data) print(lda.components_) terms = count_vector.get_feature_names_out() for topic_idx, topic in enumerate(lda.components_): print("Topic {}:".format(topic_idx)) print(" ".join([terms[i] for i in topic.argsort()[-10:]]))
def _ngrams_split(self, text):
    """Return the n-grams of ``text`` as a list.

    A ``CountVectorizer`` configured with ``self.ngram_range`` is fitted on the single
    document ``text``; the result is its feature names.
    """
    ngram_vectorizer = CountVectorizer(ngram_range=self.ngram_range)
    # Only the learned vocabulary is needed, not the count matrix.
    ngram_vectorizer.fit([text])
    return list(ngram_vectorizer.get_feature_names_out())
pickle.dump((result, names), fd) pass os.makedirs(sys.argv[2], exist_ok=True) # Generate train feature matrix df_train = get_df(train_input) train_words = np.array(df_train.text.str.lower().values.astype("U")) bag_of_words = CountVectorizer( stop_words="english", max_features=max_features, ngram_range=(1, ngrams) ) bag_of_words.fit(train_words) train_words_binary_matrix = bag_of_words.transform(train_words) feature_names = bag_of_words.get_feature_names_out() tfidf = TfidfTransformer(smooth_idf=False) tfidf.fit(train_words_binary_matrix) train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix) save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output) # Generate test feature matrix df_test = get_df(test_input) test_words = np.array(df_test.text.str.lower().values.astype("U")) test_words_binary_matrix = bag_of_words.transform(test_words) test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix) save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)
(n_samples, n_features)) t0 = time() nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." % (time() - t0)) tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() plot_top_words( nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model (generalized Kullback-Leibler divergence)') print( '\n' * 2, "Fitting LDA models with tf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, learning_method='online', learning_offset=50., random_state=0) t0 = time() lda.fit(tf) print("done in %0.3fs." % (time() - t0)) tf_feature_names = tf_vectorizer.get_feature_names_out() plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')
1, len(abstracts), value=3) st.markdown("### NMF clusters") H1, W1 = run_model(vectors, "nmf", n_clusters) r = show_topics(H1, num_top_words) r st.markdown("Membership:") for i in range(n_clusters): m = get_members_for_cluster(W1, i) st.write(f"Cluster {i}: {m}") add_abstract_viewer(abstracts, 2) st.markdown("### TF-IDF Clusters") H1, W1 = run_model(vectors, "tfidf", n_clusters) r = show_topics(H1, num_top_words) r st.markdown("Membership:") for i in range(n_clusters): m = get_members_for_cluster(W1, i) st.write(f"Cluster {i}: {m}") add_abstract_viewer(abstracts, 3) abstracts = get_data() vectorizer = CountVectorizer(stop_words='english') vectors = vectorizer.fit_transform(abstracts).todense() vocab = np.array(vectorizer.get_feature_names_out()) main(abstracts=abstracts, vectors=vectors)
class GapEncoderColumn(BaseEstimator, TransformerMixin):
    """See GapEncoder's docstring."""

    def __init__(self, n_components=10, batch_size=128, gamma_shape_prior=1.1,
                 gamma_scale_prior=1.0, rho=.95, rescale_rho=False,
                 hashing=False, hashing_n_features=2**12, init='k-means++',
                 tol=1e-4, min_iter=2, max_iter=5, ngram_range=(2, 4),
                 analyzer='char', add_words=False, random_state=None,
                 rescale_W=True, max_iter_e_step=20):
        self.ngram_range = ngram_range
        self.n_components = n_components
        self.gamma_shape_prior = gamma_shape_prior  # 'a' parameter
        self.gamma_scale_prior = gamma_scale_prior  # 'b' parameter
        self.rho = rho
        self.rho_ = self.rho
        self.rescale_rho = rescale_rho
        self.batch_size = batch_size
        self.tol = tol
        self.hashing = hashing
        self.hashing_n_features = hashing_n_features
        self.max_iter = max_iter
        self.min_iter = min_iter
        self.init = init
        self.analyzer = analyzer
        self.add_words = add_words
        self.random_state = check_random_state(random_state)
        self.rescale_W = rescale_W
        self.max_iter_e_step = max_iter_e_step

    def _init_vars(self, X):
        """
        Build the bag-of-n-grams representation V of X and initialize
        the topics W.

        Resets ``H_dict_`` and (re)creates the vectorizers, ``W_``, ``A_`` and
        ``B_``. Returns the unique strings of X, their counts matrix and the
        lookup that maps each element of X to its unique string.
        """
        # Init n-grams counts vectorizer
        if self.hashing:
            self.ngrams_count_ = HashingVectorizer(
                analyzer=self.analyzer, ngram_range=self.ngram_range,
                n_features=self.hashing_n_features,
                norm=None, alternate_sign=False)
            if self.add_words:  # Init a word counts vectorizer if needed
                self.word_count_ = HashingVectorizer(
                    analyzer='word',
                    n_features=self.hashing_n_features,
                    norm=None, alternate_sign=False)
        else:
            self.ngrams_count_ = CountVectorizer(
                analyzer=self.analyzer, ngram_range=self.ngram_range,
                dtype=np.float64)
            if self.add_words:
                self.word_count_ = CountVectorizer(dtype=np.float64)

        # Init H_dict_ with empty dict to train from scratch
        self.H_dict_ = dict()
        # Build the n-grams counts matrix unq_V on unique elements of X
        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count_.fit_transform(unq_X)
        if self.add_words:  # Add word counts to unq_V
            unq_V2 = self.word_count_.fit_transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        if not self.hashing:  # Build n-grams/word vocabulary
            if LooseVersion(sklearn_version) < LooseVersion('1.0'):
                self.vocabulary = self.ngrams_count_.get_feature_names()
            else:
                self.vocabulary = self.ngrams_count_.get_feature_names_out()
            if self.add_words:
                if LooseVersion(sklearn_version) < LooseVersion('1.0'):
                    self.vocabulary = np.concatenate(
                        (self.vocabulary,
                         self.word_count_.get_feature_names()))
                else:
                    self.vocabulary = np.concatenate(
                        (self.vocabulary,
                         self.word_count_.get_feature_names_out()))
        _, self.n_vocab = unq_V.shape
        # Init the topics W given the n-grams counts V
        self.W_, self.A_, self.B_ = self._init_w(unq_V[lookup], X)
        # Init the activations unq_H of each unique input string
        unq_H = _rescale_h(unq_V, np.ones((len(unq_X), self.n_components)))
        # Update self.H_dict_ with unique input strings and their activations
        self.H_dict_.update(zip(unq_X, unq_H))
        if self.rescale_rho:
            # Make update rate per iteration independent of the batch_size
            self.rho_ = self.rho**(self.batch_size / len(X))
        return unq_X, unq_V, lookup

    def _get_H(self, X):
        """
        Return the activations stored in H_dict_ for each string of X,
        as an array of shape (len(X), n_components).
        """
        H_out = np.empty((len(X), self.n_components))
        for x, h_out in zip(X, H_out):
            h_out[:] = self.H_dict_[x]
        return H_out

    def _init_w(self, V, X):
        """
        Initialize the topics W.
        If self.init='k-means++', we use the init method of
        sklearn.cluster.KMeans.
        If self.init='random', topics are initialized with a Gamma
        distribution.
        If self.init='k-means', topics are initialized with a KMeans on the
        n-grams counts.

        Returns the row-normalised W and the accumulators A and B.
        Raises AttributeError for any other value of self.init.
        """
        if self.init == 'k-means++':
            if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                W = _k_init(
                    V, self.n_components,
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None) + .1
            else:
                W, _ = kmeans_plusplus(
                    V, self.n_components,
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None)
                W = W + .1  # To avoid restricting topics to few n-grams only
        elif self.init == 'random':
            W = self.random_state.gamma(
                shape=self.gamma_shape_prior, scale=self.gamma_scale_prior,
                size=(self.n_components, self.n_vocab))
        elif self.init == 'k-means':
            prototypes = get_kmeans_prototypes(
                X, self.n_components, analyzer=self.analyzer,
                random_state=self.random_state)
            W = self.ngrams_count_.transform(prototypes).A + .1
            if self.add_words:
                W2 = self.word_count_.transform(prototypes).A + .1
                W = np.hstack((W, W2))
            # if k-means doesn't find the exact number of prototypes
            if W.shape[0] < self.n_components:
                if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                    W2 = _k_init(
                        V, self.n_components - W.shape[0],
                        x_squared_norms=row_norms(V, squared=True),
                        random_state=self.random_state,
                        n_local_trials=None) + .1
                else:
                    W2, _ = kmeans_plusplus(
                        V, self.n_components - W.shape[0],
                        x_squared_norms=row_norms(V, squared=True),
                        random_state=self.random_state,
                        n_local_trials=None)
                    W2 = W2 + .1
                W = np.concatenate((W, W2), axis=0)
        else:
            raise AttributeError(
                'Initialization method %s does not exist.' % self.init)
        W /= W.sum(axis=1, keepdims=True)
        A = np.ones((self.n_components, self.n_vocab)) * 1e-10
        B = A.copy()
        return W, A, B

    def fit(self, X, y=None):
        """
        Fit the GapEncoder on batches of X.

        Parameters
        ----------
        X : array-like, shape (n_samples, )
            The string data to fit the model on.

        Returns
        -------
        self
        """
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        # Make n-grams counts matrix unq_V
        unq_X, unq_V, lookup = self._init_vars(X)
        n_batch = (len(X) - 1) // self.batch_size + 1
        del X
        # Get activations unq_H
        unq_H = self._get_H(unq_X)

        for n_iter_ in range(self.max_iter):
            # Loop over batches
            for i, (unq_idx, idx) in enumerate(batch_lookup(
                    lookup, n=self.batch_size)):
                if i == n_batch - 1:
                    W_last = self.W_.copy()
                # Update the activations unq_H
                unq_H[unq_idx] = _multiplicative_update_h(
                    unq_V[unq_idx], self.W_, unq_H[unq_idx],
                    epsilon=1e-3, max_iter=self.max_iter_e_step,
                    rescale_W=self.rescale_W,
                    gamma_shape_prior=self.gamma_shape_prior,
                    gamma_scale_prior=self.gamma_scale_prior)
                # Update the topics self.W_
                _multiplicative_update_w(
                    unq_V[idx], self.W_, self.A_, self.B_, unq_H[idx],
                    self.rescale_W, self.rho_)

                if i == n_batch - 1:
                    # Compute the norm of the update of W in the last batch
                    W_change = np.linalg.norm(
                        self.W_ - W_last) / np.linalg.norm(W_last)

            if (W_change < self.tol) and (n_iter_ >= self.min_iter - 1):
                break  # Stop if the change in W is smaller than the tolerance

        # Update self.H_dict_ with the learned encoded vectors (activations)
        self.H_dict_.update(zip(unq_X, unq_H))
        return self

    def get_feature_names(self, n_labels=3, prefix=''):
        """ Deprecated, use "get_feature_names_out"
        """
        warnings.warn(
            "get_feature_names is deprecated in scikit-learn > 1.0. "
            "use get_feature_names_out instead",
            DeprecationWarning,
        )
        return self.get_feature_names_out(n_labels=n_labels, prefix=prefix)

    def get_feature_names_out(self, n_labels=3, prefix=''):
        """
        Returns the labels that best summarize the learned components/topics.
        For each topic, labels with highest activations are selected.

        Parameters
        ----------
        n_labels : int, default=3
            The number of labels used to describe each topic.
        prefix : str, default=''
            Text prepended to every returned topic label.

        Returns
        -------
        topic_labels : list of strings
            The labels that best describe each topic.
        """
        vectorizer = CountVectorizer()
        vectorizer.fit(list(self.H_dict_.keys()))
        if LooseVersion(sklearn_version) < LooseVersion('1.0'):
            vocabulary = np.array(vectorizer.get_feature_names())
        else:
            vocabulary = np.array(vectorizer.get_feature_names_out())
        encoding = self.transform(np.array(vocabulary).reshape(-1))
        encoding = abs(encoding)
        encoding = encoding / np.sum(encoding, axis=1, keepdims=True)
        n_components = encoding.shape[1]
        topic_labels = []
        for i in range(n_components):
            x = encoding[:, i]
            labels = vocabulary[np.argsort(-x)[:n_labels]]
            topic_labels.append(labels)
        topic_labels = [prefix + ', '.join(label) for label in topic_labels]
        return topic_labels

    def score(self, X):
        """
        Returns the Kullback-Leibler divergence between the n-grams counts
        matrix V of X, and its non-negative factorization HW.

        Parameters
        ----------
        X : array-like (str), shape (n_samples, )
            The data to encode.

        Returns
        -------
        kl_divergence : float.
            The Kullback-Leibler divergence.
        """
        # Build n-grams/word counts matrix
        unq_X, lookup = np.unique(X, return_inverse=True)
        unq_V = self.ngrams_count_.transform(unq_X)
        if self.add_words:
            unq_V2 = self.word_count_.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        # Given the learnt topics W, optimize the activations H to fit V = HW
        for batch in gen_batches(n=unq_H.shape[0],
                                 batch_size=self.batch_size):
            unq_H[batch] = _multiplicative_update_h(
                unq_V[batch], self.W_, unq_H[batch],
                epsilon=1e-3, max_iter=self.max_iter_e_step,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        # Compute the KL divergence between V and HW
        kl_divergence = _beta_divergence(
            unq_V[lookup], unq_H[lookup], self.W_,
            'kullback-leibler', square_root=False)
        return kl_divergence

    def partial_fit(self, X, y=None):
        """
        Partial fit of the GapEncoder on X.
        To be used in a online learning procedure where batches of data are
        coming one by one.

        Parameters
        ----------
        X : array-like, shape (n_samples, )
            The string data to fit the model on.

        Returns
        -------
        self
        """
        # Init H_dict_ with empty dict if it's the first call of partial_fit
        if not hasattr(self, 'H_dict_'):
            self.H_dict_ = dict()
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        # Check if it is not the first batch.
        # W_ is set by _init_vars in every mode, whereas `vocabulary` only
        # exists when hashing is off; testing for `vocabulary` made the
        # hashing encoder restart from scratch on every call.
        if hasattr(self, 'W_'):  # Update unq_X, unq_V with new batch
            unq_X, lookup = np.unique(X, return_inverse=True)
            unq_V = self.ngrams_count_.transform(unq_X)
            if self.add_words:
                unq_V2 = self.word_count_.transform(unq_X)
                unq_V = sparse.hstack((unq_V, unq_V2), format='csr')

            unseen_X = np.setdiff1d(unq_X, np.array([*self.H_dict_]))
            unseen_V = self.ngrams_count_.transform(unseen_X)
            if self.add_words:
                unseen_V2 = self.word_count_.transform(unseen_X)
                unseen_V = sparse.hstack((unseen_V, unseen_V2), format='csr')

            if unseen_V.shape[0] != 0:
                unseen_H = _rescale_h(
                    unseen_V, np.ones((len(unseen_X), self.n_components)))
                for x, h in zip(unseen_X, unseen_H):
                    self.H_dict_[x] = h
                del unseen_H
            del unseen_X, unseen_V
        else:  # If it is the first batch, call _init_vars to init unq_X, unq_V
            unq_X, unq_V, lookup = self._init_vars(X)

        unq_H = self._get_H(unq_X)
        # Update the activations unq_H
        unq_H = _multiplicative_update_h(
            unq_V, self.W_, unq_H,
            epsilon=1e-3, max_iter=self.max_iter_e_step,
            rescale_W=self.rescale_W,
            gamma_shape_prior=self.gamma_shape_prior,
            gamma_scale_prior=self.gamma_scale_prior)
        # Update the topics self.W_
        _multiplicative_update_w(
            unq_V[lookup], self.W_, self.A_, self.B_,
            unq_H[lookup], self.rescale_W, self.rho_)
        # Update self.H_dict_ with the learned encoded vectors (activations)
        self.H_dict_.update(zip(unq_X, unq_H))
        return self

    def _add_unseen_keys_to_H_dict(self, X):
        """
        Add activations of unseen string categories from X to H_dict.
        """
        unseen_X = np.setdiff1d(X, np.array([*self.H_dict_]))
        if unseen_X.size > 0:
            unseen_V = self.ngrams_count_.transform(unseen_X)
            if self.add_words:
                unseen_V2 = self.word_count_.transform(unseen_X)
                unseen_V = sparse.hstack((unseen_V, unseen_V2), format='csr')

            unseen_H = _rescale_h(
                unseen_V, np.ones((unseen_V.shape[0], self.n_components)))
            self.H_dict_.update(zip(unseen_X, unseen_H))

    def transform(self, X):
        """
        Return the encoded vectors (activations) H of input strings in X.
        Given the learnt topics W, the activations H are tuned to fit V = HW.

        Parameters
        ----------
        X : array-like, shape (n_samples)
            The string data to encode.

        Returns
        -------
        H : 2-d array, shape (n_samples, n_topics)
            Transformed input.
        """
        # Check if first item has str or np.str_ type
        assert isinstance(X[0], str), "ERROR: Input data is not string."
        unq_X = np.unique(X)
        # Build the n-grams counts matrix V for the string data to encode
        unq_V = self.ngrams_count_.transform(unq_X)
        if self.add_words:  # Add words counts
            unq_V2 = self.word_count_.transform(unq_X)
            unq_V = sparse.hstack((unq_V, unq_V2), format='csr')
        # Add unseen strings in X to H_dict
        self._add_unseen_keys_to_H_dict(unq_X)
        unq_H = self._get_H(unq_X)
        # Loop over batches
        for batch in gen_batches(n=unq_H.shape[0],
                                 batch_size=self.batch_size):
            # Given the learnt topics W, optimize H to fit V = HW
            unq_H[batch] = _multiplicative_update_h(
                unq_V[batch], self.W_, unq_H[batch],
                epsilon=1e-3, max_iter=100,
                rescale_W=self.rescale_W,
                gamma_shape_prior=self.gamma_shape_prior,
                gamma_scale_prior=self.gamma_scale_prior)
        # Store and return the encoded vectors of X
        self.H_dict_.update(zip(unq_X, unq_H))
        return self._get_H(X)
'ke tla cheka harddrive at home pc crashed phone dead so you just hoping for miracles brother', 'did i crash at pm yesterday and woke up just now', 'its important to uphold the most righteous action not just something akin to islam or that doesnt contra' 'minago earthquake has hit canas barrio ponce puerto rico mi am ast rspr', 'earthquakes in puerto rico in the last days the earth keeps shaking families are sleeping on the st', 'it isnt racism is a sad fact of life your difference will be exploited by ignora', 'i learnt my trade here bl back then i can see at least of my old offices workshops in this photo sad to see it g', 'hachisan leon and claire they fight bioterrorism and have busy days thats why they value the time they spend' , 'usmexico border portal for tb swine flu bioterrorism via', 'this comment section is f*****g cancer this laila person is probably wrong vaushs point was misunderstoo', ] # criando um countvectorizer e uma matriz esparsa representando as sentenças cv = CountVectorizer(stop_words='english') array = cv.fit_transform(corpus).toarray() pd.DataFrame(array, columns=cv.get_feature_names_out()) # função para plotar o dataframe com os os valores de similaridade, para melhor visualização def create_dataframe(matrix, tokens): doc_names = [f'{tokens[i]}' for i, _ in enumerate(matrix)] df = pd.DataFrame(data=matrix, index=doc_names, columns=tokens) return df # calculando a similaridade por coseno entre sentenças do array cosine_similarity_matrix = cosine_similarity(array) results = create_dataframe(cosine_similarity_matrix, corpus) results """**a)** Representação vetorial TF-IDF com similaridade do cosseno ---
# First pass: keep only purely alphabetic tokens of every document.
data_cleaned = []
for doc in groups.data:
    doc_cleaned = ' '.join(word for word in doc.split() if word.isalpha())
    data_cleaned.append(doc_cleaned)

# The public `sklearn.feature_extraction.stop_words` module no longer exists in the
# scikit-learn releases that provide `get_feature_names_out`; the stop-word set is
# exposed from `sklearn.feature_extraction.text`.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

from nltk.corpus import names
all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Second pass (replaces the first): also drop person names and lemmatize.
data_cleaned = []
for doc in groups.data:
    # The names corpus is capitalised, so test the token in its original case and
    # lower-case afterwards; lower-casing first made the name filter match nothing.
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

data_cleaned_count = count_vector_sw.fit_transform(data_cleaned)
print(count_vector_sw.get_feature_names_out())

# In[ ]:
## print(Counter(toks))

#### Word Stemming
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in toks]
print(stemmed)
print(len(toks))
print(len(stemmed))

#### Count Vectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw)
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
X = X.toarray()

# Register and print every vocabulary term once. A single pass is enough: the
# former outer loop over the columns of X re-walked the whole vocabulary for
# each column without adding or printing anything after the first pass.
ingr_dict = {}
for ingredient in feature_names:
    if ingredient not in ingr_dict:
        ingr_dict[ingredient] = 1
        print(ingredient)

# for i, ingredient in enumerate(vectorizer.get_feature_names_out()):
#     print(ingredient)
#     print(len(X.toarray()[i]))