def test_extract_topics_custom_cv():
    """ Test Topic Extraction with custom CountVectorizer

    Test whether topics can be extracted using c-TF-IDF. Checks are
    related to the existence of a topic representation, not so much
    whether it makes sense semantically.
    """
    nr_topics = 5
    documents = pd.DataFrame({"Document": newsgroup_docs,
                              "ID": range(len(newsgroup_docs)),
                              "Topic": np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))})
    cv = CountVectorizer(ngram_range=(1, 2))
    model = BERTopic(vectorizer_model=cv)
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    model._update_topic_size(documents)
    model._extract_topics(documents)
    freq = model.get_topic_freq()

    assert model.c_tf_idf.shape[0] == 5
    assert model.c_tf_idf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(documents)
    assert len(freq.Topic.unique()) == len(freq)
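# The tests in this file reference module-level names (newsgroup_docs,
# embedding_model, select_backend, pd, np, BERTopic, CountVectorizer) that
# are defined outside these snippets. A minimal sketch of that setup,
# assuming the 20 newsgroups corpus and the sentence-transformers model
# used elsewhere in these tests; the exact subset, slice, and the
# select_backend import path are assumptions:

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

from bertopic import BERTopic
from bertopic.backend._utils import select_backend

# A modest slice keeps the tests fast while leaving enough vocabulary
# for the c-TF-IDF shape assertions to hold.
newsgroup_docs = fetch_20newsgroups(subset="all",
                                    remove=("headers", "footers", "quotes"))["data"][:1000]
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")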
def test_topic_reduction(reduced_topics):
    """ Test Topic Reduction

    Test the reduction of topics after having generated topics. The
    generation of the initial topics is done manually, as actual training
    takes quite a while.
    """
    nr_topics = reduced_topics + 2
    model = BERTopic(nr_topics=reduced_topics)
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))})
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents.copy())
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents.copy())
    new_freq = model.get_topic_freq()

    assert old_freq.Count.sum() == new_freq.Count.sum()
    assert len(old_freq.Topic.unique()) == len(old_freq)
    assert len(new_freq.Topic.unique()) == len(new_freq)
    assert isinstance(model.mapped_topics, dict)
    assert not set(model.get_topic_freq().Topic).difference(set(new_documents.Topic))
    assert model.mapped_topics
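# `reduced_topics` is supplied by a pytest fixture that is not shown in
# this file. A minimal sketch, assuming it simply parametrizes a handful
# of target topic counts (the exact values are hypothetical):

import pytest

@pytest.fixture(params=[2, 4, 10])
def reduced_topics(request):
    # Each test that takes this fixture runs once per target count.
    return request.param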
def test_topic_reduction(reduced_topics):
    """ Test Topic Reduction

    Test the reduction of topics after having generated topics. The
    generation of the initial topics is done manually, as actual training
    takes quite a while.
    """
    nr_topics = reduced_topics + 2
    model = BERTopic(nr_topics=reduced_topics)
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    topics = np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": topics})
    model.hdbscan_model.labels_ = topics
    model.topic_mapper = TopicMapper(model.hdbscan_model)
    model._update_topic_size(old_documents)
    old_documents = model._sort_mappings_by_frequency(old_documents)
    model._extract_topics(old_documents.copy())
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents.copy())
    new_freq = model.get_topic_freq()

    assert old_freq.Count.sum() == new_freq.Count.sum()
    assert len(old_freq.Topic.unique()) == len(old_freq)
    assert len(new_freq.Topic.unique()) == len(new_freq)
    assert not set(model.get_topic_freq().Topic).difference(set(new_documents.Topic))
def test_extract_topics():
    """ Test Topic Extraction

    Test whether topics can be extracted using c-TF-IDF. Checks are
    related to the existence of a topic representation, not so much
    whether it makes sense semantically.
    """
    nr_topics = 5
    documents = pd.DataFrame({"Document": newsgroup_docs,
                              "ID": range(len(newsgroup_docs)),
                              "Topic": np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))})
    model = BERTopic()
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    model._update_topic_size(documents)
    model._extract_topics(documents)
    freq = model.get_topic_freq()

    assert model.c_tf_idf.shape[0] == 5
    assert model.c_tf_idf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(documents)
    assert len(freq.Topic.unique()) == len(freq)
def test_topic_reduction_edge_cases():
    """ Test Topic Reduction Large Nr Topics

    Test whether the topics are not reduced if the requested number
    of topics exceeds the actual number of topics found.
    """
    nr_topics = 5
    topics = np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))
    model = BERTopic()
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    model.nr_topics = 100
    model.hdbscan_model.labels_ = topics
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": topics})
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents)
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents)
    new_freq = model.get_topic_freq()

    assert not set(old_documents.Topic).difference(set(new_documents.Topic))
    pd.testing.assert_frame_equal(old_documents, new_documents)
    pd.testing.assert_frame_equal(old_freq, new_freq)
def test_extract_embeddings_compare():
    """ Test SentenceTransformer with BERTopic

    Test whether the correct embedding model is loaded in BERTopic and
    whether the BERTopic embeddings match the sentence-transformers embeddings.
    """
    docs = ["some document"]
    model = BERTopic(embedding_model="all-MiniLM-L6-v2")
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    bertopic_embeddings = model._extract_embeddings(docs)

    assert isinstance(bertopic_embeddings, np.ndarray)
    assert bertopic_embeddings.shape == (1, 384)

    sentence_embeddings = embedding_model.encode(docs, show_progress_bar=False)
    assert np.array_equal(bertopic_embeddings, sentence_embeddings)
def test_extract_embeddings(base_bertopic):
    """ Test SentenceTransformer

    Check whether the embeddings are correctly generated for both a single
    string and a list of strings. This means that the correct shape should
    be outputted. As a sanity check, the embedding values themselves should
    not exceed certain bounds.
    """
    base_bertopic.embedding_model = select_backend("all-MiniLM-L6-v2")
    single_embedding = base_bertopic._extract_embeddings("a document")
    multiple_embeddings = base_bertopic._extract_embeddings(["a document", "another document"])

    assert single_embedding.shape[0] == 1
    assert single_embedding.shape[1] == 384
    assert np.min(single_embedding) > -5
    assert np.max(single_embedding) < 5

    assert multiple_embeddings.shape[0] == 2
    assert multiple_embeddings.shape[1] == 384
    assert np.min(multiple_embeddings) > -5
    assert np.max(multiple_embeddings) < 5
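# `base_bertopic` is likewise a fixture defined elsewhere. A minimal
# sketch, assuming it hands each test a fresh, untrained model (the
# constructor arguments are hypothetical):

import pytest

@pytest.fixture
def base_bertopic():
    # A new instance per test avoids state leaking between tests.
    return BERTopic(language="english", verbose=False)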
def test_topic_reduction_edge_cases():
    """ Test Topic Reduction Large Nr Topics

    Test whether the topics are not reduced if the requested number
    of topics exceeds the actual number of topics found.
    """
    model = BERTopic()
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    nr_topics = 5
    model.nr_topics = 100
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))})
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents)
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents)
    new_freq = model.get_topic_freq()

    assert not set(old_documents.Topic).difference(set(new_documents.Topic))
    pd.testing.assert_frame_equal(old_documents, new_documents)
    pd.testing.assert_frame_equal(old_freq, new_freq)
def __init__(self, embedding_model, word_embedding_model):
    super().__init__()
    self.embedding_model = select_backend(embedding_model)
    self.word_embedding_model = select_backend(word_embedding_model)
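# The constructor above has the shape of a combined word/document backend;
# BERTopic ships a similar class (WordDocEmbedder in bertopic.backend).
# If that is the enclosing class, usage would look roughly like the sketch
# below; the class name and the choice of models are assumptions:

from sentence_transformers import SentenceTransformer
from bertopic.backend import WordDocEmbedder

doc_model = SentenceTransformer("all-MiniLM-L6-v2")
word_model = SentenceTransformer("all-MiniLM-L6-v2")

# select_backend() in __init__ wraps both models in BERTopic's backend API.
embedder = WordDocEmbedder(embedding_model=doc_model,
                           word_embedding_model=word_model)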
def fit_transform(self, documents, embeddings=None, y=None):
    check_documents_type(documents)
    check_embeddings_shape(embeddings, documents)

    documents = pd.DataFrame({"Document": documents,
                              "ID": range(len(documents)),
                              "Topic": None})

    # Extract embeddings
    if embeddings is None:
        self.embedding_model = select_backend(self.embedding_model,
                                              language=self.language)
        embeddings = self._extract_embeddings(documents.Document,
                                              method="document",
                                              verbose=self.verbose)
    else:
        if self.embedding_model is not None:
            self.embedding_model = select_backend(self.embedding_model,
                                                  language=self.language)

    # Reduce dimensionality with UMAP
    if self.seed_topic_list is not None and self.embedding_model is not None:
        y, embeddings = self._guided_topic_modeling(embeddings)
    umap_embeddings = self._reduce_dimensionality(embeddings, y)

    # Dump the reduced embeddings to disk for later inspection
    with open("berttopic_umapembeddings.npy", "wb") as f:
        np.save(f, umap_embeddings)

    # Cluster UMAP embeddings with HDBSCAN
    documents, probabilities = self._cluster_embeddings(umap_embeddings, documents)

    # Dump intermediate artifacts; note that np.save pickles the HDBSCAN
    # object here (allow_pickle defaults to True for np.save)
    with open("berttopic_clusterobj.npy", "wb") as f:
        np.save(f, self.hdbscan_model)
    documents.to_parquet("berttopic_docs")
    with open("berttopic_probs.npy", "wb") as f:
        np.save(f, probabilities)

    # Sort and Map Topic IDs by their frequency
    if not self.nr_topics:
        documents = self._sort_mappings_by_frequency(documents)

    # Extract topics by calculating c-TF-IDF
    self._extract_topics(documents)

    # Reduce topics
    if self.nr_topics:
        documents = self._reduce_topics(documents)

    self._map_representative_docs(original_topics=True)
    probabilities = self._map_probabilities(probabilities, original_topics=True)
    predictions = documents.Topic.to_list()

    return predictions, probabilities
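# A usage sketch for the instrumented fit_transform above, assuming a plain
# list of strings as input (as enforced by check_documents_type) and reusing
# the newsgroup_docs defined earlier:

model = BERTopic(embedding_model="all-MiniLM-L6-v2", nr_topics=10)
topics, probs = model.fit_transform(newsgroup_docs)

# The debug dumps (berttopic_*.npy, berttopic_docs) now sit in the working
# directory and can be reloaded, e.g.:
umap_embeddings = np.load("berttopic_umapembeddings.npy")
saved_docs = pd.read_parquet("berttopic_docs")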