Example #1
def test_extract_topics_custom_cv():
    """ Test Topic Extraction with custom Countvectorizer

    Test whether topics can be extracted using c-TF-IDF.
    The checks concern the existence of a topic representation,
    not whether it makes sense semantically.
    """
    nr_topics = 5
    documents = pd.DataFrame({"Document": newsgroup_docs,
                              "ID": range(len(newsgroup_docs)),
                              "Topic": np.random.randint(-1, nr_topics-1, len(newsgroup_docs))})

    cv = CountVectorizer(ngram_range=(1, 2))
    model = BERTopic(vectorizer_model=cv)
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    model._update_topic_size(documents)
    model._extract_topics(documents)
    freq = model.get_topic_freq()

    assert model.c_tf_idf.shape[0] == 5
    assert model.c_tf_idf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(documents)
    assert len(freq.Topic.unique()) == len(freq)
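For context, a minimal sketch of the class-based TF-IDF (c-TF-IDF) weighting this test exercises: documents are pooled into one "meta-document" per topic, and term frequencies are scaled by how rare each term is across topics. The helper below is an illustrative assumption, not BERTopic's actual implementation.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf_sketch(docs_per_topic):
    """Illustrative c-TF-IDF over one pooled document per topic.

    W[c, t] = tf[c, t] * log(1 + A / tf[t]), where tf[c, t] is the count
    of term t in class c, tf[t] its total count across classes, and A the
    average number of words per class (the BERTopic-style weighting).
    """
    cv = CountVectorizer(ngram_range=(1, 2))
    tf = cv.fit_transform(docs_per_topic).toarray()  # (n_topics, n_terms)
    avg_words = tf.sum() / tf.shape[0]               # A: mean words per class
    idf = np.log(1 + avg_words / tf.sum(axis=0))     # boost terms rare across topics
    return tf * idf, cv.get_feature_names_out()

weights, terms = c_tf_idf_sketch([
    "space nasa orbit launch orbit",
    "hockey nhl game team game",
    "windows file disk dos file",
])
print(terms[weights.argmax(axis=1)])  # strongest term per topic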
Example #2
def test_topic_reduction(reduced_topics):
    """ Test Topic Reduction

    Test the reduction of topics after having generated
    topics. This generation of the initial topics is done
    manually as the training takes quite a while.
    """
    nr_topics = reduced_topics + 2
    model = BERTopic(nr_topics=reduced_topics)
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": np.random.randint(-1, nr_topics-1, len(newsgroup_docs))})
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents.copy())
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents.copy())
    new_freq = model.get_topic_freq()

    assert old_freq.Count.sum() == new_freq.Count.sum()
    assert len(old_freq.Topic.unique()) == len(old_freq)
    assert len(new_freq.Topic.unique()) == len(new_freq)
    assert isinstance(model.mapped_topics, dict)
    assert not set(model.get_topic_freq().Topic).difference(set(new_documents.Topic))
    assert model.mapped_topics
Example #3
def test_topic_reduction(reduced_topics):
    """ Test Topic Reduction

    Test the reduction of topics after having generated
    topics. This generation of the initial topics is done
    manually as the training takes quite a while.
    """
    nr_topics = reduced_topics + 2
    model = BERTopic(nr_topics=reduced_topics)
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    topics = np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))
    old_documents = pd.DataFrame({
        "Document": newsgroup_docs,
        "ID": range(len(newsgroup_docs)),
        "Topic": topics
    })
    model.hdbscan_model.labels_ = topics
    model.topic_mapper = TopicMapper(model.hdbscan_model)
    model._update_topic_size(old_documents)
    old_documents = model._sort_mappings_by_frequency(old_documents)
    model._extract_topics(old_documents.copy())
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents.copy())
    new_freq = model.get_topic_freq()

    assert old_freq.Count.sum() == new_freq.Count.sum()
    assert len(old_freq.Topic.unique()) == len(old_freq)
    assert len(new_freq.Topic.unique()) == len(new_freq)
    assert not set(model.get_topic_freq().Topic).difference(
        set(new_documents.Topic))
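As a rough picture of what _reduce_topics is expected to do here: BERTopic's documented strategy is to repeatedly merge the least frequent topic into its most similar topic (by cosine similarity of c-TF-IDF vectors) until the requested count is reached. The sketch below assumes a precomputed topic-term matrix and topic sizes; names and details are illustrative, not the library's code.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def reduce_topics_sketch(topic_term_matrix, topic_sizes, nr_topics):
    """Merge least frequent topics into their nearest neighbour (illustrative)."""
    matrix = np.asarray(topic_term_matrix, dtype=float).copy()
    sizes = dict(enumerate(topic_sizes))
    mapping = {t: t for t in sizes}          # original topic -> merged topic
    while len(sizes) > nr_topics:
        smallest = min(sizes, key=sizes.get)
        candidates = [t for t in sizes if t != smallest]
        sims = cosine_similarity(matrix[[smallest]], matrix[candidates])[0]
        target = candidates[int(np.argmax(sims))]
        matrix[target] += matrix[smallest]   # pool the term counts
        sizes[target] += sizes.pop(smallest)
        mapping = {t: (target if m == smallest else m) for t, m in mapping.items()}
    return mapping

# Topics 0..3 with sizes 40/30/5/25: topic 2 gets merged away
print(reduce_topics_sketch(np.eye(4) + 0.1, [40, 30, 5, 25], nr_topics=3))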
Example #4
def test_extract_topics():
    """ Test Topic Extraction

    Test whether topics can be extracted using c-TF-IDF.
    The checks concern the existence of a topic representation,
    not whether it makes sense semantically.
    """
    nr_topics = 5
    documents = pd.DataFrame({
        "Document": newsgroup_docs,
        "ID": range(len(newsgroup_docs)),
        "Topic": np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))
    })
    model = BERTopic()
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    model._update_topic_size(documents)
    model._extract_topics(documents)
    freq = model.get_topic_freq()

    assert model.c_tf_idf.shape[0] == 5
    assert model.c_tf_idf.shape[1] > 100
    assert isinstance(freq, pd.DataFrame)
    assert nr_topics == len(freq.Topic.unique())
    assert freq.Count.sum() == len(documents)
    assert len(freq.Topic.unique()) == len(freq)
Example #5
def test_topic_reduction_edge_cases():
    """ Test Topic Reduction Large Nr Topics

    Test that topics are not reduced when the requested number
    of topics exceeds the number of topics actually found.
    """
    nr_topics = 5
    topics = np.random.randint(-1, nr_topics - 1, len(newsgroup_docs))
    model = BERTopic()
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    model.nr_topics = 100
    model.hdbscan_model.labels_ = topics
    old_documents = pd.DataFrame({
        "Document": newsgroup_docs,
        "ID": range(len(newsgroup_docs)),
        "Topic": topics
    })
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents)
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents)
    new_freq = model.get_topic_freq()

    assert not set(old_documents.Topic).difference(set(new_documents.Topic))
    pd.testing.assert_frame_equal(old_documents, new_documents)
    pd.testing.assert_frame_equal(old_freq, new_freq)
Example #6
def test_extract_embeddings_compare():
    """ Test SentenceTransformer with BERTopic

    Test if the correct embedding model is loaded in BERTopic and
    whether BERTopic embeddings match the sentence-transformers embeddings.
    """
    docs = ["some document"]
    model = BERTopic(embedding_model="all-MiniLM-L6-v2")
    model.embedding_model = select_backend("all-MiniLM-L6-v2")
    bertopic_embeddings = model._extract_embeddings(docs)

    assert isinstance(bertopic_embeddings, np.ndarray)
    assert bertopic_embeddings.shape == (1, 384)

    sentence_embeddings = embedding_model.encode(docs, show_progress_bar=False)
    assert np.array_equal(bertopic_embeddings, sentence_embeddings)
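`embedding_model` in the final assertion is a module-level fixture from the original test file. A self-contained version of the same comparison, assuming sentence-transformers is installed, might look like:

import numpy as np
from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer("all-MiniLM-L6-v2")
sentence_embeddings = st_model.encode(["some document"], show_progress_bar=False)

# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings
assert isinstance(sentence_embeddings, np.ndarray)
assert sentence_embeddings.shape == (1, 384)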
Example #7
def test_extract_embeddings(base_bertopic):
    """ Test SentenceTransformer

    Check whether the embeddings are correctly generated
    for both a single string and a list of strings, i.e. that the
    output has the correct shape. As a sanity check, the embedding
    values themselves should stay within reasonable bounds.
    """
    base_bertopic.embedding_model = select_backend("all-MiniLM-L6-v2")
    single_embedding = base_bertopic._extract_embeddings("a document")
    multiple_embeddings = base_bertopic._extract_embeddings(["a document", "another document"])

    assert single_embedding.shape[0] == 1
    assert single_embedding.shape[1] == 384
    assert np.min(single_embedding) > -5
    assert np.max(single_embedding) < 5

    assert multiple_embeddings.shape[0] == 2
    assert multiple_embeddings.shape[1] == 384
    assert np.min(multiple_embeddings) > -5
    assert np.max(multiple_embeddings) < 5
Example #8
def test_topic_reduction_edge_cases():
    """ Test Topic Reduction Large Nr Topics

    Test that topics are not reduced when the requested number
    of topics exceeds the number of topics actually found.
    """
    model = BERTopic()
    model.embedding_model = select_backend("distilbert-base-nli-stsb-mean-tokens")
    nr_topics = 5
    model.nr_topics = 100
    old_documents = pd.DataFrame({"Document": newsgroup_docs,
                                  "ID": range(len(newsgroup_docs)),
                                  "Topic": np.random.randint(-1, nr_topics-1, len(newsgroup_docs))})
    model._update_topic_size(old_documents)
    model._extract_topics(old_documents)
    old_freq = model.get_topic_freq()

    new_documents = model._reduce_topics(old_documents)
    new_freq = model.get_topic_freq()

    assert not set(old_documents.Topic).difference(set(new_documents.Topic))
    pd.testing.assert_frame_equal(old_documents, new_documents)
    pd.testing.assert_frame_equal(old_freq, new_freq)
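Both edge-case tests reduce to the same guard: when the requested number of topics is at least the number actually found, reduction must be a no-op. A minimal sketch of that check (the function name is hypothetical):

def should_reduce(nr_found: int, nr_requested: int) -> bool:
    """Return True only when a reduction would actually merge topics (illustrative)."""
    # Asking for 100 topics when only 5 exist leaves everything unchanged,
    # which is exactly what the frame-equality assertions above verify.
    return nr_requested < nr_found

assert should_reduce(nr_found=5, nr_requested=100) is False
assert should_reduce(nr_found=5, nr_requested=2) is True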
Example #9
    def __init__(self, embedding_model, word_embedding_model):
        super().__init__()

        self.embedding_model = select_backend(embedding_model)
        self.word_embedding_model = select_backend(word_embedding_model)
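`select_backend` resolves whatever it receives (a model name string, an instantiated sentence-transformers model, or a custom backend) into a BERTopic backend object, so the subclass above can treat document- and word-level models uniformly. A minimal usage sketch, assuming a standard bertopic install (the import path matches the library's test suite):

from bertopic.backend._utils import select_backend
from sentence_transformers import SentenceTransformer

# A model name and an instantiated model resolve to the same backend type
backend_from_name = select_backend("all-MiniLM-L6-v2")
backend_from_model = select_backend(SentenceTransformer("all-MiniLM-L6-v2"))

print(type(backend_from_name).__name__)  # e.g. SentenceTransformerBackend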
Example #10
    def fit_transform(self, documents, embeddings=None, y=None):
        """Fit the models on a collection of documents, generate topics,
        and return each document's predicted topic and its probabilities."""
        check_documents_type(documents)
        check_embeddings_shape(embeddings, documents)

        documents = pd.DataFrame({
            "Document": documents,
            "ID": range(len(documents)),
            "Topic": None
        })

        # Extract embeddings
        if embeddings is None:
            self.embedding_model = select_backend(self.embedding_model,
                                                  language=self.language)
            embeddings = self._extract_embeddings(documents.Document,
                                                  method="document",
                                                  verbose=self.verbose)
        else:
            if self.embedding_model is not None:
                self.embedding_model = select_backend(self.embedding_model,
                                                      language=self.language)

        # Reduce dimensionality with UMAP
        if self.seed_topic_list is not None and \
                self.embedding_model is not None:
            y, embeddings = self._guided_topic_modeling(embeddings)
        umap_embeddings = self._reduce_dimensionality(embeddings, y)

        with open("berttopic_umapembeddings.npy", "wb") as f:
            np.save(f, umap_embeddings)

        # Cluster UMAP embeddings with HDBSCAN
        documents, probabilities = self._cluster_embeddings(
            umap_embeddings, documents)

        with open("berttopic_clusterobj.npy", "wb") as f:
            np.save(f, self.hdbscan_model)

        documents.to_parquet("berttopic_docs")

        with open("berttopic_probs.npy", "wb") as f:
            np.save(f, probabilities)

        # Sort and Map Topic IDs by their frequency
        if not self.nr_topics:
            documents = self._sort_mappings_by_frequency(documents)

        # Extract topics by calculating c-TF-IDF
        self._extract_topics(documents)

        # Reduce topics
        if self.nr_topics:
            documents = self._reduce_topics(documents)

        self._map_representative_docs(original_topics=True)
        probabilities = self._map_probabilities(probabilities,
                                                original_topics=True)
        predictions = documents.Topic.to_list()

        return predictions, probabilities
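Apart from the added disk dumps of the UMAP embeddings, cluster object, documents, and probabilities, this variant keeps the public fit_transform contract: it returns one topic id per document plus probabilities. Typical usage, assuming a standard BERTopic install and the 20 newsgroups data used throughout these tests:

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all",
                          remove=("headers", "footers", "quotes")).data[:1000]

topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", verbose=True)
topics, probs = topic_model.fit_transform(docs)

# One topic id per document; -1 marks outliers HDBSCAN left unclustered
print(topic_model.get_topic_freq().head())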