Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
from hdbscan import HDBSCAN, membership_vector


def test_hdbscan_membership_vector():
    # X is the shared 2-D dataset defined at module level in the test suite.
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vector = membership_vector(clusterer, np.array([[-1.5, -1.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.05705305, 0.05974177, 0.12228153]]))
    vector = membership_vector(clusterer, np.array([[1.5, -1.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.09462176, 0.32061556, 0.10112905]]))
    vector = membership_vector(clusterer, np.array([[0.0, 0.0]]))
    assert_array_almost_equal(vector,
                              np.array([[0.03545607, 0.03363318, 0.04643177]]))
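The test depends on the model being fit with prediction_data=True; without it, hdbscan's prediction helpers raise an error. Below is a minimal, self-contained sketch of the same API, using illustrative blob data rather than the test suite's shared X:

import numpy as np
from sklearn.datasets import make_blobs
from hdbscan import HDBSCAN, membership_vector

# Illustrative data, not the fixture used in the test above.
data, _ = make_blobs(n_samples=200, centers=3, random_state=42)
clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(data)

# One row per query point, one column per cluster; each row holds soft
# membership strengths rather than a single hard label.
soft = membership_vector(clusterer, np.array([[0.0, 0.0]]))
print(soft.shape)  # (1, n_clusters)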
Example #2
    def transform(
            self, documents: Union[str,
                                   List[str]]) -> Tuple[List[int], np.ndarray]:
        """ After having fit a model, use transform to predict new instances

        Arguments:
            documents: A single document or a list of documents to predict topics for

        Returns:
            predictions: Topic predictions for each document
            probabilities: The topic probability distribution
        """
        if isinstance(documents, str):
            documents = [documents]

        # Embed the documents, then reduce them with the fitted UMAP model.
        embeddings = self._extract_embeddings(documents)
        umap_embeddings = self.umap_model.transform(embeddings)

        # Soft topic distribution per document...
        probabilities = hdbscan.membership_vector(self.cluster_model,
                                                  umap_embeddings)
        # ...and a hard topic label per document.
        predictions, _ = hdbscan.approximate_predict(self.cluster_model,
                                                     umap_embeddings)

        if self.mapped_topics:
            predictions = self._map_predictions(predictions)
            probabilities = self._map_probabilities(probabilities)

        if len(documents) == 1:
            probabilities = probabilities.flatten()

        return predictions, probabilities
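A sketch of how the method above is typically called; topic_model is a hypothetical, already-fitted instance:

# Hypothetical usage; assumes topic_model has already been fit on a corpus.
predictions, probabilities = topic_model.transform("an unseen document")

# With a single input document, probabilities is flattened to a 1-D
# distribution over topics (see the len(documents) == 1 branch above).
print(predictions[0], probabilities.shape)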
Example #3
    def classify_tracks(self):
        # hd is the hdbscan module and np is numpy, imported at module level;
        # self.clusterer was fit with prediction_data=True.
        finder = CategoryFinder()
        finder.parse_categories("Classical", 5)
        sum_pct = 0
        for _id in finder.artists:
            artist = finder.artists[_id]
            recs = []
            for recording in artist['recordings']:
                recs.append(self.data.get_feature_vector(recording)['mfcc'])
            # test_labels, strengths = hd.approximate_predict(self.clusterer, recs)
            memb_vec = hd.membership_vector(self.clusterer, np.array(recs))
            # Harden the soft memberships: most probable cluster per recording.
            classified = [np.argmax(v) for v in memb_vec]
            # Clusters 0 and 1 are treated as the target ("Classical") clusters.
            matched = [c for c in classified if c == 0 or c == 1]
            sum_pct += float(len(matched)) / len(classified)
            print(artist['name'], float(len(matched)) / len(classified))
        print(sum_pct / len(finder.artists))
Example #4
    def classify_artists(self):
        finder = CategoryFinder()
        finder.parse_categories("Classical", 5)
        aggrs = []
        names = []
        for _id in finder.artists:
            artist = self.data.get_doc(_id)
            artist["aggregates"] = self.data.aggregate_features(
                artist["recordings"])
            aggrs.append(artist["aggregates"]["median"])
            names.append(artist["name"])
        # test_labels, strengths = hd.approximate_predict(self.clusterer, aggrs)
        memb_vec = hd.membership_vector(self.clusterer, np.array(aggrs))
        # Most probable cluster per artist; clusters 0 and 1 count as matches.
        classified = [np.argmax(v) for v in memb_vec]
        matched = [1 for a in classified if a == 0 or a == 1]
        for i, name in enumerate(names):
            # print(name, test_labels[i], strengths[i])
            print(name, classified[i])
        print(len(matched) / float(len(classified)))
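Examples 3 and 4 harden the soft memberships the same way: np.argmax over each row of the membership matrix. Unlike hd.approximate_predict, which can return -1 for points it treats as noise, the argmax always assigns some cluster. A standalone sketch of that step, with a made-up membership matrix:

import numpy as np

# Any (n_points, n_clusters) output of hdbscan.membership_vector.
memb_vec = np.array([[0.10, 0.70, 0.20],
                     [0.50, 0.30, 0.20]])
hard_labels = memb_vec.argmax(axis=1)  # array([1, 0])
strengths = memb_vec.max(axis=1)       # confidence in the chosen cluster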
Example #5
import hdbscan
import pandas as pd


def hdbscan_predict(embedding, df_scaled, clusterer, force_predict=True):
    if force_predict:
        # Soft memberships for every row; the argmax assigns each row to its
        # most probable cluster, so no row is labelled as noise (-1).
        mem_vec = pd.DataFrame(hdbscan.membership_vector(clusterer, embedding.values))
        test_labels = mem_vec.idxmax(axis=1).to_numpy()
        strengths = mem_vec.max(axis=1).to_numpy()
    else:
        # approximate_predict may label points it considers noise as -1.
        test_labels, strengths = hdbscan.approximate_predict(clusterer, embedding)

    # Get probabilities
    scores = pd.DataFrame(strengths)
    scores.columns = ['score']

    # Get clusters
    labels = pd.DataFrame(test_labels)
    labels.columns = ['cluster']

    # Join the labels, scores, embedding coordinates, and scaled features
    scores = scores.join(labels).join(embedding).join(df_scaled)

    return scores
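A hypothetical call to the function above. It assumes umap_df is a DataFrame of reduced coordinates for the new rows (the function uses both .values and .join on it), df_scaled holds the matching scaled features, and clusterer is an HDBSCAN model fit with prediction_data=True:

# force_predict=True assigns every row via the membership-vector argmax, so
# no row comes back labelled -1; False falls back to approximate_predict.
scored = hdbscan_predict(umap_df, df_scaled, clusterer, force_predict=True)
print(scored[['cluster', 'score']].head())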
Example #6
    def transform(self,
                  documents: Union[str, List[str]],
                  embeddings: np.ndarray = None) -> Tuple[List[int], np.ndarray]:
        """ After having fit a model, use transform to predict new instances

        Arguments:
            documents: A single document or a list of documents to predict topics for
            embeddings: Pre-trained document embeddings. These can be used
                        instead of the sentence-transformer model.

        Returns:
            predictions: Topic predictions for each document
            probabilities: The topic probability distribution

        Usage:

        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups

        docs = fetch_20newsgroups(subset='all')['data']
        model = BERTopic("distilbert-base-nli-mean-tokens", verbose=True).fit(docs)
        predictions, probabilities = model.transform(docs)
        ```

        If you want to use your own embeddings:

        ```python
        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer

        # Create embeddings
        docs = fetch_20newsgroups(subset='all')['data']
        sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
        embeddings = sentence_model.encode(docs, show_progress_bar=True)

        # Create topic model
        model = BERTopic(None, verbose=True).fit(docs, embeddings)
        predictions, probabilities = model.transform(docs, embeddings)
        ```
        """
        if isinstance(documents, str):
            documents = [documents]

        # Validate any user-provided embeddings (a no-op when they are None),
        # then compute embeddings only if none were supplied.
        check_embeddings_shape(embeddings, documents)
        if not isinstance(embeddings, np.ndarray):
            embeddings = self._extract_embeddings(documents)

        umap_embeddings = self.umap_model.transform(embeddings)
        # Soft topic distribution per document, plus a hard topic label.
        probabilities = hdbscan.membership_vector(self.cluster_model, umap_embeddings)
        predictions, _ = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

        if self.mapped_topics:
            predictions = self._map_predictions(predictions)
            probabilities = self._map_probabilities(probabilities)

        if len(documents) == 1:
            probabilities = probabilities.flatten()

        return predictions, probabilities
Example #7
    def membership_vector(self, X):
        # Thin wrapper: self.hdbscan is the fitted clusterer; it must have
        # been built with prediction_data=True.
        return hdbscan.membership_vector(self.hdbscan, X)
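Using the wrapper is then a single call; a hypothetical example where model is an instance of the enclosing class:

# Hypothetical: model.hdbscan must be a fitted hdbscan.HDBSCAN built with
# prediction_data=True, which membership_vector requires.
soft_memberships = model.membership_vector(new_points)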