Example no. 1
def test_hdbscan_approximate_predict():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    cluster, prob = approximate_predict(clusterer, np.array([[-1.5, -1.0]]))
    assert_equal(cluster, 2)
    cluster, prob = approximate_predict(clusterer, np.array([[1.5, -1.0]]))
    assert_equal(cluster, 1)
    cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]]))
    assert_equal(cluster, -1)
Example no. 2
def test_hdbscan_approximate_predict():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    cluster, prob = approximate_predict(clusterer, np.array([[-1.5, -1.0]]))
    assert cluster == 2
    cluster, prob = approximate_predict(clusterer, np.array([[1.5, -1.0]]))
    assert cluster == 1
    cluster, prob = approximate_predict(clusterer, np.array([[0.0, 0.0]]))
    assert cluster == -1
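A minimal, self-contained sketch of the fit-then-predict pattern exercised by the two tests above. The tests rely on a module-level dataset X from the hdbscan test suite; here synthetic make_blobs data and the blob centers are illustrative assumptions, so the exact label values may differ.

# Sketch only: synthetic data stands in for the test suite's module-level X.
import numpy as np
from sklearn.datasets import make_blobs
from hdbscan import HDBSCAN, approximate_predict

X, _ = make_blobs(n_samples=300,
                  centers=[[-1.5, -1.0], [1.5, -1.0], [0.0, 2.0]],
                  cluster_std=0.2, random_state=0)

# prediction_data=True is required so approximate_predict can be used later.
clusterer = HDBSCAN(prediction_data=True).fit(X)

# approximate_predict returns (labels, probabilities); -1 marks noise.
labels, probs = approximate_predict(clusterer,
                                    np.array([[-1.5, -1.0], [10.0, 10.0]]))
print(labels, probs)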
Example no. 3
    def transform(
            self, documents: Union[str,
                                   List[str]]) -> Tuple[List[int], np.ndarray]:
        """ After having fit a model, use transform to predict new instances

        Arguments:
            documents: A single document or a list of documents to fit on

        Returns:
            predictions: Topic predictions for each document
            probabilities: The topic probability distribution
        """
        if isinstance(documents, str):
            documents = [documents]

        embeddings = self._extract_embeddings(documents)
        umap_embeddings = self.umap_model.transform(embeddings)
        probabilities = hdbscan.membership_vector(self.cluster_model,
                                                  umap_embeddings)
        predictions, _ = hdbscan.approximate_predict(self.cluster_model,
                                                     umap_embeddings)

        if self.mapped_topics:
            predictions = self._map_predictions(predictions)
            probabilities = self._map_probabilities(probabilities)

        if len(documents) == 1:
            probabilities = probabilities.flatten()

        return predictions, probabilities
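The transform method above wraps a two-step pipeline: project new document embeddings with an already-fitted UMAP model, then assign topics with approximate_predict and soft assignments with membership_vector. Below is a self-contained sketch of that pipeline; the synthetic blob vectors stand in for real document embeddings, and umap-learn is assumed to be installed.

# Sketch: UMAP reduction followed by HDBSCAN prediction on new points.
import hdbscan
import umap
from sklearn.datasets import make_blobs

train_embeddings, _ = make_blobs(n_samples=500, n_features=50, centers=4,
                                 random_state=0)
new_embeddings = train_embeddings[:3]   # pretend these are new documents

umap_model = umap.UMAP(n_components=5, random_state=0).fit(train_embeddings)
cluster_model = hdbscan.HDBSCAN(min_cluster_size=15,
                                prediction_data=True).fit(umap_model.embedding_)

reduced = umap_model.transform(new_embeddings)
predictions, _ = hdbscan.approximate_predict(cluster_model, reduced)
probabilities = hdbscan.membership_vector(cluster_model, reduced)
print(predictions, probabilities.shape)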
Example no. 4
 def predictCellmlsCluster(self, cellmls=None):
     documentations = self.__getCellmlsDocumentation(cellmls)
     cellmlUrls, cellmlDocs = zip(*documentations.items())
     tfidf = self.__calcuateTfidf(cellmlDocs)
     test_labels, strengths = hdbscan.approximate_predict(
         self.clusterer, tfidf.A)
     return test_labels
Example no. 5
def ClusterOneMonthData(month, name):
    ## Month should be an integer (0 = Apr_2016, 8= dec_2016, 9 = jan_2017, 13 = May_2017)
    ## Name should be a string, with which we should save the dataframe
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/features/')  #Windows
    One_Month_Data = ReadMonthlyData(
        pattern_list[month])  #Reading one month data using the pattern list
    IL_One_Month_Data, monthly_data_mask = RemoveOutliers(
        One_Month_Data, pca_final, th1, th2)  #Removing outliers from
    ##one month data
    labels, strengths = hdbscan.approximate_predict(
        hdb, IL_One_Month_Data)  # predicting labels and strengths
    ## for one month

    ##Writing the two PCs, labels and strengths into a dataframe and storing it as hdf5 file
    os.chdir('C:/Users/tpaulraj/Projects/Clustering/Results/')
    pca_dataframe = pd.DataFrame(IL_One_Month_Data)
    pca_dataframe['labels'] = labels
    pca_dataframe['strengths'] = strengths
    pca_dataframe.columns = ['pca1', 'pca2', 'labels', 'strengths']
    pca_dataframe.to_hdf(path_or_buf='{}_clusters'.format(name),
                         key='pca_dataframe')
    One_Month_Data.iloc[monthly_data_mask].to_csv(
        path_or_buf='{}_feature_data'.format(name), header=True, index=True)
    del (IL_One_Month_Data, One_Month_Data, labels, monthly_data_mask,
         pca_dataframe, strengths)
Example no. 6
def hdbscan_see(X, test_bi_pca, labels, min_samples):
    result = []
    for mSample in min_samples:
        clusterer = hdbscan.HDBSCAN(min_cluster_size=mSample,
                                    prediction_data=True,
                                    gen_min_span_tree=True).fit(X)

        test_labels, strengths = hdbscan.approximate_predict(
            clusterer, test_bi_pca)
        test_labels[test_labels > -1] = 0
        print(mSample)
        print(data_utils.show_performance(labels, test_labels))
        # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, clusterer.labels_))
        # print("Calinski-Harabaz Index: %0.3f" % metrics.calinski_harabaz_score(X, clusterer.labels_))
        print('--')
        # clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis',
        #                                       edge_alpha=0.6,
        #                                       node_size=80,
        #                                       edge_linewidth=2)
        # clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
        # clusterer.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())
        # result.append(metrics.silhouette_score(X, clusterer.labels_))
    plt.plot(min_samples, result)
    plt.xlabel('min_samples')
    plt.ylabel('Silhouette Coefficient')
Example no. 7
def recommend_co_merchants_hdb(df, lat, long, city, merchant, cluster_object):
    # Predict the cluster for longitude and latitude provided
    test_labels, strengths = hdbscan.approximate_predict(
        cluster_object, [[lat, long]])
    predicted_cluster = test_labels[0]
    print('Predicted cluster for this lat/long combination is: ' +
          str(predicted_cluster))
    print(
        "_______________________________________________________________________________"
    )

    if predicted_cluster == -1:
        return ('No merchants close by')
    # Get the best merchant in this cluster
    else:
        pop_merch_recomm_df = (
            df[df['cluster'] == predicted_cluster].iloc[0:5][[
                'merchant', 'city', 'latitude', 'longitude'
            ]])
        pop_merch_recomm_df = pop_merch_recomm_df.reset_index(drop=True)
        mask = (pop_merch_recomm_df.merchant==merchant) & (pop_merch_recomm_df.latitude==lat) & \
               (pop_merch_recomm_df.longitude==long)
        print ('Since you are currently in '+ city.capitalize() + ' ' + 'at ' + \
               merchant.capitalize() + ', how about you visit these merchants around this area? ')
    return pop_merch_recomm_df[~mask]
Example no. 8
def visualize(data, text, clusterer, cluster_by, whitespace_only=False):
    ''' Visualize appearances of each cluster in the sample text.
    'data' should be output like that from get_comp_data'''

    # Get data for text to use for clustering:
    tokens = [unidecode.unidecode(char) for char in text]
    tokens = [ct.tweak_whitespace(w) for w in tokens[0:-1]]

    print("Starting labelling")
    labels, _ = hdbscan.approximate_predict(clusterer, data[cluster_by])
    clusters = sorted(list(set(labels)))
    print("Done labelling")

    content = dt.div(dt.h1("Cluster Visualization"))
    for i, c in enumerate(clusters):
        print("DEBUG: - Visualizing cluster %d" % i)
        details = dt.details(dt.summary("Cluster: %d" % c))
        colors = [1.0 if c == labels[j] else 0.0 for j in range(len(tokens))]
        if whitespace_only:
            for j in range(len(tokens)):
                colors[j] = colors[j] if text[j] in [" ", "\n"] else 0.0
        token_data = zip(tokens, colors)
        details.add(ct.colored_text(token_data))
        content.add(details)
    return content
Example no. 9
 def get_clustering(self, attributes_2D_mapping):
     """Returns HDBSCAN cluster labels
     """
     assert attributes_2D_mapping.shape[1] == 2
     new_labels, _ = hdbscan.approximate_predict(
         self.clusterer, attributes_2D_mapping
     )
     return new_labels
Example no. 10
    def predict(self, embeddings: np.ndarray):
        if not self.is_fitted:
            return Clusterer._empty_assignment(len(embeddings))

        embeddings_umap = self.umap.transform(embeddings)
        labels, probabilities = hdbscan.approximate_predict(
            self.hdbscan, embeddings_umap)

        return ClusterAssignment(labels=labels, probabilities=probabilities)
Example no. 11
def assign_samples(chunk, X, y, model, scale, chunk_size, values=False):
    """Runs a models assignment on a chunk of input

    Args:
        chunk (int)
            Index of chunk to process
        X (NumpyShared)
            n x 2 array of core and accessory distances for n samples
        y (NumpyShared)
            An n-vector to store results, with the most likely cluster memberships
            or an n by k matrix with the component responsibilities for each sample.
        model (BGMMFit or DBSCANFit)
            Fitted model (:class:`~PopPUNK.models.BGMMFit` or
            :class:`~PopPUNK.models.DBSCANFit`) used to assign the samples
        scale (numpy.array)
            Scaling of core and accessory distances from the fitted model
        chunk_size (int)
            Size of each chunk in X
        values (bool)
            Whether to return the responsibilities, rather than the most
            likely assignment (used for entropy calculation).

            Default is False
    """
    # Make sure this is run single threaded
    with set_env(MKL_NUM_THREADS='1',
                 NUMEXPR_NUM_THREADS='1',
                 OMP_NUM_THREADS='1'):
        if isinstance(X, NumpyShared):
            X_shm = shared_memory.SharedMemory(name=X.name)
            X = np.ndarray(X.shape, dtype=X.dtype, buffer=X_shm.buf)
        if isinstance(y, NumpyShared):
            y_shm = shared_memory.SharedMemory(name=y.name)
            y = np.ndarray(y.shape, dtype=y.dtype, buffer=y_shm.buf)

        start = chunk * chunk_size
        end = min((chunk + 1) * chunk_size, X.shape[0])
        if start >= end:
            raise RuntimeError("start >= end in BGMM assign")

        if isinstance(model, BGMMFit):
            logprob, lpr = log_likelihood(X[start:end, :], model.weights,
                                          model.means, model.covariances,
                                          scale)
            responsibilities = np.exp(lpr - logprob[:, np.newaxis])
            # Default to return the most likely cluster
            if not values:
                y[start:end] = responsibilities.argmax(axis=1)
            # Can return the actual responsibilities
            else:
                y[start:end, :] = responsibilities
        elif isinstance(model, DBSCANFit):
            y[start:end] = hdbscan.approximate_predict(
                model.hdb, X[start:end, :] / scale)[0]
Example no. 12
 def get_result(self):
     points = [[
         self.age, self.incomeneed, self.riskpropension,
         self.protectionneed, self.inheritanceindex
     ]]
     print("My Points: ", points)
     labels, strengths = hdbscan.approximate_predict(hdb_cluster, points)
     print("Predictions: ", labels[0])
     global RS
     RS = labels[0]
Example no. 13
def hdbscan_segmentation(embedding,
                         n_img_dims=None,
                         coord_scales=None,
                         metric='euclidean',
                         min_cluster_size=50,
                         slice_for_fit=None,
                         **hdbscan_kwargs):
    assert hdbscan is not None, 'need hdbscan for hdbscan_segmentation'
    assert metric in hdbscan.dist_metrics.METRIC_MAPPING
    if n_img_dims is None:
        # default: assume one embedding image is being passed
        n_img_dims = len(embedding.shape) - 1
    emb_shape = embedding.shape
    img_shape = emb_shape[-n_img_dims:]

    # append image coordinates as features if requested
    if coord_scales is not None:
        if not isinstance(coord_scales, collections.abc.Iterable):
            coord_scales = n_img_dims * (coord_scales, )
        assert len(coord_scales) == n_img_dims, f'{coord_scales}, {n_img_dims}'
        embedding = _append_coords(embedding, coord_scales)

    # compute #pixels per image
    n_pixels = 1
    for s in img_shape:
        n_pixels *= s

    # reshape embedding for clustering
    embedding = embedding.contiguous().view(-1,
                                            embedding.shape[-n_img_dims - 1],
                                            n_pixels).permute(0, 2, 1)

    # init HDBSCAN clusterer
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                metric=metric,
                                **hdbscan_kwargs)

    # iterate over images in batch
    result = []
    for emb in embedding:
        if slice_for_fit is not None:
            clusterer.fit(
                emb.view(*img_shape, -1)[slice_for_fit].contiguous().view(
                    -1, emb.shape[-1]))
            clusterer.generate_prediction_data()
            labels = hdbscan.approximate_predict(clusterer,
                                                 emb)[0].reshape(img_shape)
        else:
            labels = clusterer.fit_predict(emb).reshape(img_shape)
        result.append(labels)

    result = np.stack(result, axis=0).reshape(emb_shape[:-n_img_dims - 1] +
                                              emb_shape[-n_img_dims:])
    return torch.from_numpy(result)
Example no. 14
def HDBScan_clustering(data, test, columns, minimum_cluster_size=3000):
    """
    Clustering function using HDBSCAN. The function adds a new column with the cluster identification to both the train and test dataframes.

    data - train dataframe on which the clustering is fitted; a new column with the cluster identification is added
    test - test dataframe to which a new column with the cluster identification is added, based on the training set clustering
    columns - columns used for clustering
    minimum_cluster_size - minimum cluster size passed to HDBSCAN
    """

    clusterer = hdbscan.HDBSCAN(min_cluster_size=minimum_cluster_size, prediction_data=True).fit(data[columns])
    
    train_labels, strengths = hdbscan.approximate_predict(clusterer, data[columns])
    test_labels, strengths = hdbscan.approximate_predict(clusterer, test[columns])
    
    print('Number of clusters in training set using HDBScan: {}'.format(len(np.unique(train_labels))))
    print('Number of clusters in test set using HDBScan: {}'.format(len(np.unique(test_labels))))

    data['HDBScan'] = train_labels
    test['HDBScan'] = test_labels
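A hypothetical call to HDBScan_clustering above on synthetic train and test DataFrames; the feature names 'f1' and 'f2' and the cluster-size settings are illustrative only.

# Sketch: two well-separated blobs, so HDBSCAN should report two clusters plus noise.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
train = pd.DataFrame({
    'f1': np.r_[rng.normal(0, 1, 5000), rng.normal(10, 1, 5000)],
    'f2': np.r_[rng.normal(0, 1, 5000), rng.normal(10, 1, 5000)],
})
test = train.sample(frac=0.2, random_state=0).reset_index(drop=True)

HDBScan_clustering(train, test, columns=['f1', 'f2'], minimum_cluster_size=500)
print(train['HDBScan'].value_counts())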
Example no. 15
    def predict(self, text):
        """ Predict the cluster for the input text

        :return tuple: label and strength
        """
        labels, strengths = hdbscan.approximate_predict(
            self.model, [
                self.encoder.encode(
                    configuration.DEFAULT_TOKENIZER.transform(text))
            ])
        return int(labels[0]), float(strengths[0])
Example no. 16
    def _cluster_test(hdbscan, cluster_centers, df):
        coords = df[['latitude', 'longitude']] * np.pi / 180

        df = df.assign(cluster=approximate_predict(hdbscan, coords)[0])
        df = df.merge(cluster_centers,
                      left_on='cluster',
                      right_index=True,
                      how='left',
                      suffixes=('', '_cluster'))

        return df
Example no. 17
 def learn(self, personas, keywordsId, answers, keywordsCond):
     i = 0
     for persona in personas:
         embed = concatEmbeddingEn(
             getContextualEmbedding(persona, verbose=True))
         df2 = pd.DataFrame(embed[0])
         df2 = tools.Compressor.compressVectorDfdim1Todim2(
             df2, self.compressor)
         df2 = df2.rename(columns=str)
         df2['word'] = [s.replace("</w>", "") for s in embed[1]]
         sentences = []
         doc = ' '.join(embed[1])
         h = 0
         windows_size = 20
         print(doc, flush=True)
         print(embed[1], flush=True)
         for word in embed[1]:
             sentences.append(' '.join(
                 embed[1][max(0, h -
                              windows_size):min(len(embed[1]), h +
                                                windows_size)]))
             h += 1
         print("SENTENCES", flush=True)
         print(sentences, flush=True)
         df2['sentence'] = sentences
         df2 = df2[~df2.word.isin(stopwords.words('english'))]
         data_formatted = []
         for col in df2.columns:
             if col != "word" and col != "sentence":
                 data_formatted.append(df2[col].tolist())
         data = np.array(data_formatted[0:32]).T
         print("TO LEARN", flush=True)
         print(data, flush=True)
         print(data.shape, flush=True)
         #self.hdbscan_model.fit(data)
         labels, _ = hdbscan.approximate_predict(self.hdbscan_model, data)
         df2['clusterid'] = labels
         df2['keywordsId'] = [
             keywordsId[i].split('|') for l in range(len(df2))
         ] if len(keywordsId[i].split('|')) > 0 else []
         df2['keywordsCond'] = [
             keywordsCond[i].split('|') for l in range(len(df2))
         ] if len(keywordsCond[i].split('|')) > 0 else []
         df2['answer'] = [answers[i] for l in range(len(df2))]
         print(df2.head(), flush=True)
         print(df2.columns, flush=True)
         print(self.dfWiki.columns, flush=True)
         print(self.dfWiki, flush=True)
         self.dfWiki = pd.concat([self.dfWiki, df2]).reset_index(drop=True)
         print(self.dfWiki)
         print("TAIL", flush=True)
         print(self.dfWiki.tail(10), flush=True)
         i += 1
Example no. 18
def hdb_cluster(data, min_s_ratio, min_clust_ratio, f_save=None,
                num_cores=1, min_s_num=None, min_clust_num=None,
                clust_select='eom', max_samps=100000):
    n_obs = data.shape[0]
    print(n_obs)
    min_samples = int(n_obs/min_s_ratio)
    min_cluster_size = int(n_obs/min_clust_ratio)

    if min_s_num is not None:
        min_samples = min_s_num
    if min_clust_num is not None:
        min_cluster_size = min_clust_num

    print('cluster method:', clust_select)

    # if n_subsamp != None:
    #     data.sample
    # Assume needs to have

    if data.shape[0] > max_samps:
        print("Subsampling")
        full_data = data.copy()
        data = data[np.random.choice(data.shape[0], max_samps, replace=False), :]
        min_samples = int(data.shape[0] / min_s_ratio)
        min_cluster_size = int(data.shape[0] / min_clust_ratio)
        clusterer = hdbscan.HDBSCAN(min_samples=min_samples,
                                 min_cluster_size=min_cluster_size,
                                 core_dist_n_jobs=num_cores,
                                 cluster_selection_method=clust_select,
                                 prediction_data=True
                                 ).fit(data)

        labels, _ = hdbscan.approximate_predict(clusterer, full_data)
        print(labels)
        if f_save is not None:
            f_save = f_save.replace(".p", "") + ".p"
            pickle.dump(labels, open(f_save, "wb"))
        return labels

    print("Number of cores", num_cores)
    if num_cores is not None and num_cores != 1:
        labels = hdbscan.HDBSCAN(min_samples=min_samples,
                                 min_cluster_size=min_cluster_size,
                                 core_dist_n_jobs=num_cores,cluster_selection_method=clust_select
                                 ).fit_predict(data)
    else:
        labels = hdbscan.HDBSCAN(min_samples=min_samples,
                                 min_cluster_size=min_cluster_size,cluster_selection_method=clust_select
                                 ).fit_predict(data)
    if f_save is not None:
        f_save = f_save.replace(".p", "") + ".p"
        pickle.dump(labels, open(f_save, "wb"))
    return labels
Example no. 19
    def predict(self, attributes0, thresh=0.75):

        attributes = np.array(attributes0)

        pick_clusters = self.comparison_summary[
            self.comparison_summary['dif'] >= thresh].index.tolist()

        res, cluster_strengths = hdbscan.approximate_predict(
            self.clustering_model, attributes)

        return (pd.Series(res).isin(pick_clusters).astype(int).values,
                cluster_strengths)
Example no. 20
    def __predict(self):

        #====================================================
        #==   CHECK DATA & MODEL
        #====================================================
        # - Check if data are set
        if self.data is None:
            logger.error("Input data array is None!")
            return -1

        # - Check if clustering model is set
        if self.clusterer is None:
            logger.error("Clusterer is not set!")
            return -1

        # - Retrieve prediction data from current model
        logger.info(
            "Retrieving prediction data from current model (if any) ...")
        self.prediction_data = self.clusterer.prediction_data_

        #====================================================
        #==   CLUSTER DATA USING SAVED MODEL
        #====================================================
        logger.info("Encode input data using loaded model ...")
        self.labels, self.probs = hdbscan.approximate_predict(
            self.clusterer, self.data)

        #================================
        #==   SAVE CLUSTERED DATA
        #================================
        logger.info("Saving unsupervised encoded data to file ...")
        N = self.data.shape[0]
        print("Cluster data N=", N)

        snames = np.array(self.source_names).reshape(N, 1)
        objids = np.array(self.data_classids).reshape(N, 1)
        clustered_data = np.concatenate(
            (snames, objids, self.labels.reshape(N, 1),
             self.probs.reshape(N, 1)), axis=1)

        head = "# sname id clustid clustprob"
        Utils.write_ascii(clustered_data, self.outfile, head)

        #================================
        #==   PLOT
        #================================
        logger.info("Plotting results ...")
        self.__plot_predict(self.clusterer, self.data, self.labels,
                            self.source_names, self.data_labels,
                            self.prediction_data, self.prediction_extra_data,
                            self.outfile_plot)

        return 0
Example no. 21
    def transform(self, documents: Union[str, List[str]]) -> List[int]:
        """ After having fit a model, use transform to predict new instances """
        if isinstance(documents, str):
            documents = [documents]

        embeddings = self._extract_embeddings(documents)
        umap_embeddings = self.umap_model.transform(embeddings)
        predictions, strengths = hdbscan.approximate_predict(self.cluster_model, umap_embeddings)

        if self.mapped_topics:
            predictions = self._map_predictions(predictions)

        return predictions
Example no. 22
def predict_news_cluster_by_date(nrows, date):
    last_script_execution = db.get_last_script_execution(script_name)
    last_processed_date = (
        last_script_execution["last_processed_date"].values[0]
        if last_script_execution is not None
        else None
    )

    # Load only news articles since the last batch
    news_articles = db.get_news_articles_from_startdate_to_enddate(str(last_processed_date), str(date), nrows)
    data_matrix = vectorizer.transform(news_articles["text_lemmatized_without_stopwords"])
    labels, probabilities = approximate_predict(model, data_matrix.toarray())

    # Clusters are identified by a sorted string of news ids
    return labels
Example no. 23
    def hdbscan(self, inclusion_threshold: float or None = None):
        """
        Perform gating with HDBSCAN algorithm
        (https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)

        HDBSCAN clustering is performed on either the whole dataset or a sample of the dataset if specified.
        If clustering is performed on a sample, a call to 'approximate_predict' is made for remaining data.
        (https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan.prediction.approximate_predict)

        Parameters
        -----------
        inclusion_threshold: float, optional
            float value for minimum probability threshold for data inclusion; data below this
            threshold will be classed as noise

        Returns
        --------
        ChildPopulationCollection
            Updated child populations with events indexing complete
        """
        sample = None
        # If parent is empty just return the child populations with empty index array
        if self.empty_parent:
            return self.child_populations
        if self.frac is not None:
            sample = self.sampling(self.data, 40000)
        # Cluster!
        model = hdbscan.HDBSCAN(core_dist_n_jobs=-1,
                                min_cluster_size=self.min_pop_size,
                                prediction_data=True)
        if sample is not None:
            model.fit(sample[[self.x, self.y]])
            self.data['labels'], self.data[
                'label_strength'] = hdbscan.approximate_predict(
                    model, self.data[[self.x, self.y]])
        else:
            model.fit(self.data[[self.x, self.y]])
            self.data['labels'] = model.labels_
            self.data['label_strength'] = model.probabilities_
        # Post clustering checks
        if inclusion_threshold is not None:
            mask = self.data['label_strength'] < inclusion_threshold
            self.data.loc[mask, 'labels'] = -1
        # Predict clusters for child populations
        polygon_shapes = self.generate_polygons()
        population_predictions = self._predict_pop_clusters(polygon_shapes)
        return self._assign_clusters(population_predictions, polygon_shapes)
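A standalone sketch of the sample-then-predict pattern described in the docstring above: fit HDBSCAN on a subsample, label every event with approximate_predict, then mark low-strength assignments as noise. The data, column names, and threshold here are illustrative assumptions, not the class's actual defaults.

# Sketch only: synthetic 2-D events in a pandas DataFrame.
import hdbscan
import pandas as pd
from sklearn.datasets import make_blobs

xy, _ = make_blobs(n_samples=20_000, centers=3, cluster_std=0.5, random_state=0)
data = pd.DataFrame(xy, columns=['x', 'y'])

sample = data.sample(n=8_000, random_state=0)
model = hdbscan.HDBSCAN(core_dist_n_jobs=-1, min_cluster_size=200,
                        prediction_data=True).fit(sample[['x', 'y']])

# Assign all events, including those outside the sample.
data['labels'], data['label_strength'] = hdbscan.approximate_predict(
    model, data[['x', 'y']])

# Apply an inclusion threshold: weak assignments become noise (-1).
data.loc[data['label_strength'] < 0.75, 'labels'] = -1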
Example no. 24
def assign_samples_dbscan(X, hdb, scale):
    """Use a fitted dbscan model to assign new samples to a cluster

    Args:
        X (numpy.array)
            N x 2 array of core and accessory distances
        hdb (hdbscan.HDBSCAN)
            Fitted DBSCAN from hdbscan package
        scale (numpy.array)
            Scale factor of model object

    Returns:
        y (numpy.array)
            Cluster assignments by sample
    """
    y = hdbscan.approximate_predict(hdb, X/scale)[0]
    return y
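A hypothetical usage of assign_samples_dbscan above. It assumes hdb was fitted with prediction_data=True on distances already divided by scale, as the PopPUNK model does; the synthetic blobs and the scaling choice below are illustrative stand-ins for real core/accessory distances.

# Sketch only: synthetic 2-D "distances" in place of core/accessory data.
import numpy as np
import hdbscan
from sklearn.datasets import make_blobs

X_train, _ = make_blobs(n_samples=2000, centers=3, n_features=2, random_state=0)
scale = np.ptp(X_train, axis=0)          # illustrative per-axis scaling
hdb = hdbscan.HDBSCAN(min_cluster_size=50,
                      prediction_data=True).fit(X_train / scale)

X_new = X_train[:10]
print(assign_samples_dbscan(X_new, hdb, scale))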
Example no. 25
 def classify_test(self):
     with open('../data/classical-artists.ids') as f:
         flat = f.read()
         f.close()
     aggrs = []
     names = []
     for _id in flat.split("\n"):
         doc = self.get_doc(_id)
          if doc is not None:
             if len(doc["recordings"].keys()) < doc["track_count"]:
                 doc["track_count"] = len(doc["recordings"].keys())
             aggrs.append(
                 self.aggregate_features(doc["recordings"])["median"])
             names.append(doc["name"])
     test_labels, strengths = hd.approximate_predict(self.clusterer, aggrs)
     for i, name in enumerate(names):
         print(name, test_labels[i], strengths[i])
Example no. 26
def test_approx_predict_default():
    """
    Verify that approximate_predict_flat produces same results as default
    """
    # Given the base HDBSCAN trained on some data,
    clusterer = HDBSCAN(cluster_selection_method='eom',
                        cluster_selection_epsilon=0,
                        prediction_data=True).fit(X)

    # When using approximate_predict_flat without specifying n_clusters,
    labels_flat, proba_flat = approximate_predict_flat(
                                    clusterer, X_test, n_clusters=None)

    # Then, the clustering should match that due to approximate_predict,
    labels_base, proba_base = approximate_predict(clusterer, X_test)
    assert_array_equal(labels_flat, labels_base)
    assert_array_equal(proba_flat, proba_base)
    return
Example no. 27
def predict_new_points(test_dataset, clusterer, mrs):
    test_bi_pca_all = data_utils.get_test_transformed(test_dataset, mrs)

    labels = test_bi_pca_all[['label']]
    test_bi_pca = test_bi_pca_all.drop(['label'], axis=1)

    # see what happened
    ot = test_bi_pca_all[test_bi_pca_all['label'] == -1]
    plt.scatter(ot[['pca_1']], ot[['pca_2']], s=50, linewidth=0, c='yellow', alpha=1, label='Test outliers')
    noot = test_bi_pca_all[test_bi_pca_all['label'] != -1]
    plt.scatter(noot[['pca_1']], noot[['pca_2']], s=50, linewidth=0, c='blue', alpha=1, label='Test data points')
    legend = plt.legend(loc='upper left')
    legend.legendHandles[2]._sizes = [30]
    legend.legendHandles[3]._sizes = [40]
    #
    test_labels, strengths = hdbscan.approximate_predict(clusterer, test_bi_pca)
    test_labels[test_labels > -1] = 0
    sensitivity, specificity, accuracy = data_utils.show_performance(labels, test_labels)
    return sensitivity, specificity, accuracy
Example no. 28
def test_hdbscan(data):
    cluster_loaded = load_model()
    labels, strengths = hdbscan.approximate_predict(cluster_loaded, data)
    print(labels, strengths)

    # db = DBSCAN(eps=0.25, min_samples=30)
    # db.fit(data)
    # labels = db.labels_
    #get db count
    cluster_groups = {}
    for i in labels:
        if cluster_groups.get(i):
            cluster_groups[i] = cluster_groups[i] + 1
        else:
            cluster_groups[i] = 1
    print("cluster_groups", cluster_groups)

    # Number of clusters in labels, ignoring noise if present
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return labels, cluster_groups
Example no. 29
def predict(input_string):
    corpus = pickle.load((open('corpus.sav', 'rb')))
    lda_model = gensim.models.LdaMulticore(corpus=corpus)
    lda_model.load("lda.model")
    reducer = pickle.load((open('reducer.sav', 'rb')))
    cluster = pickle.load((open('cluster.sav', 'rb')))

    df2 = pd.DataFrame([input_string], columns=['message'])
    df2['message'] = df2['message'].apply(lambda x: list(x.split('         ')))
    df2['token'] = df2['message'].apply(sent_to_words)
    df2['token'] = df2['message'].apply(remove_stopwords)
    df2['token'] = df2['token'].apply(lemmatization)
    texts2 = df2.token.values
    bigram2 = make_bigrams(texts2)
    id2word2 = gensim.corpora.Dictionary(bigram2)
    id2word2.compactify()
    corpus2 = [id2word2.doc2bow(text) for text in bigram2]
    #Assign topics based on optimum number of topics
    df2.loc[0, 'topic'] = sorted(lda_model[corpus2[0]],
                                 reverse=True,
                                 key=lambda x: x[1])[0][0]
    df2.loc[0, 'topic_probability'] = sorted(lda_model[corpus2[0]],
                                             reverse=True,
                                             key=lambda x: x[1])[0][1]

    #Get the word embeddings
    sentence_embeddings = get_word_embedding(df2.loc[0, 'token'])

    #Split the tensor into 768 columns for clustering
    df_new = pd.DataFrame(columns=[i for i in range(768)], index=[0])
    for j in range(768):
        df_new.iloc[0, j] = sentence_embeddings[j]

    #Concat the tensors with the original dataframe
    df2 = pd.concat([df2, df_new], axis=1)
    #Filter out LDA and word embedding
    embeddings = df2.iloc[:, 4:]
    chat_embeddings = embeddings.iloc[0, :][None, :]
    test_data = reducer.transform(chat_embeddings)
    test_labels, test_prob = hdbscan.approximate_predict(cluster, test_data)
    return test_labels, test_prob
Example no. 30
def hdbscan_predict(embedding, df_scaled, clusterer, force_predict=True):
    if force_predict:
        mem_vec = pd.DataFrame(hdbscan.membership_vector(clusterer, embedding.values))
        test_labels = mem_vec.idxmax(axis=1).to_numpy()
        strengths = mem_vec.max(axis=1).to_numpy()
    else:
        test_labels, strengths = hdbscan.approximate_predict(clusterer, embedding)

    # Get probabilities
    scores = pd.DataFrame(strengths)
    scores.columns = ['score']

    # Get clusters
    labels = pd.DataFrame(test_labels)
    labels.columns = ['cluster']

    # Join
    scores = scores.join(labels).join(embedding).join(df_scaled)
    n_clusters = sum(scores['cluster'].unique()!=-1)
    scores['cluster'].value_counts()
    
    return scores
    def __init__(self, qf, df, labels, verbose=True):
        self.cluster = hdbscan.HDBSCAN(min_cluster_size=10,
                                       prediction_data=True).fit(qf)
        clusids, strengths = hdbscan.approximate_predict(self.cluster, qf)
        uniques = np.sort(np.unique(clusids))
        n_labels = len(uniques)
        if verbose:
            label_strengths = [
                np.median(strengths[clusids == l]) for l in uniques
            ]
            label_counts = [np.sum(clusids == l) for l in uniques]
            print("# clusters found:", n_labels)
            print(
                f"cluster sizes:   min:{np.min(label_counts)}   mean:{np.mean(label_counts)}   max:{np.max(label_counts)}"
            )
            print(
                f"median label strengths:   min:{np.min(label_strengths)}   mean:{np.mean(label_strengths)}   max:{np.max(label_strengths)}"
            )

        self.svms = [sklearn.svm.SVR() for _ in range(n_labels)]
        for l, svm in zip(uniques, self.svms):
            indices = clusids == l
            print(l, indices.sum())
            svm.fit(df[indices], labels[indices])