Python Birch.threshold Examples

Programming Language: Python

Namespace/Package Name: sklearn.cluster

Class/Type: Birch

Method/Function: threshold

Examples at hotexamples.com: 1

Python Birch.threshold - 1 example found. This is the top-rated real-world Python example of sklearn.cluster.Birch.threshold extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Birch(30)

fit(30)

fit_predict(30)

predict(30)

partial_fit(17)

fit_transform(5)

set_params(3)

transform(3)

__init__(1)

_get_leaves(1)

branching_factor(1)

get_feature_names_out(1)

threshold(1)

Example #1

Show file

File: birch.py Project: waltersom/bigslice

    def run(run_id: int,
            database: Database,
            complete_only: bool = True,
            threshold: float = -1,
            threshold_percentile: float = -1,
            random_seed: int = randint(1, 9999999)):
        """Run BIRCH clustering for one run and return a BirchClustering.

        A BGC x HMM feature matrix is built from the database, a Birch
        model is fitted on it, and the resulting subcluster centroids are
        stored (cast to uint8) in the properties handed to BirchClustering.

        Args:
            run_id: id of the run whose BGCs and HMMs are selected.
            database: object exposing ``select(tables, clause, parameters=,
                props=, as_tuples=)`` as used below.
            complete_only: when True, restrict the selection to rows where
                ``bgc.on_contig_edge is 0``.
            threshold: Birch threshold; used as-is when ``>= 0``.
            threshold_percentile: only used when ``threshold < 0``; the
                threshold is then the mean of this percentile of sampled
                pairwise euclidean distances.
            random_seed: seed used for the sampling done while estimating
                the threshold; also stored in the result properties.
                NOTE: the default is evaluated once, when this function is
                defined, so every call that omits it within one process
                shares the same seed.

        Returns:
            ``BirchClustering(properties)`` where properties holds
            "run_id", "random_seed", "method", "threshold" and "centroids".

        Raises:
            Exception: when no BGC matches the selection, or when both
                ``threshold`` and ``threshold_percentile`` are negative.
        """
        # NOTE: the annotations above used to be `np.float`, an alias of the
        # builtin `float` that recent NumPy releases no longer provide; the
        # builtin is used directly so the function can still be defined.

        def preprocess(features: np.ndarray):
            # todo: preprocess!
            # only converts to float for now (was `np.float`, same dtype)
            preprocessed_features = features.astype(float)
            return preprocessed_features

        def fetch_threshold(df: pd.DataFrame,
                            percentile: float,
                            num_iter: int = 100,
                            num_sample: int = 1000):
            """Mean of the given percentile of sampled pairwise distances."""
            seed(random_seed)  # to make things reproducible
            if df.shape[0] < num_sample:
                # fewer rows than the sample size: use all rows, one pass
                num_sample = df.shape[0]
                num_iter = 1
            # one percentile per iteration, each on a fresh row sample whose
            # random_state is drawn from the seeded generator
            percentiles = np.array([
                np.percentile(
                    pairwise_distances(df.sample(num_sample,
                                                 random_state=randint(
                                                     0, 999999)).values,
                                       metric='euclidean',
                                       n_jobs=-1), percentile)
                for _ in range(num_iter)
            ])
            return percentiles.mean()

        # set properties
        properties = {
            "run_id": run_id,
            "random_seed": random_seed,
            "method": "birch"
        }

        # prepare features_df
        if complete_only:
            selector = " AND bgc.on_contig_edge is 0"
        else:
            selector = ""
        bgc_ids = [
            row[0]
            for row in database.select("bgc,run_bgc_status",
                                       "WHERE run_bgc_status.run_id=?" +
                                       " AND run_bgc_status.bgc_id=bgc.id" +
                                       selector,
                                       parameters=(run_id, ),
                                       props=["bgc.id"],
                                       as_tuples=True)
        ]
        if not bgc_ids:  # check if no bgc_ids
            raise Exception("Not enough input for clustering.")
        hmm_ids = [
            row[0] for row in database.select("hmm,run",
                                              "WHERE hmm.db_id=run.hmm_db_id" +
                                              " AND run.id=?",
                                              parameters=(run_id, ),
                                              props=["hmm.id"],
                                              as_tuples=True)
        ]
        # rows are BGCs, columns are HMMs; cells start at zero
        features_df = pd.DataFrame(np.zeros((len(bgc_ids), len(hmm_ids)),
                                            dtype=np.uint8),
                                   index=bgc_ids,
                                   columns=hmm_ids)

        # fetch feature values from db
        # (the bgc/run_bgc_status join condition appears twice in the clause;
        # it is kept unchanged here)
        for bgc_id, hmm_id, value in database.select(
                "bgc,run_bgc_status,run,hmm,bgc_features",
                "WHERE run_bgc_status.bgc_id=bgc.id" + " AND run.id=?" +
                " AND run_bgc_status.bgc_id=bgc.id" +
                " AND run.id=run_bgc_status.run_id" +
                " AND run.hmm_db_id=hmm.db_id" +
                " AND bgc_features.bgc_id=bgc.id" +
                " AND bgc_features.hmm_id=hmm.id" + selector,
                parameters=(run_id, ),
                props=[
                    "bgc_features.bgc_id", "bgc_features.hmm_id",
                    "bgc_features.value"
                ],
                as_tuples=True):
            features_df.at[bgc_id, hmm_id] = value

        # initiate birch object
        birch = Birch(
            n_clusters=None,  # no global clustering
            compute_labels=False,  # only calc centroids
            copy=False  # data already copied
        )

        # set threshold
        if threshold >= 0:
            birch.threshold = threshold
        else:
            if threshold_percentile < 0:
                raise Exception("Threshold percentile can't be < 0.00")
            # set threshold based on sampling of features
            birch.threshold = fetch_threshold(features_df,
                                              threshold_percentile)
        properties["threshold"] = birch.threshold

        # set flat birch: branching factor equals the number of BGC rows
        birch.branching_factor = features_df.shape[0]

        # call birch
        birch.fit(preprocess(features_df.values))

        # save centroids (cast to uint8, one column per HMM id)
        properties["centroids"] = pd.DataFrame(
            np.uint8(birch.subcluster_centers_),
            columns=features_df.columns)

        return BirchClustering(properties)