Example #1
0
    def run(run_id: int,
            database: Database,
            complete_only: bool = True,
            threshold: float = -1,
            threshold_percentile: float = -1,
            random_seed: "int | None" = None):
        """Run BIRCH clustering on the BGC features of a run.

        Builds a (BGC x HMM) feature matrix from the database, fits a
        flat Birch model on it (no global clustering step, no labels) and
        wraps the resulting sub-cluster centroids in a BirchClustering.

        Args:
            run_id: id of the run whose BGCs and HMM database are used.
            database: object whose ``select`` method is queried for the
                BGC ids, HMM ids and feature values.
            complete_only: when True, only BGCs whose ``on_contig_edge``
                is 0 are selected.
            threshold: Birch threshold to use as-is when >= 0. A negative
                value means "derive it from ``threshold_percentile``".
            threshold_percentile: percentile (0-100) of sampled pairwise
                euclidean distances to use as the threshold; only read
                when ``threshold`` is negative.
            random_seed: seed used for the threshold sampling. When
                omitted, a fresh seed is drawn for this call; the seed
                actually used is stored in the result properties.

        Returns:
            A BirchClustering built from a dict with the keys ``run_id``,
            ``random_seed``, ``method``, ``threshold`` and ``centroids``.

        Raises:
            ValueError: if the run has no matching BGCs, or if
                ``threshold`` is negative and ``threshold_percentile``
                is outside [0, 100].
        """
        if random_seed is None:
            # Draw the seed per call: a randint() default in the signature
            # is evaluated only once, when the function is defined.
            random_seed = randint(1, 9999999)

        def preprocess(features: np.ndarray) -> np.ndarray:
            # todo: preprocess!
            # builtin float replaces the removed np.float alias (same dtype)
            return features.astype(float)

        def fetch_threshold(df: pd.DataFrame,
                            percentile: float,
                            num_iter: int = 100,
                            num_sample: int = 1000):
            """Average the given percentile of pairwise distances over
            several random row samples of df."""
            seed(random_seed)  # to make things reproducible
            if df.shape[0] < num_sample:
                # fewer rows than a sample: use every row, a single time
                num_sample = df.shape[0]
                num_iter = 1
            estimates = [
                np.percentile(
                    pairwise_distances(
                        df.sample(num_sample,
                                  random_state=randint(0, 999999)).values,
                        metric='euclidean',
                        n_jobs=-1),
                    percentile)
                for _ in range(num_iter)
            ]
            return np.array(estimates).mean()

        # set properties
        properties = {
            "run_id": run_id,
            "random_seed": random_seed,
            "method": "birch"
        }

        # prepare features_df
        selector = " AND bgc.on_contig_edge is 0" if complete_only else ""
        bgc_ids = [
            row[0]
            for row in database.select(
                "bgc,run_bgc_status",
                "WHERE run_bgc_status.run_id=?"
                " AND run_bgc_status.bgc_id=bgc.id" + selector,
                parameters=(run_id, ),
                props=["bgc.id"],
                as_tuples=True)
        ]
        if not bgc_ids:  # nothing to cluster
            raise ValueError("Not enough input for clustering.")
        hmm_ids = [
            row[0]
            for row in database.select(
                "hmm,run",
                "WHERE hmm.db_id=run.hmm_db_id"
                " AND run.id=?",
                parameters=(run_id, ),
                props=["hmm.id"],
                as_tuples=True)
        ]
        # start from an all-zero matrix; only values present in the
        # bgc_features table are filled in below
        features_df = pd.DataFrame(np.zeros((len(bgc_ids), len(hmm_ids)),
                                            dtype=np.uint8),
                                   index=bgc_ids,
                                   columns=hmm_ids)

        # fetch feature values from db
        # (query text kept as-is; the bgc join predicate appears twice,
        # which is redundant but harmless)
        feature_rows = database.select(
            "bgc,run_bgc_status,run,hmm,bgc_features",
            "WHERE run_bgc_status.bgc_id=bgc.id"
            " AND run.id=?"
            " AND run_bgc_status.bgc_id=bgc.id"
            " AND run.id=run_bgc_status.run_id"
            " AND run.hmm_db_id=hmm.db_id"
            " AND bgc_features.bgc_id=bgc.id"
            " AND bgc_features.hmm_id=hmm.id" + selector,
            parameters=(run_id, ),
            props=[
                "bgc_features.bgc_id", "bgc_features.hmm_id",
                "bgc_features.value"
            ],
            as_tuples=True)
        for bgc_id, hmm_id, value in feature_rows:
            features_df.at[bgc_id, hmm_id] = value

        # set threshold
        if threshold >= 0:
            birch_threshold = threshold
        else:
            if threshold_percentile < 0:
                raise ValueError("Threshold percentile can't be < 0.00")
            if threshold_percentile > 100:
                # np.percentile would reject this later; fail early instead
                raise ValueError("Threshold percentile can't be > 100.00")
            # set threshold based on sampling of features
            birch_threshold = fetch_threshold(features_df,
                                              threshold_percentile)
        properties["threshold"] = birch_threshold

        # initiate birch object
        birch = Birch(
            threshold=birch_threshold,
            # flat birch: one node can hold a sub-cluster per sample, so
            # the tree never splits. Kept at >= 2 because current
            # scikit-learn rejects a branching factor of 1 (single-BGC
            # runs); the tree stays flat either way.
            branching_factor=max(2, features_df.shape[0]),
            n_clusters=None,  # no global clustering
            compute_labels=False,  # only calc centroids
            copy=False  # data already copied
        )

        # call birch
        birch.fit(preprocess(features_df.values))

        # save centroids (cast back to uint8, one column per hmm id)
        properties["centroids"] = pd.DataFrame(
            np.uint8(birch.subcluster_centers_),
            columns=features_df.columns)

        return BirchClustering(properties)