Esempio n. 1
0
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds clusters of users in data using Gaussian Mixture Models.

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param max_n_clusters: maximal number of clusters for automatic selection for number of clusters.
        if None, the number of mixture components is taken from ``n_components``
        (or its alias ``n_clusters``) in ``kwargs``
    :param use_csi: if True, then cluster stability index will be calculated (may take a lot of time)
    :param random_state: random state for GaussianMixture clusterer
    :param kwargs: keyword arguments for sklearn.mixture.GaussianMixture; names that
        GaussianMixture does not accept are ignored when building the clusterer.
        The unfiltered kwargs are also forwarded to ``cluster_stability_index``
    :return: tuple ``(clusters, metrics)`` where ``clusters`` is the np.array of cluster
        labels (one per row of ``data``) and ``metrics`` is the result of
        ``calc_all_metrics``, extended with a ``'csi'`` entry when ``use_csi`` is True
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters,
                                      random_state, **kwargs)
    else:
        # Ask a default-constructed estimator for its parameter names.
        # Calling ``get_params`` on the class itself is not supported API and
        # fails on scikit-learn releases that read the attributes without a
        # fallback value.
        valid_params = GaussianMixture().get_params()
        kmargs = {
            name: value
            for name, value in kwargs.items()
            if name in valid_params
        }
        # GaussianMixture calls the number of clusters ``n_components``;
        # honour the documented ``n_clusters`` spelling instead of silently
        # dropping it. An explicit ``n_components`` takes precedence.
        if 'n_clusters' in kwargs and 'n_components' not in kmargs:
            kmargs['n_components'] = kwargs['n_clusters']
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    # GaussianMixture does not expose ``labels_`` itself; it is attached here
    # before the fitted model is handed to the metric helpers below.
    km.labels_ = cl
    # One-hot cluster membership, aligned with the index of ``data``.
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
Esempio n. 2
0
def em(X, **kwargs):
    """
    Fit a Gaussian mixture (EM algorithm) on ``X`` and return the fitted model.

    GaussianMixture does not create a ``labels_`` attribute on its own, so
    the labels returned by ``fit_predict`` are attached to the model before
    it is returned. All keyword arguments are passed to ``GaussianMixture``.
    """
    mixture = GaussianMixture(**kwargs)
    mixture.labels_ = mixture.fit_predict(X)
    return mixture
Esempio n. 3
0
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds clusters of users in data using Gaussian Mixture Models.

    Parameters
    ----------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection for number of clusters. If ``None``, the number of mixture components is taken from ``n_components`` (or its alias ``n_clusters``) in ``kwargs``. Default: ``None``
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated. IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for GaussianMixture clusterer. Default: ``0``
    kwargs: optional
        Parameters for ``sklearn.mixture.GaussianMixture``. Names that GaussianMixture does not accept are ignored when building the clusterer. The unfiltered kwargs are also forwarded to ``cluster_stability_index``.

    Returns
    -------
    Tuple ``(clusters, metrics)``: the array of cluster labels (one per row of ``data``) and the result of ``calc_all_metrics``, extended with a ``'csi'`` entry when ``use_csi`` is ``True``

    Return type
    -----------
    tuple
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters,
                                      random_state, **kwargs)
    else:
        # Ask a default-constructed estimator for its parameter names.
        # Calling ``get_params`` on the class itself is not supported API and
        # fails on scikit-learn releases that read the attributes without a
        # fallback value.
        valid_params = GaussianMixture().get_params()
        kmargs = {
            name: value
            for name, value in kwargs.items()
            if name in valid_params
        }
        # GaussianMixture calls the number of clusters ``n_components``;
        # honour the documented ``n_clusters`` spelling instead of silently
        # dropping it. An explicit ``n_components`` takes precedence.
        if 'n_clusters' in kwargs and 'n_components' not in kmargs:
            kmargs['n_components'] = kwargs['n_clusters']
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    # GaussianMixture does not expose ``labels_`` itself; it is attached here
    # before the fitted model is handed to the metric helpers below.
    km.labels_ = cl
    # One-hot cluster membership, aligned with the index of ``data``.
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
# Report the dataset dimensions. n_digits, n_samples, n_features, data and
# labels are not defined in this excerpt; they must exist before this runs.
print("Part 1: Clustering")
print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))


# Separator line and tab-separated column header for the benchmark table.
print(79 * '_')
print('% 9s' % 'init\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette\tAccuracy')

# K-means with two clusters, fitted on the full data set.
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
# NOTE(review): the three expressions below (fraction of samples whose
# cluster id equals the label, homogeneity, completeness) are evaluated but
# their values are neither stored nor printed. They only show up when run
# interactively (e.g. as notebook cells) -- confirm that is intended.
float(sum(kmeans.labels_ == labels))/float(len(labels))
metrics.homogeneity_score(labels,kmeans.labels_)
metrics.completeness_score(labels, kmeans.labels_)

# Gaussian mixture fitted by EM with 20 components (the commented-out
# variant below used 2). labels_ is attached manually from predict().
EMax = GaussianMixture(n_components=20,random_state=0).fit(data)
# EMax = GMM(n_components=2,random_state=0).fit(data)
EMax.labels_ = EMax.predict(data)
# NOTE(review): same as above -- these results are discarded. With 20
# components the equality-based fraction compares component ids directly
# against the labels; verify this is the intended measure.
float(sum(EMax.labels_ == labels))/float(len(labels))
metrics.homogeneity_score(labels,EMax.labels_)
metrics.completeness_score(labels, EMax.labels_)

def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0), 
             metrics.homogeneity_score(labels, estimator.predict(data)),
             metrics.completeness_score(labels, estimator.predict(data)),
             metrics.v_measure_score(labels, estimator.predict(data)),
             metrics.adjusted_rand_score(labels, estimator.predict(data)),
             metrics.adjusted_mutual_info_score(labels,  estimator.predict(data)),
             metrics.silhouette_score(data, estimator.predict(data),metric='euclidean',sample_size=sample_size),
Esempio n. 5
0
def gaussian_mixture(
    X,
    n_clusters=5,
    covariance_type="full",
    best_model=False,
    max_clusters=10,
    random_state=None,
    **kwargs,
):
    """Clustering with Gaussian Mixture Model.

    Parameters
    ----------
    X  : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form. Ignored when `best_model` is set.
    covariance_type: str, optional, default: "full"
        The covariance parameter passed to scikit-learn's GaussianMixture
        algorithm. Ignored when `best_model` is set, because every
        covariance type is then tried.
    best_model: bool, optional, default: False
        Option for finding endogenous K according to Bayesian Information
        Criterion: every covariance type is combined with 1..`max_clusters`
        components and the fit with the lowest BIC is returned.
    max_clusters: int, optional, default:10
        The max number of clusters to test if using `best_model` option
    random_state: int, optional, default: None
        The seed used to generate replicable results
    kwargs
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance: sklearn.mixture.GaussianMixture
        The fitted model, with the predicted labels for `X` attached as
        ``labels_``.

    Raises
    ------
    ValueError
        If `best_model` is set and no candidate model could be selected
        (for example when `max_clusters` is smaller than 1).

    """
    if random_state is None:
        warn(
            "Note: Gaussian Mixture Clustering is probabilistic--"
            "cluster labels may be different for different runs. If you need consistency, "
            "you should set the `random_state` parameter")

    if best_model:
        # selection routine from
        # https://plot.ly/scikit-learn/plot-gmm-selection/
        lowest_bic = np.inf
        model = None
        for cv_type in ("spherical", "tied", "diag", "full"):
            for n_components in range(1, max_clusters + 1):
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(
                    n_components=n_components,
                    random_state=random_state,
                    covariance_type=cv_type,
                )
                gmm.fit(X)
                candidate_bic = gmm.bic(X)
                if candidate_bic < lowest_bic:
                    lowest_bic = candidate_bic
                    model = gmm
        if model is None:
            raise ValueError(
                "no Gaussian mixture could be selected; `max_clusters` must "
                "be at least 1 when `best_model` is used")
        # The selected model is already fitted. It is deliberately not
        # refitted: without a fixed seed a second fit could differ from the
        # one whose BIC was used for the selection.
    else:
        model = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            covariance_type=covariance_type,
        )
        model.fit(X)

    # GaussianMixture has no labels_ attribute of its own; attach it.
    model.labels_ = model.predict(X)
    return model
Esempio n. 6
0
    def identify_subnetworks_sub(self, geneset_obj):
        """Cluster the genes of one gene set into subnetworks and write them out.

        The method
          1. numbers the genes of ``geneset_obj.gene_id_list`` from 1 and
             stores them in ``self.gene_dic`` (keyed by both gene id and
             index; any previous content of ``self.gene_dic`` is replaced);
          2. writes every gene pair found in ``self.ppi_set`` to
             ``<id>_edges.txt`` under ``self.subnetworks_dir``;
          3. runs the external ``deepwalk`` command on that edge list;
          4. reads the resulting embeddings and fits Gaussian mixtures with
             2 .. int(sqrt(n)) components (n = genes with an embedding),
             keeping the one with the lowest BIC;
          5. groups genes by mixture label and writes every group with more
             than two genes as one line to ``self.output_file``.

        When no mixture can be fitted (fewer than four genes with an
        embedding), no subnetwork is written.

        :param geneset_obj: object providing ``id`` and ``gene_id_list``
        """
        #[1] Output locations for this gene set
        edge_path = self.subnetworks_dir + ss + "%s_edges.txt" % geneset_obj.id
        embedding_path = self.subnetworks_dir + ss + "%s_embeddings.txt" % geneset_obj.id
        log_path = self.subnetworks_dir + ss + "%s_log.txt" % geneset_obj.id

        #[2] Index the genes; the edge file refers to genes by 1-based index
        # NOTE(review): ids and indices share one dict -- this assumes gene
        # ids never collide with the integers 1..n; confirm.
        self.gene_dic = {}
        for i, gene_id in enumerate(geneset_obj.gene_id_list):
            gene_obj = Gene()
            gene_obj.id = gene_id
            gene_obj.index = i + 1
            self.gene_dic[gene_id] = gene_obj
            self.gene_dic[gene_obj.index] = gene_obj
            ##End for

        #[3] Write the edge list; the file is closed even if writing fails
        # NOTE(review): only the (a, b) orientation is looked up -- presumably
        # self.ppi_set holds both orientations; verify.
        with open(edge_path, 'w') as edge_file:
            for a, b in itertools.combinations(geneset_obj.gene_id_list, 2):
                key = a, b
                if key not in self.ppi_set:
                    continue
                    ##End if
                c, d = [self.gene_dic[x] for x in [a, b]]
                edge_line = make_line([c.index, d.index], tt)
                edge_file.write(edge_line + nn)
                ##End for

        #[4] Run deepwalk on the edge list
        # NOTE(review): the command is assembled by string formatting and run
        # through the shell, and its exit status is ignored; paths containing
        # spaces or shell metacharacters would break it.
        cmd = "nohup deepwalk --input %s --output %s --representation-size 8 --seed 0 > %s" % (
            edge_path, embedding_path, log_path)
        os.system(cmd)

        #[5] Read the embeddings (first line is skipped -- presumably a header)
        gene_id_list = []
        gene_embedding_arr = []
        with open(embedding_path, 'r') as embedding_file:
            embedding_file.readline()
            for embedding_line in embedding_file:
                fields = embedding_line.split()
                if not fields:
                    # Tolerate blank lines instead of failing on fields[0].
                    continue
                    ##End if
                index = int(fields[0])
                gene_id = self.gene_dic[index].id
                embedding_vec = [float(x) for x in fields[1:]]
                gene_id_list.append(gene_id)
                gene_embedding_arr.append(embedding_vec)
                ##End for

        #[6] Fit one mixture per candidate cluster count; score_ is -BIC,
        #    so a higher score means a better model
        gene_embedding_arr = np.array(gene_embedding_arr)
        clusterer_list = []
        max_clusters = int(len(gene_id_list)**0.5) + 1
        for n_clusters in range(2, max_clusters):
            clusterer_obj = GaussianMixture(n_components=n_clusters,
                                            random_state=0)
            clusterer_obj.fit(gene_embedding_arr)
            clusterer_obj.score_ = -clusterer_obj.bic(gene_embedding_arr)
            clusterer_obj.n_clusters_ = n_clusters
            clusterer_obj.labels_ = clusterer_obj.predict(gene_embedding_arr)
            clusterer_list.append(clusterer_obj)
            ##End for

        #[7] Pick the best-scoring mixture and group genes by its labels.
        #    The candidate range is empty for fewer than four genes; in that
        #    case there are no labels and therefore no subnetworks.
        if clusterer_list:
            clusterer_obj = max(clusterer_list, key=lambda x: x.score_)
            labels = clusterer_obj.labels_
        else:
            labels = []
            ##End if
        subnet_dic = {}
        for gene_id, label in zip(gene_id_list, labels):
            if label not in subnet_dic:
                subnet_obj = Subnet()
                subnet_obj.gene_id_list = []
                subnet_dic[label] = subnet_obj
                ##End if
            subnet_obj = subnet_dic[label]
            subnet_obj.gene_id_list.append(gene_id)
            ##End for

        #[8] Largest subnetworks first; keep those with more than two genes
        #    and write "<geneset id>_<rank>" plus the sorted gene ids
        subnet_list = sorted(subnet_dic.values(),
                             key=lambda x: len(x.gene_id_list),
                             reverse=True)
        subnet_list = [x for x in subnet_list if len(x.gene_id_list) > 2]
        for i, subnet_obj in enumerate(subnet_list):
            subnet_obj.id = "%s_%s" % (geneset_obj.id, i + 1)
            gene_id_line = make_line(sorted(subnet_obj.gene_id_list), cc)
            subnet_line = make_line([subnet_obj.id, gene_id_line], tt)
            self.output_file.write(subnet_line + nn)