def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    agglo_mean.fit(X)
    agglo_median.fit(X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean),
                              Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
Example #2
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    from sklearn.cluster import FeatureAgglomeration

    numdf = df[numeric_cols]

    # Pairwise distance between features: 1 / |correlation|
    r_in_x = numdf.corr().abs()
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        distance_in_x.iloc[i, i] = 10 ** 10  # large value on the diagonal (10 ^ 10 would be XOR)

    cpdist = distance_in_x.copy()
    cpdist = cpdist.fillna(cpdist.max().max())

    # Merge features whose correlation exceeds the threshold,
    # i.e. whose distance is below 1 / corrCoefThres
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrCoefThres)
    fa.fit(cpdist)

    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})

    # Keep one representative feature per cluster
    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
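A minimal usage sketch (assuming pandas and numpy, and a scikit-learn version that still accepts the affinity keyword; the toy DataFrame is purely illustrative):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
a = rng.randn(200)
toy = pd.DataFrame({
    "a": a,
    "b": a + 0.01 * rng.randn(200),  # near-duplicate of "a"
    "c": rng.randn(200),             # independent column
})
# One feature per cluster of highly correlated columns is kept,
# so either "a" or "b" is dropped while "c" survives.
print(FeatureSelection(toy, numeric_cols=["a", "b", "c"], corrCoefThres=0.9))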
Example #4
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    with pytest.warns(None) as record:
        agglo_mean.fit(X)
    assert not len(record)
    with pytest.warns(None) as record:
        agglo_median.fit(X)
    assert not len(record)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.0])
    assert Xt_median == np.array([0.0])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
        assert_equal(len(warning_list), 1)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def _feature_agglomeration_fit_method(data, n_parcels, connectivity, linkage):
    """Feature Agglomeration algorithm to fit on the data.

    Parameters
    ----------
    data : array_like, shape=(n_samples, n_voxels)
        Masked subjects data

    n_parcels : int
        Number of parcels to parcellate.

    connectivity : ndarray
        Connectivity matrix
        Defines for each feature the neighbouring features following a given
        structure of the data.

    linkage : str
        Which linkage criterion to use:
        'ward', 'complete' or 'average'.

    Returns
    -------
    labels : ndarray
        Labels to the data
    """
    ward = FeatureAgglomeration(n_clusters=n_parcels, connectivity=connectivity,
                                linkage=linkage)
    ward.fit(data)

    return ward.labels_
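A minimal call sketch for the helper above (random data on a 10x10 voxel grid; assumes numpy and grid_to_graph are available):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
data = rng.randn(20, 100)             # 20 samples, 100 "voxels"
connectivity = grid_to_graph(10, 10)  # voxels laid out on a 10x10 grid
labels = _feature_agglomeration_fit_method(data, n_parcels=5,
                                           connectivity=connectivity,
                                           linkage='ward')
assert labels.shape == (100,)         # one parcel label per voxel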
Example #8
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = []
    for feature_group_key in feature_groups_map:
        feature_groups_nos.append(feature_groups_map[feature_group_key])

    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        group_label = ""
        for feature_groups_key in feature_groups_map:
            if feature_groups_map[feature_groups_key] == feature_group_no:
                group_label = group_label + feature_groups_key + ", "
        group_labels.append(group_label[0:-2])

    voters_agglomerated = pd.DataFrame(condensed,
                                       columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(lambda x: round(x))
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
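A short usage sketch (assuming pandas, numpy and FeatureAgglomeration are imported; the voter columns are illustrative):

rng = np.random.RandomState(0)
voters = pd.DataFrame(rng.rand(50, 4),
                      columns=["econ", "tax", "health", "env"])
agglomerated = feature_agglomeration(voters, n=2, rounding=False)
print(agglomerated.columns.tolist())  # e.g. ['econ, tax', 'health, env'], depending on the fit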
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
Example #10
def feature_agglomeration(X, args={}):
    """
    使用层次聚类对特征进行聚类,然后进行特征降维
    """
    from sklearn.cluster import FeatureAgglomeration
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
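Usage sketch (illustrative random data; assumes numpy is imported):

import numpy as np

X = np.random.RandomState(0).rand(100, 10)
fam = feature_agglomeration(X, args={"n_clusters": 3})
X_reduced = fam.transform(X)
print(X_reduced.shape)  # (100, 3)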
Example #11
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward', pooling_func='mean',
                 random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)

            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)

        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"], default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")

        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
Example #12
def untangle(X: Iterable,
             y: Iterable,
             n_clusters: int = None,
             get_connectivity: bool = True,
             compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:

    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
        if n_clusters == 0:
            n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    agglo.fit(X, y)

    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster',
                              'feature']).groupby('cluster').feature)

    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')

    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    pickle.dump(model, open(model_path, 'wb'))

    return FeatureAgglomerationEncoder(model_path=model_path)
Example #14
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal([f"featureagglomeration{i}" for i in range(n_clusters)],
                       names_out)
Example #15
def get_clusters(X: pd.DataFrame, n_clusters: int):
    clt = FeatureAgglomeration(n_clusters=n_clusters)
    clt.fit(X)

    clusters = []

    for i in range(n_clusters):
        clusters.append(X.columns[clt.labels_ == i].tolist())

    return clusters  # type: list[list[str]]
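Usage sketch (toy DataFrame; assumes pandas, numpy and FeatureAgglomeration are imported):

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(30, 5), columns=list("abcde"))
print(get_clusters(df, n_clusters=2))
# e.g. [['a', 'c'], ['b', 'd', 'e']], depending on the fit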
def main():

    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
Example #17
 def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
     X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
     agglo = FeatureAgglomeration(n_clusters=n_clusters)
     if len(X_transformed) > 20000:
         X_agglo = X_transformed.sample(20000)
     else:
         X_agglo = X_transformed.copy()
     agglo.fit(X_agglo)
     vars_clusters = pd.DataFrame(data={'指标英文':X_transformed.columns.tolist(),
                                        'cluster':list(agglo.labels_)})\
                       .sort_values('cluster')
     return vars_clusters, X_transformed
Example #18
 def cont_feature_clusters_sklearn(self, n_clusters = 5):
     """ This uses feature agglomeration from scikit learn and only works for continuous variables
         Eventually expand this to categorical variables using Cramer's V covariance matrix similar to 
         R tool using the iclust package """   
         
     #Import the library
     from sklearn.cluster import FeatureAgglomeration
     
     Cluster = FeatureAgglomeration(n_clusters=n_clusters)
     Cluster.fit(self._dataset.iloc[:,self._cont_index_predictors])
     
     df = pd.DataFrame({'Variable':self._dataset.columns[self._cont_index_predictors], 'Cluster':Cluster.labels_})
     
     return df.sort_values(by='Cluster')
Example #19
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.
    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering.
      The purpose is to have different labels in all the parcellations that
      can be built by multiple calls to the current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondence between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)
    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
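A minimal call sketch (random data on a 10x10 grid; assumes numpy and grid_to_graph are available):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
all_subjects_data = rng.randn(30, 100)
connectivity = grid_to_graph(10, 10)
parcelled, labels = _ward_fit_transform(all_subjects_data,
                                        fit_samples_indices=np.arange(15),
                                        connectivity=connectivity,
                                        n_parcels=5, offset_labels=0)
assert parcelled.shape == (30, 5) and labels.shape == (100,)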
Example #20
def _ward_fit_transform(all_subjects_data, fit_samples_indices, connectivity,
                        n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.
    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering.
      The purpose is to have different labels in all the parcellations that
      can be built by multiple calls to the current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondence between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)
    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
Example #21
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.05,
                                       lowercase=True)
    #builds a tf-idf matrix for the sentences
    tfidf_matrix_1 = tfidf_vectorizer.fit_transform(sentences)
    tfidf_matrix = tfidf_matrix_1.todense()
    kmeans = FeatureAgglomeration(n_clusters=nb_of_clusters)
    kmeans.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(kmeans.labels_):
        clusters[label].append(i)
    return dict(clusters)
Example #22
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    fmri_masked : array_like
         A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints:
         the functional dataset that needs to be reduced
    mask_img : a nibabel image object of the mask
    mask_np : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to
    """

    ## Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    ward = []

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    #import pdb;pdb.set_trace()
    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    #print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))

    labels = ward.labels_

    #print ('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    #print('Data compression took ', (time.time()- datacompressiontime), ' seconds')
    return {'data': data_reduced, 'labels': labels}
Example #23
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    # TODO @AKI update doc
    """
    Perform...
    
    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
           A matrix of shape (`V`, `N`) with `V` voxels `N` timepoints
           The functional dataset that needs to be reduced
    mask_img : an nibabel img object of the mask
    mask_np : a numpy array of the mask
    compression_dim : integer
        The number of elements that the data should be reduced to

    Returns
    -------
    A dictionary with the fitted compressor ('compressor'), the reduced
    data ('compressed') and the cluster labels ('labels').

    """

    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration

    # Perform Ward clustering
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    ward = FeatureAgglomeration(n_clusters=compression_dim,
                                connectivity=connectivity,
                                linkage='ward')

    ward.fit(fmri_masked)

    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)

    return {
        'compressor': ward,
        'compressed': data_reduced,
        'labels': labels,
    }
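A minimal call sketch for this version (mask_img is not actually used here, so None is passed); the 4x4x4 mask and random time series are illustrative:

import numpy as np

rng = np.random.RandomState(0)
mask_np = np.ones((4, 4, 4), dtype=bool)   # 64 voxels
fmri_masked = rng.randn(20, 64)            # 20 samples x 64 voxels (columns must match the graph nodes)
out = data_compression(fmri_masked, mask_img=None, mask_np=mask_np,
                       compression_dim=10)
print(out['compressed'].shape)             # (20, 10)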
Example #24
 def agglomeration(self, n_clusters):
     cv = KFold(len(self.y_train), 5)
     agglo = FeatureAgglomeration(n_clusters=n_clusters)
     agglo.fit(self.X_train)
     grouped_names = list([])
     grouped_idx = list([])
     col_idx = np.arange(len(self.cols))
     for label in np.unique(agglo.labels_):
         group = [
             name for (name, group_id) in zip(self.cols, agglo.labels_)
             if group_id == label
         ]
         group_idx = [
             idx for (idx, group_id) in zip(col_idx, agglo.labels_)
             if group_id == label
         ]
         grouped_names.append(group)
         grouped_idx.append(group_idx)
     return grouped_names, grouped_idx
Example #25
def test_ward_agglomeration():
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == 5

    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])
Example #26
class Hierarchi(object):
    """All hierarchical algorithms are implemened here."""

    def __init__(self, method, data, n_clusters=2, random_state=0):
        """
        Initialize all the parameters.
        method: Name of the algorithms (lower case joined by underscore)
        data: Data (2D Matrix)
        n_clusters: Number of clusters
        random_state: Random initial state
        """
        self.method = method
        self.data = data
        self.n_clusters = n_clusters
        np.random.seed(random_state)

        self.affinity = "euclidean"
        self.linkage = "ward"
        self.distance_threshold = None
        return

    def setup(self, keywords={}):
        """
        Setup the algorithms
        """
        for p in keywords.keys():
            setattr(self, p, keywords[p])

        if self.method == "agglomerative": self.obj = AgglomerativeClustering(n_clusters=self.n_clusters, linkage=self.linkage, 
                affinity=self.affinity)
        if self.method == "feature": self.obj = FeatureAgglomeration(n_clusters=self.n_clusters, linkage=self.linkage,
                                affinity=self.affinity, distance_threshold=self.distance_threshold)
        return

    def run(self):
        """
        Run the models
        """
        if self.method == "agglomerative": self.obj.fit(self.data)
        if self.method == "feature": self.obj.fit(self.data)
        return
Example #27
def run_evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="Image folder.", default="faces")
    parser.add_argument('--output',
                        help="Statistics output folder.",
                        default="stats")
    args = parser.parse_args()

    # load embeddings
    emb_1 = load_embeddings('embeddings_matthias.pkl')
    emb_2 = load_embeddings('embeddings_laia.pkl')
    emb_3 = load_embeddings('embeddings_elias.pkl')
    emb_lfw = load_embeddings('embeddings_lfw.pkl')

    if emb_1 is None or emb_2 is None:
        print "--- embeddings could not be loaded. Aborting..."
        return

    # ------------------- EVALUATION ON ORIGINAL VECTORS

    ph = PlotHandler()

    # ==== 1. PCA DIMENSION REDUCTION

    # ph.PlotVarianceContribution(emb_lfw)
    # # reduce dimensionality
    # basis, mean = ExtractSubspace(emb_lfw, 0.999)
    # # dump_to_hd("lfw_99.9_subspace.pkl", (basis, mean))
    #
    # reduced_data = ProjectOntoSubspace(emb_lfw, mean, basis)
    # ph.SetTitle("Component Variance Contribution on Subspace")
    # ph.PlotVarianceContribution(reduced_data)
    # ph.Show()

    # ==== 1. FEATURE AGGLOMERATION

    agglo = FeatureAgglomeration(n_clusters=20)
    agglo.fit(emb_lfw)
    X_reduced = agglo.transform(emb_1)

    print(np.shape(X_reduced))
Example #28
def perform_feature_agglomeration(train_X, train_Y, test_X, test_Y):
    n_clusters = [32]
    fagg_model_accuracies = pd.DataFrame()
    for n_cluster in n_clusters:
        agglo = FeatureAgglomeration(connectivity=None, n_clusters=n_cluster)
        agglo.fit(train_X)
        train_X_reduced = agglo.transform(train_X)
        test_X_reduced = agglo.transform(test_X)

        svc_acc_val = perform_svc(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        rfc_acc_val = perform_rfc(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        knn_acc_val = perform_knn(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        lr_acc_val = perform_linear_regression(train_X_reduced, train_Y,
                                               test_X_reduced, test_Y)

        lc_acc_val = perform_linear_lasso(train_X_reduced, train_Y,
                                          test_X_reduced, test_Y)

        rr_acc_val = perform_ridge_regression(train_X_reduced, train_Y,
                                              test_X_reduced, test_Y)

        enet_acc_val = perform_elastinet_regression(train_X_reduced, train_Y,
                                                    test_X_reduced, test_Y)

        fagg_model_accuracies = fagg_model_accuracies.append([
            svc_acc_val, rfc_acc_val, knn_acc_val, lr_acc_val, lc_acc_val,
            rr_acc_val, enet_acc_val
        ])
        cols = list(fagg_model_accuracies.columns.values)
        cols = cols[-1:] + cols[:-1]
        fagg_model_accuracies = fagg_model_accuracies[cols]
        fagg_model_accuracies = fagg_model_accuracies.sort_values(
            by='r2_score')
    return fagg_model_accuracies
Example #29
def construction_pathway_clusters_groups(data_path=data_tn_new_label_unbalanced_cpg_rna_rna_iso_mirna,
                                         model_loaded=False,
                                         return_views='all',
                                         output_file_name='pathway_file_clusters_genes',
                                         model_agglomeration_file_name='feature_agglomeration_model.pck'):
    """
       Construct group based on clustering
       Args:
           data_path: str, data path
           model_loaded: bool, if true, load the model_agglomeration_file_name, if false re-do the fit
           return_views: str, correct view for the group
           output_file_name: str, output file name
           model_agglomeration_file_name: path to .pck file if we already run the feature agglomeration fit
       Returns:
           output_file_name
       """
    from sklearn.cluster import FeatureAgglomeration
    agglo = FeatureAgglomeration(n_clusters=1000)
    x, y, features_names, _ = load_data(data=data_path, return_views=return_views)
    output_file_name = output_file_name + '_{}.tsv'.format(return_views)
    if model_loaded:
        assert model_agglomeration_file_name != '', 'You should give the model agglomeration name file'
        f = open(model_agglomeration_file_name, 'rb')
        agglo = pickle.load(f)
        groups_and_features = list(zip(features_names, agglo.labels_))
        with open(output_file_name, 'w') as f:
            f.write('G\tIDS\n')
            for zip_el in groups_and_features:
                f.write('{}\t{}\n'.format(zip_el[1], zip_el[0]))
    else:
        f = open(model_agglomeration_file_name, 'wb')
        agglo.fit(x)
        pickle.dump(agglo, f)
        groups_and_features = list(zip(features_names, agglo.labels_))
        with open(output_file_name, 'w') as f:
            f.write('G\tIDS\n')
            for zip_el in groups_and_features:
                f.write('{}\t{}\n'.format(zip_el[1], zip_el[0]))
Example #30
def FeatureClusters(p_data,
                    p_predictors,
                    p_numeric_cat_index=np.array([]),
                    p_n_clusters=5):
    """ This uses feature agglomeration from scikit learn and only works for continuous variables
        Eventually expand this to categorical variables using Cramer's V covariance matrix similar to 
        R tool using the iclust package """

    # Find clusters of correlated (continuous) variables
    cont_index = np.intersect1d(p_predictors,
                                ContCatSplit(p_data, p_numeric_cat_index)[0])

    #Import the library
    from sklearn.cluster import FeatureAgglomeration

    Cluster = FeatureAgglomeration(n_clusters=p_n_clusters)
    Cluster.fit(p_data.iloc[:, cont_index])

    df = pd.DataFrame({
        'Variable': p_data.columns[cont_index],
        'Cluster': Cluster.labels_
    })

    return df.sort_values(by='Cluster')
Example #31
def test_random_feature_agglomeration_encoder_load():
    train_data = np.random.rand(2000, input_dim)

    from sklearn.cluster import FeatureAgglomeration
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    filename = 'feature_agglomeration_model.model'
    pickle.dump(model.fit(train_data), open(filename, 'wb'))

    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)

    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
Example #32
def projection(X, k, connectivity, ward=True):
    """
    Take the data, and returns a matrix, to reduce the dimension
    Returns, invP, (P.(X.T)).T and the underlying labels
    """
    n, p = X.shape
    if ward:
        clustering = FeatureAgglomeration(
            linkage='ward', n_clusters=k, connectivity=connectivity)
        labels = clustering.fit(X).labels_
    else:
        from fast_cluster import ReNN, recursive_nn
        _, labels = recursive_nn(connectivity, X, n_clusters=k)

    #
    P, P_inv = pp_inv(labels)
    X_proj = P.dot(X.T).T
    # should be done through clustering.transform, but there is an issue
    # with the normalization
    # X_proj = clustering.transform(X)
    return P_inv, X_proj, labels
# Transpose and scale parameters
featListOriginal[[0,1,5,7,9,10],:] = featListOriginal[[0,1,5,7,9,10],:]*0.8
featListOriginal[[2,3,4],:] = featListOriginal[[2,3,4],:]*0.8*0.2
featListOriginal[6,:] = featListOriginal[6,:]*0.8*0.8
featListOriginal = NP.transpose(featListOriginal)

## STANDARDIZE FEATURES ###################################################

# Don't standardize the centroids
for k in range(2,numFeat):
	featList[k] = (featList[k] - NP.mean(featList[k]))/NP.sqrt(NP.var(featList[k]))

# Transpose the feature list to use in clustering
featList = NP.transpose(featList)
feat_aggl = FeatureAgglomeration(2)
feat_aggl.fit(featList[:,2:])

## AGGLOMERATIVE CLUSTERING ###############################################

aggl_all = AgglomerativeClustering(2)
X_All = featList[:,2:]
y2 = aggl_all.fit_predict(X_All)

## PCA ###############################################################

pca_model = PCA(2)
X_PCA = pca_model.fit_transform(X_All)
print(pca_model.explained_variance_ratio_)

## SPLIT INTO numBINS ###############################################
percentiles = NP.floor(NP.linspace(0,100,numBins+1))
	def FeatureAgglomeration(self, clist, numClusters=2):
		FEATAGGL = FeatureAgglomeration(numClusters)
		FEATAGGL.fit(self.featList[:,clist])
		self.featureTree = FEATAGGL.children_	
		self.featureLabels = FEATAGGL.labels_
		self.featureCList = clist
    
def find_cluster(childrens, start, height, sample_size):
	res = start
	for i in range(height - 1):
		res = find_feature(childrens, res + sample_size)
	cluster = rec_cluster(childrens, childrens[res], sample_size)
	cluster.sort()
	return cluster
	
def find_feature_cluster(children, feature, height, sample_size):
	return find_cluster(children, find_feature(children, feature), height, sample_size)
		


BENCH_DATA = genfromtxt('Sequential_Application_SATUNSAT_track_wo_names.csv', delimiter=',')
#BENCH_DATA = BENCH_DATA.transpose()
print(BENCH_DATA.shape)
#print(np.isnan(BENCH_DATA).any())

ward = FeatureAgglomeration(linkage='average')

#print(ward.fit_predict(BENCH_DATA))
ward.fit(BENCH_DATA)
#print(ward.children_)
#print(find_feature_cluster(ward.children_, 0, 2, 300))

plt.title('SAT Feature_Agglomeration')
plot_dendrogram(ward, leaf_font_size = 12)
#plt.savefig('SAT_Feature_Agglomeration.png')
plt.show()
Example #36
out_dir='/Users/aki.nikolaidis/PyBASC_outputs/Self_Sim_WWS/dim_500_correlation_2_clusters_100_IndBS_1_blockcorrelation'
ismdir=out_dir + '/workflow_output/basc_workflow_runner/basc/individual_stability_matrices/mapflow/'
os.chdir(ismdir)
subdirs_all = [x[1] for x in os.walk(ismdir)]                                                                            
subdirs=subdirs_all[0]
n_clusters=2
    #roi_mask_nparray = nb.load(roi_mask_file).get_data().astype('float32').astype('bool')
ward = FeatureAgglomeration(n_clusters=n_clusters, affinity='euclidean', linkage='ward')    


for subdir in subdirs:
    os.chdir(ismdir + subdir)
    temp_ism_file = os.path.join(os.getcwd(), 'individual_stability_matrix.npy')
    temp_ism=np.load(temp_ism_file)
    print("Calculating Hierarchical Clustering")
    ward.fit(temp_ism)
    y_pred = ward.labels_.astype(int)
    #APPLY CLUSTERING
    all_ind_clusterlabels.append(y_pred)
    
    

#for ism in ismlist:
    #temp=np.load(ism)
    #all_ind_clusterlabels.append(temp)
    

score_similarities= np.zeros((len(all_ind_clusterlabels),len(all_ind_clusterlabels)))

for i in range(len(all_ind_clusterlabels)):
    for j in range(len(all_ind_clusterlabels)):
x_columns = X_copy.columns
X_copy = pd.DataFrame(poly.fit_transform(X_copy))
X_copy.columns = poly.get_feature_names(x_columns)

# add the term-document matrix to X
X_copy = pd.concat([X_copy, matrix], axis=1)

# drop any constant columns in X
X_copy = X_copy.loc[:, (X_copy != X_copy.iloc[0]).any()]

# standardize the data to take on values between 0 and 1
X = ((X_copy - X_copy.min()) / (X_copy.max() - X_copy.min())).copy()

# build the feature selection model
num = 222
hclust = FeatureAgglomeration(n_clusters=num,
                              linkage="ward",
                              distance_threshold=None)
hclust.fit(X)

# collect the features to keep
clusters = hclust.labels_
keep = []
for i in range(num):
    keep.append(np.where(clusters == i)[0][0])
X = X_copy.iloc[:, keep]

# export the data
X.to_csv("X titanic.csv", index=False)
Y.to_csv("Y titanic.csv", index=False)
Example #38
 def fit(self, data, n_clusters, method):
     data = np.array(data)
     data = preprocessing.MinMaxScaler().fit_transform(data)
     model = FeatureAgglomeration(n_clusters=n_clusters, linkage=method)
     clustering = model.fit(data)
     return clustering
##################################################################
# Then we use FeatureAgglomeration from scikit-learn. Indeed, the voxels
# are the features of the data matrix.
#
# In addition, we use caching. As a result, the clustering doesn't have
# to be recomputed later.

# Computing the ward for the first time, this is long...
from sklearn.cluster import FeatureAgglomeration
# If you have scikit-learn older than 0.14, you need to import
# WardAgglomeration instead of FeatureAgglomeration
import time
start = time.time()
ward = FeatureAgglomeration(n_clusters=1000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 1000 clusters: %.2fs" % (time.time() - start))

# Compute the ward with more clusters, should be faster as we are using
# the caching mechanism
start = time.time()
ward = FeatureAgglomeration(n_clusters=2000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 2000 clusters: %.2fs" % (time.time() - start))

##################################################################
# Visualize results
# ------------------
#
# First we display the labels of the clustering in the brain.