Example #1
def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, np.ravel(y))
    reduced_X = X_reduction.transform(X)

    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)

    # fit our data
    res = mod.fit(reduced_X, np.ravel(y))

    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = np.sqrt(metrics.mean_squared_error(yt, yp))
    # save our fitted regressor (the feature agglomerator is not pickled here)
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)

    return r2, rmse
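
To also persist the FeatureAgglomeration reducer fitted above, so that later predictions can reuse the same feature space, one option is to bundle it with the regressor in a single pickle. A minimal sketch (the helper names and bundle file name are hypothetical, not part of the original example):

import pickle

def save_rf_bundle(reducer, regressor, path='RFR_trained_bundle.pickle'):
    # persist both the fitted FeatureAgglomeration and the fitted RandomForestRegressor
    with open(path, 'wb') as output:
        pickle.dump({'reducer': reducer, 'regressor': regressor},
                    output, pickle.HIGHEST_PROTOCOL)

def load_rf_bundle(path='RFR_trained_bundle.pickle'):
    # returns (reducer, regressor) ready for transform() / predict()
    with open(path, 'rb') as bundle_file:
        bundle = pickle.load(bundle_file)
    return bundle['reducer'], bundle['regressor']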
Example #2
    def test_feature_agglomeration(self, n, data, corpus, docvecs):
        print("Using feature agglomeration to reduce the matrix' dimensionality...")
        # "precomputed" is omitted: fit_transform is called on the raw data, not a distance matrix
        affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]
        linkages = ["ward", "complete", "average"]
        agglos = []

        for linkage in linkages:
            if linkage == "ward":
                agglos.append(FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage=linkage))
            else:
                for affinity in affinities:
                    agglos.append(FeatureAgglomeration(n_clusters=n, affinity=affinity, linkage=linkage))

        for agglo in agglos:
            print(agglo.get_params())
            reduced_vectors = agglo.fit_transform(data)

            clusters_kmeans = self.cluster_kmeans(docvecs, reduced_vectors=reduced_vectors, feature_names=corpus)
            labels_db = self.cluster_dbscan(reduced_vectors)
            #labels_hdb = self.cluster_hdbscan(reduced_vectors)
            clusters_db = self.get_clusters(corpus, labels_db)
            #clusters_hdb = self.get_clusters(corpus, labels_hdb)
        #agglo = FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage="ward")
        #return agglo.fit_transform(data)
        return
Example #3
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    numdf = df[numeric_cols]
    r_in_x = numdf.corr()
    r_in_x = abs(r_in_x)
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        # push the diagonal far away so a feature is never merged with itself
        distance_in_x.iloc[i, i] = 10 ** 10


    cpdist = distance_in_x.copy()

    cpdist = cpdist.fillna(cpdist.max().max())

    from sklearn.cluster import FeatureAgglomeration

    corrcoefmin = corrCoefThres
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrcoefmin)
    fa.fit(cpdist)

    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})

    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
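
A minimal usage sketch for FeatureSelection on a toy DataFrame with two strongly correlated columns (hypothetical data; assumes a scikit-learn version that still accepts the affinity keyword used above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame({"a": rng.randn(200)})
demo["b"] = demo["a"] * 0.9 + rng.randn(200) * 0.1   # highly correlated with "a"
demo["c"] = rng.randn(200)                           # independent column
selected = FeatureSelection(demo, ["a", "b", "c"], corrCoefThres=0.9)
print(selected)  # one representative per correlated group, e.g. ['a' 'c']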
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
Example #6
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = list(feature_groups_map.values())

    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        members = [key for key, group in feature_groups_map.items()
                   if group == feature_group_no]
        group_labels.append(", ".join(members))

    voters_agglomerated = pd.DataFrame(condensed,
                                       columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(round)
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
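
A quick usage sketch for this helper on hypothetical survey-style data (assumes pandas and FeatureAgglomeration are imported as in the example above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
voters = pd.DataFrame(rng.rand(100, 6),
                      columns=["q{}".format(i) for i in range(6)])
condensed = feature_agglomeration(voters, n=3, rounding=True)
print(condensed.columns.tolist())  # three comma-separated hybrid-feature labels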
    def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
        compressions = []

        # Finding features
        diffnet = DiffNet(self.db, db_path=self.db_path)
        diffnet.restore(diffnet_path)
        print("Calculating features for", len(cluster_images), "images")
        for img in cluster_images:
            print("Finding features for:", img)
            one_hot = diffnet.feedforward(img, cluster_db_path)
            output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
            compressions.append(output[0])

        # Clustering
        print("Performing clustering...")
        compressions = np.array(compressions)
        fa = FeatureAgglomeration(n_clusters=30)
        X_clusters = fa.fit_transform(compressions)

        print("Collecting data...")
        csv_dict_arr = []
        for i, img in enumerate(cluster_images):
            csv_dict_arr.append({'1.img': img, '2.class': np.argmax(X_clusters[i]), '3.features': compressions[i]})

        # Saving
        if save_csv:
            print("Saving data to csv...")
            keys = load_label_list(csv_dict_arr[0])
            with open('cluster_result.csv', 'w') as output_file:
                dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
                dict_writer.writeheader()
                dict_writer.writerows(csv_dict_arr)

        return csv_dict_arr
def _feature_agglomeration_fit_method(data, n_parcels, connectivity, linkage):
    """Feature Agglomeration algorithm to fit on the data.

    Parameters
    ----------
    data : array_like, shape=(n_samples, n_voxels)
        Masked subjects data

    n_parcels : int
        Number of parcels to parcellate.

    connectivity : ndarray
        Connectivity matrix
        Defines for each feature the neighbouring features following a given
        structure of the data.

    linkage : str
        Which linkage criterion to use:
        'ward', 'complete', or 'average'.

    Returns
    -------
    labels : ndarray
        Labels to the data
    """
    ward = FeatureAgglomeration(n_clusters=n_parcels, connectivity=connectivity,
                                linkage=linkage)
    ward.fit(data)

    return ward.labels_
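
A small smoke test for this helper (assuming scikit-learn's grid_to_graph for the connectivity; the toy shapes are hypothetical):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
data = rng.randn(20, 25)                  # 20 "subjects", a 5x5 voxel grid flattened to 25 features
connectivity = grid_to_graph(n_x=5, n_y=5)
labels = _feature_agglomeration_fit_method(data, n_parcels=4,
                                           connectivity=connectivity,
                                           linkage='ward')
print(labels.shape)                       # (25,): one parcel label per voxel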
Example #9
def TestSGDRegression(df1, n_clusters=50):  # n_clusters must match the training-time reduction

    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'),
                data=df1,
                return_type='dataframe')

    # load our trained regressor
    with open('SGD_trained_model.pickle', 'rb') as model_file:
        res = pickle.load(model_file)

    # employ clustering to reduce our dimensionality
    # (n_clusters should match the value used when the model was trained)
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # predict the interest rates
    yp = res.predict(std_X)

    return yp
def feature_agglomeration(X, args={}):
    """
    使用层次聚类对特征进行聚类,然后进行特征降维
    """
    from sklearn.cluster import FeatureAgglomeration
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
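
A usage sketch for this wrapper on random data (hypothetical shapes):

import numpy as np

X = np.random.RandomState(0).randn(100, 10)
fam = feature_agglomeration(X, args={"n_clusters": 3})
X_reduced = fam.transform(X)
print(X_reduced.shape)  # (100, 3)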
Example #11
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward', pooling_func='mean',
                 random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)

            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)

        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"], default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")

        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
Example #12
def untangle(X: Iterable,
             y: Iterable,
             n_clusters: int = None,
             get_connectivity: bool = True,
             compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:

    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
        if n_clusters == 0:
            n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    agglo.fit(X, y)

    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster',
                              'feature']).groupby('cluster').feature)

    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
Example #13
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')

    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return FeatureAgglomerationEncoder(model_path=model_path)
Example #15
    def do_feature_agglomeration(self, data):
        print("Using feature agglomeration to reduce the matrix' dimensionality...")
        if self.k:
            n = self.k
        else:
            n = 20

        agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
        return agglo.fit_transform(data)
    def token_cluster(self, n_clusters=300):

        from scipy import sparse
        from sklearn.cluster import FeatureAgglomeration

        FA = FeatureAgglomeration(n_clusters=n_clusters)

        self.bow_corpus = FA.fit_transform(self.bow_corpus)
        self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
Example #17
def get_clusters(X: pd.DataFrame, n_clusters: int):
    clt = FeatureAgglomeration(n_clusters=n_clusters)
    clt.fit(X)

    clusters = []

    for i in range(n_clusters):
        clusters.append(X.columns[clt.labels_ == i].tolist())

    return clusters  # type: list[list[str]]
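
A usage sketch on a hypothetical DataFrame with two near-duplicate columns (assumes pandas and FeatureAgglomeration are imported as above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 5), columns=list("abcde"))
df["b"] = df["a"] + rng.randn(100) * 0.01       # near-duplicate of column "a"
print(get_clusters(df, n_clusters=3))           # column-name groups; "a" and "b" should land together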
def dim_reduction_FA(data, distance_threshold=0.45):
    """
    Params:
        data: ndarray of shape (n_samples, n_features)
        distance_threshold: Optimal threshold value for similarity measure
    Returns: (reducedDimData, nReducedComponents)
    """
    agglo = FeatureAgglomeration(n_clusters=None, distance_threshold=distance_threshold)
    reducedDimData = agglo.fit_transform(data)
    return reducedDimData, agglo.n_clusters_
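
A usage sketch on random data (hypothetical shapes and threshold):

import numpy as np

data = np.random.RandomState(0).randn(200, 8)
reduced, n_components = dim_reduction_FA(data, distance_threshold=0.45)
print(reduced.shape, n_components)  # reduced data and the number of clusters actually formed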
def apply_feature_agglomeration(table, features, label, n_components):
    from sklearn.cluster import FeatureAgglomeration
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)

    fa = FeatureAgglomeration(n_clusters=n_components, linkage='ward')
    pc = fa.fit_transform(x)

    return feature_file_processor.generate_data_frame(pc, table[[label]])
Example #20
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal([f"featureagglomeration{i}" for i in range(n_clusters)],
                       names_out)
def main():

    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
Example #22
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(
        regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop(
        "ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # FAG (FeatureAgglomeration)
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return (dt_all)
Example #23
    def setup(self, keywords={}):
        """
        Setup the algorithms
        """
        for p in keywords.keys():
            setattr(self, p, keywords[p])

        if self.method == "agglomerative":
            self.obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                               linkage=self.linkage,
                                               affinity=self.affinity)
        if self.method == "feature":
            self.obj = FeatureAgglomeration(n_clusters=self.n_clusters,
                                            linkage=self.linkage,
                                            affinity=self.affinity,
                                            distance_threshold=self.distance_threshold)
        return
Example #24
    def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
        X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
        agglo = FeatureAgglomeration(n_clusters=n_clusters)
        if len(X_transformed) > 20000:
            X_agglo = X_transformed.sample(20000)
        else:
            X_agglo = X_transformed.copy()
        agglo.fit(X_agglo)
        # column '指标英文' = "variable name (English)"
        vars_clusters = pd.DataFrame(data={'指标英文': X_transformed.columns.tolist(),
                                           'cluster': list(agglo.labels_)})\
                          .sort_values('cluster')
        return vars_clusters, X_transformed
Example #25
    def cont_feature_clusters_sklearn(self, n_clusters=5):
        """ This uses feature agglomeration from scikit-learn and only works for continuous variables.
            Eventually expand this to categorical variables using Cramer's V covariance matrix, similar to
            the R iclust package """

        # Import the library
        from sklearn.cluster import FeatureAgglomeration

        Cluster = FeatureAgglomeration(n_clusters=n_clusters)
        Cluster.fit(self._dataset.iloc[:, self._cont_index_predictors])

        df = pd.DataFrame({'Variable': self._dataset.columns[self._cont_index_predictors],
                           'Cluster': Cluster.labels_})

        return df.sort_values(by='Cluster')
Example #26
def fa_dim_red(x_train_scaled, dataset_name, features_num = 2):
    z=0
    losses = []
    for k in range(1, x_train_scaled.shape[1]+1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)
            
    np_feature_losses_percent = np.multiply(100, losses/np.sum(losses))
    print('num of clusters < 10% loss')
    for i in range(len(np_feature_losses_percent)):
        z=z+np_feature_losses_percent[i]
        if z>90:
            print(i+1)
            break
    print(np_feature_losses_percent)
    plt.bar(list(range(1,len(np_feature_losses_percent)+1)),np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name))+' fa analysis.png')
    plt.show()

    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result,x_projected_fa
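
A usage sketch for fa_dim_red on standardized random data (hypothetical; note the function saves a '<dataset_name> fa analysis.png' plot and calls plt.show(), and it assumes numpy, matplotlib and FeatureAgglomeration are already imported at module level):

import numpy as np

x_demo = np.random.RandomState(0).randn(150, 6)
fa_result, x_projected = fa_dim_red(x_demo, dataset_name="demo", features_num=2)
print(fa_result.shape)  # (150, 2)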
Example #27
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.
    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering.
      The purpose is to have different labels in all the parcellations that
      can be built by multiple calls to the current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondance between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)
    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
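
A small smoke test for this function on a toy grid (hypothetical shapes; uses scikit-learn's grid_to_graph to build the connectivity):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
all_subjects_data = rng.randn(30, 36)          # 30 subjects, a 6x6 voxel grid
connectivity = grid_to_graph(6, 6)
parcelled, labels = _ward_fit_transform(all_subjects_data,
                                        fit_samples_indices=np.arange(15),
                                        connectivity=connectivity,
                                        n_parcels=5,
                                        offset_labels=0)
print(parcelled.shape, np.unique(labels).size)  # (30, 5) 5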
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean),
                              Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
Example #30
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.05,
                                       lowercase=True)
    #builds a tf-idf matrix for the sentences
    tfidf_matrix_1 = tfidf_vectorizer.fit_transform(sentences)
    tfidf_matrix = tfidf_matrix_1.toarray()
    # FeatureAgglomeration groups the columns (tf-idf terms) of the matrix
    agglo = FeatureAgglomeration(n_clusters=nb_of_clusters)
    agglo.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(agglo.labels_):
        clusters[label].append(i)
    return dict(clusters)
Example #31
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    data : array_like
         A matrix of shape (`V`, `N`) with `V` voxels `N` timepoints
         The functional dataset that needs to be reduced
    mask : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to
        
    """

    ## Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    ward = []

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    #import pdb;pdb.set_trace()
    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    #print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))

    labels = ward.labels_

    #print ('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    #print('Data compression took ', (time.time()- datacompressiontime), ' seconds')
    return {'data': data_reduced, 'labels': labels}
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage='foo').fit(X)

    with pytest.raises(ValueError):
        linkage_tree(X, linkage='foo')

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example #33
    def comput_coefs(self, X, y, size):
        cv = KFold(2)  # cross-validation generator for model selection
        ridge = BayesianRidge()
        cachedir = tempfile.mkdtemp()
        mem = Memory(cachedir=cachedir, verbose=1)

        # Ward agglomeration followed by BayesianRidge
        connectivity = grid_to_graph(n_x=size, n_y=size)
        ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                    memory=mem)
        clf = Pipeline([('ward', ward), ('ridge', ridge)])
        # Select the optimal number of parcels with grid search
        clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
        coef_agglomeration_ = coef_.reshape(size, size)

        # Anova univariate feature selection followed by BayesianRidge
        f_regression = mem.cache(feature_selection.f_regression)  # caching function
        anova = feature_selection.SelectPercentile(f_regression)
        clf = Pipeline([('anova', anova), ('ridge', ridge)])
        # Select the optimal percentage of features with grid search
        clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
        coef_selection_ = coef_.reshape(size, size)
        return dict(
            coef_selection_=coef_selection_,
            coef_agglomeration_=coef_agglomeration_,
            cachedir=cachedir
        )
Example #34
def createPipe(embed, classif, nmca, aggregation, nsubs):
    # Dimension Reduction
    n_comp = 20 if nsubs > 70 else 15
    if embed == "pca":
        emb = ('pca', PCA(n_components=n_comp))
    else:
        emb = ('fa', FeatureAgglomeration(n_clusters=n_comp))

    # Classifiers
    neib = int(nmca * nsubs * 0.1) if aggregation == "mega" else int(nsubs *
                                                                     0.1)
    clfs = {
        'svc':
        ('svc', SVC(class_weight="balanced", probability=True, max_iter=1e6)),
        'knn': ('knn', KNeighborsClassifier(n_neighbors=neib)),
        'rfc': ('rfc', RandomForestClassifier(class_weight="balanced")),
        'ada': ('ada', AdaBoostClassifier()),
        'lrc': ('lrc',
                LogisticRegression(class_weight="balanced",
                                   solver='liblinear',
                                   max_iter=1e6))
    }

    pipe = Pipeline(steps=[emb, clfs[classif]])
    return pipe
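
A quick sketch of how this factory might be called (hypothetical arguments; assumes the example's imports such as PCA, FeatureAgglomeration, the classifiers and Pipeline are in scope):

# FeatureAgglomeration + logistic-regression pipeline for 60 subjects
pipe = createPipe(embed="fa", classif="lrc", nmca=1, aggregation="single", nsubs=60)
print(list(pipe.named_steps))  # ['fa', 'lrc']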
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
        assert_equal(len(warning_list), 1)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
Example #37
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    print()

    return W, features
Example #38
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)
# Transpose and scale parameters
featListOriginal[[0,1,5,7,9,10],:] = featListOriginal[[0,1,5,7,9,10],:]*0.8
featListOriginal[[2,3,4],:] = featListOriginal[[2,3,4],:]*0.8*0.2
featListOriginal[6,:] = featListOriginal[6,:]*0.8*0.8
featListOriginal = NP.transpose(featListOriginal)

## STANDARDIZE FEATURES ###################################################

# Don't standardize the centroids
for k in range(2,numFeat):
	featList[k] = (featList[k] - NP.mean(featList[k]))/NP.sqrt(NP.var(featList[k]))

# Transpose the feature list to use in clustering
featList = NP.transpose(featList)
feat_aggl = FeatureAgglomeration(2)
feat_aggl.fit(featList[:,2:])

## AGGLOMERATIVE CLUSTERING ###############################################

aggl_all = AgglomerativeClustering(2)
X_All = featList[:,2:]
y2 = aggl_all.fit_predict(X_All)

## PCA ###############################################################

pca_model = PCA(2)
X_PCA = pca_model.fit_transform(X_All)
print(pca_model.explained_variance_ratio_)

## SPLIT INTO numBINS ###############################################
	def FeatureAgglomeration(self, clist, numClusters=2):
		FEATAGGL = FeatureAgglomeration(numClusters)
		FEATAGGL.fit(self.featList[:,clist])
		self.featureTree = FEATAGGL.children_	
		self.featureLabels = FEATAGGL.labels_
		self.featureCList = clist
Example #41
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import FeatureAgglomeration
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn import metrics

###############################################################################
# Data IO and generation
# import some data to play with
#file = "/home/kbhalla/Desktop/Data/day_samp_new.npy"
file = "/home/rmendoza/Documents/Data/day_samp_new_0604.npy"
with open(file, "rb") as file_in:
        matrix = smio.load_sparse_csr(file_in)

X = matrix[:,:-1]
FA = FeatureAgglomeration(n_clusters=250)
print(np.shape(X))
y = matrix[:,-1]
X = FA.fit_transform(X,y)
n_samples, n_features = X.shape
k = int(0.8*n_samples)
#random_state = np.random.RandomState(0)
#X = np.c_[X, random_state.randn(n_samples, 2*n_features)]
X_test, y_test = X[k:,:], y[k:]
X, y = X[:k, :], y[:k]
sm = SMOTE(ratio=0.95)
X,y = sm.fit_sample(X, y)
print(np.shape(X))
start = time.time()


##################################################################
# Then we use FeatureAgglomeration from scikit-learn. Indeed, the voxels
# are the features of the data matrix.
#
# In addition, we use caching. As a result, the clustering doesn't have
# to be recomputed later.

# Computing the ward for the first time, this is long...
from sklearn.cluster import FeatureAgglomeration
# If you have scikit-learn older than 0.14, you need to import
# WardAgglomeration instead of FeatureAgglomeration
import time
start = time.time()
ward = FeatureAgglomeration(n_clusters=1000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 1000 clusters: %.2fs" % (time.time() - start))

# Compute the ward with more clusters, should be faster as we are using
# the caching mechanism
start = time.time()
ward = FeatureAgglomeration(n_clusters=2000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 2000 clusters: %.2fs" % (time.time() - start))

##################################################################
# Visualize results
# ------------------
#
    
def find_cluster(childrens, start, height, sample_size):
	res = start
	for i in range(height - 1):
		res = find_feature(childrens, res + sample_size)
	cluster = rec_cluster(childrens, childrens[res], sample_size)
	cluster.sort()
	return cluster
	
def find_feature_cluster(children, feature, height, sample_size):
	return find_cluster(children, find_feature(children, feature), height, sample_size)
		


BENCH_DATA = genfromtxt('Sequential_Application_SATUNSAT_track_wo_names.csv', delimiter=',')
#BENCH_DATA = BENCH_DATA.transpose()
print(BENCH_DATA.shape)
#print(np.isnan(BENCH_DATA).any())

ward = FeatureAgglomeration(linkage='average')

#print(ward.fit_predict(BENCH_DATA))
ward.fit(BENCH_DATA)
#print(ward.children_)
#print(find_feature_cluster(ward.children_, 0, 2, 300))

plt.title('SAT Feature_Agglomeration')
plot_dendrogram(ward, leaf_font_size = 12)
#plt.savefig('SAT_Feature_Agglomeration.png')
plt.show()