Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import FeatureAgglomeration


def fa_dim_red(x_train_scaled, dataset_name, features_num=2):
    z = 0
    losses = []
    for k in range(1, x_train_scaled.shape[1] + 1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)

    np_feature_losses_percent = np.multiply(100, np.array(losses) / np.sum(losses))
    print('num of clusters with < 10% loss remaining:')
    for i in range(len(np_feature_losses_percent)):
        z=z+np_feature_losses_percent[i]
        if z>90:
            print(i+1)
            break
    print(np_feature_losses_percent)
    plt.bar(list(range(1,len(np_feature_losses_percent)+1)),np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Number of clusters")
    plt.savefig((str(dataset_name))+' fa analysis.png')
    plt.show()

    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)  # y is not used by FeatureAgglomeration
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result, x_projected_fa
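
A minimal usage sketch for the function above; the iris data and StandardScaler here are illustrative stand-ins for whatever scaled training matrix you pass in:

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

x_train_scaled = StandardScaler().fit_transform(load_iris().data)
fa_result, x_projected_fa = fa_dim_red(x_train_scaled, 'iris', features_num=2)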
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert_array_almost_equal(Xt_mean, [[1 / 3.]])
    assert_array_almost_equal(Xt_median, [[0.]])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
Example #3
    def test_feature_agglomoration(self, n, data, corpus, docvecs):
        print("Using feature agglomeration to reduce the matrix' dimensionality...")
        # "precomputed" is excluded: it expects a distance matrix, not raw data.
        affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]
        linkages = ["ward", "complete", "average"]
        agglos = []

        for linkage in linkages:
            if linkage == "ward":
                agglos.append(FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage=linkage))
            else:
                for affinity in affinities:
                    agglos.append(FeatureAgglomeration(n_clusters=n, affinity=affinity, linkage=linkage))

        for agglo in agglos:
            print(agglo.get_params())
            reduced_vectors = agglo.fit_transform(data)

            clusters_kmeans = self.cluster_kmeans(docvecs, reduced_vectors=reduced_vectors, feature_names=corpus)
            labels_db = self.cluster_dbscan(reduced_vectors)
            #labels_hdb = self.cluster_hdbscan(reduced_vectors)
            clusters_db = self.get_clusters(corpus, labels_db)
            #clusters_hdb = self.get_clusters(corpus, labels_hdb)
        #agglo = FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage="ward")
        #return agglo.fit_transform(data)
        return
Example #4
import pickle

import numpy as np
from patsy import dmatrices
from sklearn import metrics
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import RandomForestRegressor


def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, np.ravel(y))
    reduced_X = X_reduction.transform(X)

    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)

    # fit our data
    res = mod.fit(reduced_X, np.ravel(y))

    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    mae = metrics.mean_absolute_error(yt, yp)
    # save our trained regressor (the feature agglomerator is not persisted here)
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)

    return r2, mae
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
        assert_equal(len(warning_list), 1)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
Example #7
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit,
                  X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", UserWarning)
        # Use the copy argument, to raise a warning
        ward_tree(X, copy=True)
    # We should be getting 1 warnings: for using the copy argument
    assert_equal(len(warning_list), 1)

    # Let's test a hiearchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
import time

from sklearn.cluster import FeatureAgglomeration
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection


def Reduction(X_train, X_test, alg_name):
    train_scores=[]
    test_scores=[]
    times=[]
    print('algorithm:', alg_name)
    for component in range(1, len(X_train[0])+1):
        if component % 10 == 0:
            print(component)
        if alg_name == 'pca':
            alg = PCA(n_components=component, random_state=1)
        elif alg_name == 'ica':
            alg = FastICA(random_state=1, n_components=component)
        elif alg_name == 'rp':
            alg = GaussianRandomProjection(n_components=component, random_state=1)
        elif alg_name == 'fa':
            alg = FeatureAgglomeration(n_clusters=component)
        else:
            break
        
        X_train_reduced=alg.fit_transform(X_train)
        X_test_reduced=alg.transform(X_test)
        
        start_time = time.time()
        train_score, test_score = NN(X_train_reduced, X_test_reduced)
        times.append((time.time() - start_time))
        
        train_scores.append(train_score)
        test_scores.append(test_score)

    return train_scores, test_scores, times
Example #10
import pandas as pd
from sklearn.cluster import FeatureAgglomeration


def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    # Map each original feature to its cluster label.
    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = list(feature_groups_map.values())

    # Build a readable label per cluster by joining its member features;
    # iterate in sorted label order so labels match transform's column order.
    group_labels = []
    for feature_group_no in sorted(set(feature_groups_nos)):
        members = [feature for feature, group_no in feature_groups_map.items()
                   if group_no == feature_group_no]
        group_labels.append(", ".join(members))

    voters_agglomerated = pd.DataFrame(condensed,
                                       columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(round)
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
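
A hedged usage sketch for the helper above; the toy DataFrame of binary survey answers is a made-up stand-in for the real voters data:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy_voters = pd.DataFrame(rng.randint(0, 2, size=(20, 6)),
                          columns=['q1', 'q2', 'q3', 'q4', 'q5', 'q6'])
agglomerated = feature_agglomeration(toy_voters, n=3, rounding=True)
print(agglomerated.columns.tolist())  # three comma-joined group labels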
Example #11
import pickle

from patsy import dmatrix
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler


def TestSGDRegression(df1):

    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'),
                data=df1,
                return_type='dataframe')

    # load our trained model (the agglomerator and scaler are re-fit below)
    with open('SGD_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)

    # employ clustering to reduce our dimensionality; n_clusters is assumed
    # to match the value used at training time
    n_clusters = 50
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # predict the interest rates
    yp = res.predict(std_X)

    return yp
Example #12
    def to_sklearn(self, n_samples: int = 0, n_features: int = 0, **kwargs):
        from sklearn.cluster import FeatureAgglomeration

        if self.pooling_func == "mean":
            pooling_func = np.mean
        elif self.pooling_func == "median":
            pooling_func = np.median
        elif self.pooling_func == "max":
            pooling_func = np.max
        else:
            raise ValueError(f'Unknown pooling function \'{self.pooling_func}\'')

        if self.distance_threshold is not None:
            n_clusters = None
            self.compute_full_tree = True
        else:
            if isinstance(self.n_clusters_factor, int):
                n_clusters = self.n_clusters_factor
            else:
                n_clusters = max(min(resolve_factor(self.n_clusters_factor, n_features, default=2, cs_default=1.),
                                     (n_features - 1)), 2)

        return FeatureAgglomeration(n_clusters=n_clusters,
                                    affinity=self.affinity,
                                    compute_full_tree=self.compute_full_tree,
                                    linkage=self.linkage,
                                    distance_threshold=self.distance_threshold,
                                    pooling_func=pooling_func)
Example #13
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    tformFile = './data/transformData.pkl'
    if bertFeats is None:
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)
    else:
        print('Regenerating transform data...')
        dataMap = {
            'PCA':
            PCA().fit_transform(bertFeats),
            'TSNE':
            TSNE().fit_transform(bertFeats),
            'Agglomeration':
            FeatureAgglomeration().fit_transform(bertFeats),
            'Gaussian Projection':
            random_projection.GaussianRandomProjection(2).fit_transform(
                bertFeats),
            'Sparse Projection':
            random_projection.SparseRandomProjection(2).fit_transform(
                bertFeats)
        }
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)

    for combo in ('speaker', 'sarcasm'), ('sarcasm', 'speaker'):
        for tform in dataMap:
            tfData = dataMap[tform]
            grid = makeDataPlots(tfData, sarcasmDf, *combo, tform)
            if show:
                grid.show()
            title = grid.windowTitle()
            saveGrid(grid, imgDir / f'{title}.jpg')
    def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
        compressions = []

        # Finding features
        diffnet = DiffNet(self.db, db_path=self.db_path)
        diffnet.restore(diffnet_path)
        print("Calculating features for", len(cluster_images), "images")
        for img in cluster_images:
            print("Finding features for:", img)
            one_hot = diffnet.feedforward(img, cluster_db_path)
            output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
            compressions.append(output[0])

        # Clustering
        print("Performing clustering...")
        compressions = np.array(compressions)
        fa = FeatureAgglomeration(n_clusters=30)
        X_clusters = fa.fit_transform(compressions)

        print("Collecting data...")
        csv_dict_arr = []
        for i, img in enumerate(cluster_images):
            csv_dict_arr.append({'1.img': img, '2.class': np.argmax(X_clusters[i]), '3.features': compressions[i]})

        # Saving
        if save_csv:
            print("Saving data to csv...")
            keys = load_label_list(csv_dict_arr[0])
            with open('cluster_result.csv', 'w') as output_file:
                dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
                dict_writer.writeheader()
                dict_writer.writerows(csv_dict_arr)

        return csv_dict_arr
Example #15
    def get_transform(algorithm):
        """
        Defines and returns a feature selection transform object of the designated type.

        Parameters
        ----------
        algorithm : {'pca', 'kpca', 'grp', 'fa', 'k_best'}
            Transform algorithm for which to return an object.

        Returns
        -------
        transform : object
            Instantiated transform object.
        """
        if algorithm == 'pca':
            transform = PCA()
        elif algorithm == 'kpca':
            transform = KernelPCA()
        elif algorithm == 'grp':
            transform = GaussianRandomProjection()
        elif algorithm == 'fa':
            transform = FeatureAgglomeration()
        elif algorithm == 'k_best':
            transform = SelectKBest(mutual_info_regression)
        else:
            raise ValueError(
                'No selection algorithm defined for {0}'.format(algorithm))

        return transform
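
A brief usage sketch, treating get_transform as a standalone helper on synthetic data:

import numpy as np

X = np.random.RandomState(0).rand(100, 8)
fa = get_transform('fa')
X_reduced = fa.fit_transform(X)
print(X_reduced.shape)  # (100, 2) with FeatureAgglomeration's default n_clusters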
import pandas as pd


def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    numdf = df[numeric_cols]
    # Turn absolute correlation into a distance: highly correlated -> close.
    r_in_x = numdf.corr().abs()
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        distance_in_x.iloc[i, i] = 10 ** 10  # was `10 ^ 10`, i.e. XOR, which is 0

    cpdist = distance_in_x.copy()
    cpdist = cpdist.fillna(cpdist.max().max())

    from sklearn.cluster import FeatureAgglomeration

    corrcoefmin = corrCoefThres
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrcoefmin)
    fa.fit(cpdist)

    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})

    # Keep one representative feature per correlation cluster.
    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
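
A usage sketch under the assumption that two of the columns are nearly collinear; names and data are illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
base = rng.randn(200)
df = pd.DataFrame({'a': base,
                   'b': base + 0.05 * rng.randn(200),  # near-duplicate of 'a'
                   'c': rng.randn(200)})
print(FeatureSelection(df, ['a', 'b', 'c']))  # expect one of 'a'/'b', plus 'c'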
Example #17
 def feature_agg(self, Data, size=5):
     clus = FeatureAgglomeration(n_clusters=size).fit(np.array(scale(Data)))
     self.features_clusters = []
     self.features_name = list(Data)
     for i in range(size):
         self.features_clusters.append(
             np.array(self.features_name)[np.where(clus.labels_ == i)])
def createPipe(embed, classif, nmca, aggregation, nsubs):
    # Dimension Reduction
    n_comp = 20 if nsubs > 70 else 15
    if embed == "pca":
        emb = ('pca', PCA(n_components=n_comp))
    else:
        emb = ('fa', FeatureAgglomeration(n_clusters=n_comp))

    # Classifiers
    neib = int(nmca * nsubs * 0.1) if aggregation == "mega" else int(nsubs *
                                                                     0.1)
    clfs = {
        'svc':
        ('svc', SVC(class_weight="balanced", probability=True, max_iter=1_000_000)),
        'knn': ('knn', KNeighborsClassifier(n_neighbors=neib)),
        'rfc': ('rfc', RandomForestClassifier(class_weight="balanced")),
        'ada': ('ada', AdaBoostClassifier()),
        'lrc': ('lrc',
                LogisticRegression(class_weight="balanced",
                                   solver='liblinear',
                                   max_iter=1_000_000))
    }

    pipe = Pipeline(steps=[emb, clfs[classif]])
    return pipe
Example #19
 def build_impl(self):
     newconfig = self.config.copy()
     if newconfig['linkage'] == 'ward':
         newconfig['affinity'] = 'euclidean'
     newconfig['n_clusters'] = newconfig.pop(
         'n_components')  # Replace key name.
     self.model = FeatureAgglomeration(**newconfig)
Example #20
    def comput_coefs(self, X, y, size):
        cv = KFold(2)  # cross-validation generator for model selection
        ridge = BayesianRidge()
        cachedir = tempfile.mkdtemp()
        mem = Memory(location=cachedir, verbose=1)

        # Ward agglomeration followed by BayesianRidge
        connectivity = grid_to_graph(n_x=size, n_y=size)
        ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                    memory=mem)
        clf = Pipeline([('ward', ward), ('ridge', ridge)])
        # Select the optimal number of parcels with grid search
        clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
        coef_agglomeration_ = coef_.reshape(size, size)

        # Anova univariate feature selection followed by BayesianRidge
        f_regression = mem.cache(feature_selection.f_regression)  # caching function
        anova = feature_selection.SelectPercentile(f_regression)
        clf = Pipeline([('anova', anova), ('ridge', ridge)])
        # Select the optimal percentage of features with grid search
        clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
        coef_selection_ = coef_.reshape(size, size)
        return dict(
            coef_selection_=coef_selection_,
            coef_agglomeration_=coef_agglomeration_,
            cachedir=cachedir
        )
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage='foo').fit(X)

    with pytest.raises(ValueError):
        linkage_tree(X, linkage='foo')

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def feature_agglomeration(X, args={}):
    """
    使用层次聚类对特征进行聚类,然后进行特征降维
    """
    from sklearn.cluster import FeatureAgglomeration
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
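
A short usage sketch of this wrapper; the args dict simply mirrors FeatureAgglomeration's constructor arguments:

import numpy as np

X = np.random.RandomState(0).rand(50, 10)
fam = feature_agglomeration(X, args={'n_clusters': 3})
print(fam.transform(X).shape)  # (50, 3)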
Example #23
from typing import Iterable, Union

import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration
from sklearn.utils import Bunch


def untangle(X: Iterable,
             y: Iterable,
             n_clusters: int = None,
             get_connectivity: bool = True,
             compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:

    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    # Coerce inputs up front so X.values below always exists.
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
        if n_clusters == 0:
            n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    agglo.fit(X, y)

    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster',
                              'feature']).groupby('cluster').feature)

    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
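
A hedged usage sketch with get_connectivity=False (nilearn must still be installed, since the function imports it unconditionally); the column names are arbitrary:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(40, 6), columns=[f"f{i}" for i in range(6)])
y = pd.Series(rng.randint(0, 2, 40))
agglo = untangle(X, y, get_connectivity=False)
print(agglo.factor_leaders_)  # one representative feature per cluster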
Example #24
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
    def token_cluster(self, n_clusters=300):

        from scipy import sparse
        from sklearn.cluster import FeatureAgglomeration

        FA = FeatureAgglomeration(n_clusters=n_clusters)

        self.bow_corpus = FA.fit_transform(self.bow_corpus)
        self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')

    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return FeatureAgglomerationEncoder(model_path=model_path)
Example #27
    def do_feature_agglomoration(self, data):
        print("Using feature agglomoration to reduce the matrix' dimensionality...")
        if self.k:
            n = self.k
        else:
            n = 20

        agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
        return agglo.fit_transform(data)
Example #28
def agglo_fn(X):
    import numpy as np
    from sklearn.cluster import FeatureAgglomeration
    # Expect samples in rows and the six features in columns; transpose otherwise.
    if X.shape != (7501, 6):
        X = np.transpose(X)

    agglo = FeatureAgglomeration(n_clusters=1).fit_transform(X)
    return agglo
Example #29
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal([f"featureagglomeration{i}" for i in range(n_clusters)],
                       names_out)
Example #30
 def makePipeline(self, classifier, n_clusters):
     """Makes a pipeline, necessary for adding in unsupervised learning 
         preprocessing step. 
     """
     estimators = [('reduce_dim',
                    FeatureAgglomeration(n_clusters=n_clusters,
                                         affinity='euclidean')),
                   ('main_classifier', classifier)]
     clf = Pipeline(estimators)
     return clf
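
A minimal usage sketch; obj stands in for an instance of the surrounding class, and the classifier choice is illustrative:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
clf = obj.makePipeline(LogisticRegression(max_iter=1000), n_clusters=5)
clf.fit(X, y)
print(clf.score(X, y))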