Example #1
def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, np.ravel(y))
    reduced_X = X_reduction.transform(X)

    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)

    # fit our data
    res = mod.fit(reduced_X, np.ravel(y))

    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = np.sqrt(metrics.mean_squared_error(yt, yp))
    # save our fitted regressor (the feature agglomerator is not pickled here)
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)

    return r2, rmse
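
To also persist the FeatureAgglomeration reducer fitted above, so that later predictions can reuse the same feature space, one option is to bundle it with the regressor in a single pickle. A minimal sketch (the helper names and bundle file name are hypothetical, not part of the original example):

import pickle

def save_rf_bundle(reducer, regressor, path='RFR_trained_bundle.pickle'):
    # persist both the fitted FeatureAgglomeration and the fitted RandomForestRegressor
    with open(path, 'wb') as output:
        pickle.dump({'reducer': reducer, 'regressor': regressor},
                    output, pickle.HIGHEST_PROTOCOL)

def load_rf_bundle(path='RFR_trained_bundle.pickle'):
    # returns (reducer, regressor) ready for transform() / predict()
    with open(path, 'rb') as bundle_file:
        bundle = pickle.load(bundle_file)
    return bundle['reducer'], bundle['regressor']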
Example #2
    def test_feature_agglomeration(self, n, data, corpus, docvecs):
        print("Using feature agglomeration to reduce the matrix' dimensionality...")
        # "precomputed" is omitted: fit_transform is called on the raw data, not a distance matrix
        affinities = ["euclidean", "l1", "l2", "manhattan", "cosine"]
        linkages = ["ward", "complete", "average"]
        agglos = []

        for linkage in linkages:
            if linkage == "ward":
                agglos.append(FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage=linkage))
            else:
                for affinity in affinities:
                    agglos.append(FeatureAgglomeration(n_clusters=n, affinity=affinity, linkage=linkage))

        for agglo in agglos:
            print(agglo.get_params())
            reduced_vectors = agglo.fit_transform(data)

            clusters_kmeans = self.cluster_kmeans(docvecs, reduced_vectors=reduced_vectors, feature_names=corpus)
            labels_db = self.cluster_dbscan(reduced_vectors)
            #labels_hdb = self.cluster_hdbscan(reduced_vectors)
            clusters_db = self.get_clusters(corpus, labels_db)
            #clusters_hdb = self.get_clusters(corpus, labels_hdb)
        #agglo = FeatureAgglomeration(n_clusters=n, affinity="euclidean", linkage="ward")
        #return agglo.fit_transform(data)
        return
Example #3
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    numdf = df[numeric_cols]
    r_in_x = numdf.corr()
    r_in_x = abs(r_in_x)
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        # push the diagonal far away so a feature is never merged with itself
        distance_in_x.iloc[i, i] = 10 ** 10


    cpdist = distance_in_x.copy()

    cpdist = cpdist.fillna(cpdist.max().max())

    from sklearn.cluster import FeatureAgglomeration

    corrcoefmin = corrCoefThres
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrcoefmin)
    fa.fit(cpdist)

    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})

    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
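
A minimal usage sketch for FeatureSelection on a toy DataFrame with two strongly correlated columns (hypothetical data; assumes a scikit-learn version that still accepts the affinity keyword used above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame({"a": rng.randn(200)})
demo["b"] = demo["a"] * 0.9 + rng.randn(200) * 0.1   # highly correlated with "a"
demo["c"] = rng.randn(200)                           # independent column
selected = FeatureSelection(demo, ["a", "b", "c"], corrCoefThres=0.9)
print(selected)  # one representative per correlated group, e.g. ['a' 'c']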
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
Example #6
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = list(feature_groups_map.values())

    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        members = [key for key, group in feature_groups_map.items()
                   if group == feature_group_no]
        group_labels.append(", ".join(members))

    voters_agglomerated = pd.DataFrame(condensed,
                                       columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(round)
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
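
A quick usage sketch for this helper on hypothetical survey-style data (assumes pandas and FeatureAgglomeration are imported as in the example above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
voters = pd.DataFrame(rng.rand(100, 6),
                      columns=["q{}".format(i) for i in range(6)])
condensed = feature_agglomeration(voters, n=3, rounding=True)
print(condensed.columns.tolist())  # three comma-separated hybrid-feature labels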
    def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
        compressions = []

        # Finding features
        diffnet = DiffNet(self.db, db_path=self.db_path)
        diffnet.restore(diffnet_path)
        print("Calculating features for", len(cluster_images), "images")
        for img in cluster_images:
            print("Finding features for:", img)
            one_hot = diffnet.feedforward(img, cluster_db_path)
            output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
            compressions.append(output[0])

        # Clustering
        print("Performing clustering...")
        compressions = np.array(compressions)
        fa = FeatureAgglomeration(n_clusters=30)
        X_clusters = fa.fit_transform(compressions)

        print("Collecting data...")
        csv_dict_arr = []
        for i, img in enumerate(cluster_images):
            csv_dict_arr.append({'1.img': img, '2.class': np.argmax(X_clusters[i]), '3.features': compressions[i]})

        # Saving
        if save_csv:
            print("Saving data to csv...")
            keys = load_label_list(csv_dict_arr[0])
            with open('cluster_result.csv', 'w') as output_file:
                dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
                dict_writer.writeheader()
                dict_writer.writerows(csv_dict_arr)

        return csv_dict_arr
def _feature_agglomeration_fit_method(data, n_parcels, connectivity, linkage):
    """Feature Agglomeration algorithm to fit on the data.

    Parameters
    ----------
    data : array_like, shape=(n_samples, n_voxels)
        Masked subjects data

    n_parcels : int
        Number of parcels to parcellate.

    connectivity : ndarray
        Connectivity matrix
        Defines for each feature the neighbouring features following a given
        structure of the data.

    linkage : str
        Which linkage criterion to use:
        'ward', 'complete', or 'average'.

    Returns
    -------
    labels : ndarray
        Labels to the data
    """
    ward = FeatureAgglomeration(n_clusters=n_parcels, connectivity=connectivity,
                                linkage=linkage)
    ward.fit(data)

    return ward.labels_
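
A small smoke test for this helper (assuming scikit-learn's grid_to_graph for the connectivity; the toy shapes are hypothetical):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
data = rng.randn(20, 25)                  # 20 "subjects", a 5x5 voxel grid flattened to 25 features
connectivity = grid_to_graph(n_x=5, n_y=5)
labels = _feature_agglomeration_fit_method(data, n_parcels=4,
                                           connectivity=connectivity,
                                           linkage='ward')
print(labels.shape)                       # (25,): one parcel label per voxel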
Example #9
def TestSGDRegression(df1, n_clusters=50):  # n_clusters must match the training-time reduction

    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'),
                data=df1,
                return_type='dataframe')

    # load our trained regressor
    with open('SGD_trained_model.pickle', 'rb') as model_file:
        res = pickle.load(model_file)

    # employ clustering to reduce our dimensionality
    # (n_clusters should match the value used when the model was trained)
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # predict the interest rates
    yp = res.predict(std_X)

    return yp
def feature_agglomeration(X, args={}):
    """
    使用层次聚类对特征进行聚类,然后进行特征降维
    """
    from sklearn.cluster import FeatureAgglomeration
    fam = FeatureAgglomeration(**args)
    fam.fit(X)
    return fam
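
A usage sketch for this wrapper on random data (hypothetical shapes):

import numpy as np

X = np.random.RandomState(0).randn(100, 10)
fam = feature_agglomeration(X, args={"n_clusters": 3})
X_reduced = fam.transform(X)
print(X_reduced.shape)  # (100, 3)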
Example #11
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward', pooling_func='mean',
                 random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)

            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)

        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"], default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")

        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
Example #12
def untangle(X: Iterable,
             y: Iterable,
             n_clusters: int = None,
             get_connectivity: bool = True,
             compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:

    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
        if n_clusters == 0:
            n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    agglo.fit(X, y)

    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster',
                              'feature']).groupby('cluster').feature)

    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
Example #13
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')

    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return FeatureAgglomerationEncoder(model_path=model_path)
Example #15
    def do_feature_agglomeration(self, data):
        print("Using feature agglomeration to reduce the matrix' dimensionality...")
        if self.k:
            n = self.k
        else:
            n = 20

        agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
        return agglo.fit_transform(data)
    def token_cluster(self, n_clusters=300):

        from scipy import sparse
        from sklearn.cluster import FeatureAgglomeration

        FA = FeatureAgglomeration(n_clusters=n_clusters)

        self.bow_corpus = FA.fit_transform(self.bow_corpus)
        self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
Example #17
def get_clusters(X: pd.DataFrame, n_clusters: int):
    clt = FeatureAgglomeration(n_clusters=n_clusters)
    clt.fit(X)

    clusters = []

    for i in range(n_clusters):
        clusters.append(X.columns[clt.labels_ == i].tolist())

    return clusters  # type: list[list[str]]
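
A usage sketch on a hypothetical DataFrame with two near-duplicate columns (assumes pandas and FeatureAgglomeration are imported as above):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 5), columns=list("abcde"))
df["b"] = df["a"] + rng.randn(100) * 0.01       # near-duplicate of column "a"
print(get_clusters(df, n_clusters=3))           # column-name groups; "a" and "b" should land together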
def dim_reduction_FA(data, distance_threshold=0.45):
    """
    Params:
        data: ndarray of shape (n_samples, n_features)
        distance_threshold: Optimal threshold value for similarity measure
    Returns: (reducedDimData, nReducedComponents)
    """
    agglo = FeatureAgglomeration(n_clusters=None, distance_threshold=distance_threshold)
    reducedDimData = agglo.fit_transform(data)
    return reducedDimData, agglo.n_clusters_
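
A usage sketch on random data (hypothetical shapes and threshold):

import numpy as np

data = np.random.RandomState(0).randn(200, 8)
reduced, n_components = dim_reduction_FA(data, distance_threshold=0.45)
print(reduced.shape, n_components)  # reduced data and the number of clusters actually formed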
def apply_feature_agglomeration(table, features, label, n_components):
    from sklearn.cluster import FeatureAgglomeration
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)

    fa = FeatureAgglomeration(n_clusters=n_components, linkage='ward')
    pc = fa.fit_transform(x)

    return feature_file_processor.generate_data_frame(pc, table[[label]])
Example #20
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_

    names_out = agglo.get_feature_names_out()
    assert_array_equal([f"featureagglomeration{i}" for i in range(n_clusters)],
                       names_out)
def main():

    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
Example #22
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(
        regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop(
        "ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # FAG (FeatureAgglomeration)
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return (dt_all)
Example #23
    def setup(self, keywords={}):
        """
        Setup the algorithms
        """
        for p in keywords.keys():
            setattr(self, p, keywords[p])

        if self.method == "agglomerative":
            self.obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                               linkage=self.linkage,
                                               affinity=self.affinity)
        if self.method == "feature":
            self.obj = FeatureAgglomeration(n_clusters=self.n_clusters,
                                            linkage=self.linkage,
                                            affinity=self.affinity,
                                            distance_threshold=self.distance_threshold)
        return
Example #24
    def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
        X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
        agglo = FeatureAgglomeration(n_clusters=n_clusters)
        if len(X_transformed) > 20000:
            X_agglo = X_transformed.sample(20000)
        else:
            X_agglo = X_transformed.copy()
        agglo.fit(X_agglo)
        # column '指标英文' = "variable name (English)"
        vars_clusters = pd.DataFrame(data={'指标英文': X_transformed.columns.tolist(),
                                           'cluster': list(agglo.labels_)})\
                          .sort_values('cluster')
        return vars_clusters, X_transformed
Example #25
    def cont_feature_clusters_sklearn(self, n_clusters=5):
        """ This uses feature agglomeration from scikit-learn and only works for continuous variables.
            Eventually expand this to categorical variables using Cramer's V covariance matrix, similar to
            the R iclust package """

        # Import the library
        from sklearn.cluster import FeatureAgglomeration

        Cluster = FeatureAgglomeration(n_clusters=n_clusters)
        Cluster.fit(self._dataset.iloc[:, self._cont_index_predictors])

        df = pd.DataFrame({'Variable': self._dataset.columns[self._cont_index_predictors],
                           'Cluster': Cluster.labels_})

        return df.sort_values(by='Cluster')
Example #26
def fa_dim_red(x_train_scaled, dataset_name, features_num = 2):
    z=0
    losses = []
    for k in range(1, x_train_scaled.shape[1]+1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)
            
    np_feature_losses_percent = np.multiply(100, losses/np.sum(losses))
    print('num of clusters < 10% loss')
    for i in range(len(np_feature_losses_percent)):
        z=z+np_feature_losses_percent[i]
        if z>90:
            print(i+1)
            break
    print(np_feature_losses_percent)
    plt.bar(list(range(1,len(np_feature_losses_percent)+1)),np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name))+' fa analysis.png')
    plt.show()

    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result,x_projected_fa
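
A usage sketch for fa_dim_red on standardized random data (hypothetical; note the function saves a '<dataset_name> fa analysis.png' plot and calls plt.show(), and it assumes numpy, matplotlib and FeatureAgglomeration are already imported at module level):

import numpy as np

x_demo = np.random.RandomState(0).randn(150, 6)
fa_result, x_projected = fa_dim_red(x_demo, dataset_name="demo", features_num=2)
print(fa_result.shape)  # (150, 2)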
Example #27
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.
    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering.
      The purpose is to have different labels in all the parcellations that
      can be built by multiple calls to the current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondance between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)
    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
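
A small smoke test for this function on a toy grid (hypothetical shapes; uses scikit-learn's grid_to_graph to build the connectivity):

import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
all_subjects_data = rng.randn(30, 36)          # 30 subjects, a 6x6 voxel grid
connectivity = grid_to_graph(6, 6)
parcelled, labels = _ward_fit_transform(all_subjects_data,
                                        fit_samples_indices=np.arange(15),
                                        connectivity=connectivity,
                                        n_parcels=5,
                                        offset_labels=0)
print(parcelled.shape, np.unique(labels).size)  # (30, 5) 5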
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean),
                              Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
Example #30
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9,
                                       min_df=0.05,
                                       lowercase=True)
    #builds a tf-idf matrix for the sentences
    tfidf_matrix_1 = tfidf_vectorizer.fit_transform(sentences)
    tfidf_matrix = tfidf_matrix_1.toarray()
    # FeatureAgglomeration groups the columns (tf-idf terms) of the matrix
    agglo = FeatureAgglomeration(n_clusters=nb_of_clusters)
    agglo.fit(tfidf_matrix)
    clusters = collections.defaultdict(list)
    for i, label in enumerate(agglo.labels_):
        clusters[label].append(i)
    return dict(clusters)
Example #31
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    data : array_like
         A matrix of shape (`V`, `N`) with `V` voxels `N` timepoints
         The functional dataset that needs to be reduced
    mask : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to
        
    """

    ## Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    ward = []

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    #import pdb;pdb.set_trace()
    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    #print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))

    labels = ward.labels_

    #print ('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    #print('Data compression took ', (time.time()- datacompressiontime), ' seconds')
    return {'data': data_reduced, 'labels': labels}
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage='foo').fit(X)

    with pytest.raises(ValueError):
        linkage_tree(X, linkage='foo')

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering on a precomputed distances matrix
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Example #33
    def comput_coefs(self, X, y, size):
        cv = KFold(2)  # cross-validation generator for model selection
        ridge = BayesianRidge()
        cachedir = tempfile.mkdtemp()
        mem = Memory(cachedir=cachedir, verbose=1)

        # Ward agglomeration followed by BayesianRidge
        connectivity = grid_to_graph(n_x=size, n_y=size)
        ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                                    memory=mem)
        clf = Pipeline([('ward', ward), ('ridge', ridge)])
        # Select the optimal number of parcels with grid search
        clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
        coef_agglomeration_ = coef_.reshape(size, size)

        # Anova univariate feature selection followed by BayesianRidge
        f_regression = mem.cache(feature_selection.f_regression)  # caching function
        anova = feature_selection.SelectPercentile(f_regression)
        clf = Pipeline([('anova', anova), ('ridge', ridge)])
        # Select the optimal percentage of features with grid search
        clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
        clf.fit(X, y)  # set the best parameters
        coef_ = clf.best_estimator_.steps[-1][1].coef_
        coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
        coef_selection_ = coef_.reshape(size, size)
        return dict(
            coef_selection_=coef_selection_,
            coef_agglomeration_=coef_agglomeration_,
            cachedir=cachedir
        )
Example #34
def createPipe(embed, classif, nmca, aggregation, nsubs):
    # Dimension Reduction
    n_comp = 20 if nsubs > 70 else 15
    if embed == "pca":
        emb = ('pca', PCA(n_components=n_comp))
    else:
        emb = ('fa', FeatureAgglomeration(n_clusters=n_comp))

    # Classifiers
    neib = int(nmca * nsubs * 0.1) if aggregation == "mega" else int(nsubs *
                                                                     0.1)
    clfs = {
        'svc':
        ('svc', SVC(class_weight="balanced", probability=True, max_iter=1e6)),
        'knn': ('knn', KNeighborsClassifier(n_neighbors=neib)),
        'rfc': ('rfc', RandomForestClassifier(class_weight="balanced")),
        'ada': ('ada', AdaBoostClassifier()),
        'lrc': ('lrc',
                LogisticRegression(class_weight="balanced",
                                   solver='liblinear',
                                   max_iter=1e6))
    }

    pipe = Pipeline(steps=[emb, clfs[classif]])
    return pipe
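
A quick sketch of how this factory might be called (hypothetical arguments; assumes the example's imports such as PCA, FeatureAgglomeration, the classifiers and Pipeline are in scope):

# FeatureAgglomeration + logistic-regression pipeline for 60 subjects
pipe = createPipe(embed="fa", classif="lrc", nmca=1, aggregation="single", nsubs=60)
print(list(pipe.named_steps))  # ['fa', 'lrc']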
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
        assert_equal(len(warning_list), 1)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
Example #37
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i+1) % (len(Y) // 10) == 0:
                print('.', end='')

    print()

    return W, features
Example #38
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)
# Transpose and scale parameters
featListOriginal[[0,1,5,7,9,10],:] = featListOriginal[[0,1,5,7,9,10],:]*0.8
featListOriginal[[2,3,4],:] = featListOriginal[[2,3,4],:]*0.8*0.2
featListOriginal[6,:] = featListOriginal[6,:]*0.8*0.8
featListOriginal = NP.transpose(featListOriginal)

## STANDARDIZE FEATURES ###################################################

# Don't standardize the centroids
for k in range(2,numFeat):
	featList[k] = (featList[k] - NP.mean(featList[k]))/NP.sqrt(NP.var(featList[k]))

# Transpose the feature list to use in clustering
featList = NP.transpose(featList)
feat_aggl = FeatureAgglomeration(2)
feat_aggl.fit(featList[:,2:])

## AGGLOMERATIVE CLUSTERING ###############################################

aggl_all = AgglomerativeClustering(2)
X_All = featList[:,2:]
y2 = aggl_all.fit_predict(X_All)

## PCA ###############################################################

pca_model = PCA(2)
X_PCA = pca_model.fit_transform(X_All)
print(pca_model.explained_variance_ratio_)

## SPLIT INTO numBINS ###############################################
	def FeatureAgglomeration(self, clist, numClusters=2):
		FEATAGGL = FeatureAgglomeration(numClusters)
		FEATAGGL.fit(self.featList[:,clist])
		self.featureTree = FEATAGGL.children_	
		self.featureLabels = FEATAGGL.labels_
		self.featureCList = clist
Example #41
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import FeatureAgglomeration
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn import metrics

###############################################################################
# Data IO and generation
# import some data to play with
#file = "/home/kbhalla/Desktop/Data/day_samp_new.npy"
file = "/home/rmendoza/Documents/Data/day_samp_new_0604.npy"
with open(file, "rb") as file_in:
        matrix = smio.load_sparse_csr(file_in)

X = matrix[:,:-1]
FA = FeatureAgglomeration(n_clusters=250)
print(np.shape(X))
y = matrix[:,-1]
X = FA.fit_transform(X,y)
n_samples, n_features = X.shape
k = int(0.8*n_samples)
#random_state = np.random.RandomState(0)
#X = np.c_[X, random_state.randn(n_samples, 2*n_features)]
X_test, y_test = X[k:,:], y[k:]
X, y = X[:k, :], y[:k]
sm = SMOTE(ratio=0.95)
X,y = sm.fit_sample(X, y)
print(np.shape(X))
start = time.time()


##################################################################
# Then we use FeatureAgglomeration from scikit-learn. Indeed, the voxels
# are the features of the data matrix.
#
# In addition, we use caching. As a result, the clustering doesn't have
# to be recomputed later.

# Computing the ward for the first time, this is long...
from sklearn.cluster import FeatureAgglomeration
# If you have scikit-learn older than 0.14, you need to import
# WardAgglomeration instead of FeatureAgglomeration
import time
start = time.time()
ward = FeatureAgglomeration(n_clusters=1000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 1000 clusters: %.2fs" % (time.time() - start))

# Compute the ward with more clusters, should be faster as we are using
# the caching mechanism
start = time.time()
ward = FeatureAgglomeration(n_clusters=2000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 2000 clusters: %.2fs" % (time.time() - start))

##################################################################
# Visualize results
# ------------------
#
    
def find_cluster(childrens, start, height, sample_size):
	res = start
	for i in range(height - 1):
		res = find_feature(childrens, res + sample_size)
	cluster = rec_cluster(childrens, childrens[res], sample_size)
	cluster.sort()
	return cluster
	
def find_feature_cluster(children, feature, height, sample_size):
	return find_cluster(children, find_feature(children, feature), height, sample_size)
		


BENCH_DATA = genfromtxt('Sequential_Application_SATUNSAT_track_wo_names.csv', delimiter=',')
#BENCH_DATA = BENCH_DATA.transpose()
print(BENCH_DATA.shape)
#print(np.isnan(BENCH_DATA).any())

ward = FeatureAgglomeration(linkage='average')

#print(ward.fit_predict(BENCH_DATA))
ward.fit(BENCH_DATA)
#print(ward.children_)
#print(find_feature_cluster(ward.children_, 0, 2, 300))

plt.title('SAT Feature_Agglomeration')
plot_dendrogram(ward, leaf_font_size = 12)
#plt.savefig('SAT_Feature_Agglomeration.png')
plt.show()