Example 1
def test_vrp_transform(max_edge_length, infinity_values):
    vrp = VietorisRipsPersistence(max_edge_length=max_edge_length,
                                  infinity_values=infinity_values)
    # This is not generally true, it is only a way to obtain the res array
    # in this specific case
    X_res = X_vrp_res.copy()
    X_res[:, :, :2][X_res[:, :, :2] >= max_edge_length] = infinity_values
    assert_almost_equal(vrp.fit_transform(X), X_res)
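This test, like several later ones, references module-level fixtures (X, X_pc, X_vrp_res, X_vrp_exp) from giotto-tda's test module that the excerpts omit. A minimal, purely hypothetical stand-in for the inputs follows; the expected-output arrays are precomputed fixtures and are not reproduced here.

import numpy as np

# Hypothetical stand-ins for the omitted test fixtures; the real test module
# defines its own point clouds and the matching expected diagrams.
X_pc = [np.array([[0., 0.], [1., 1.], [0., 1.]])]  # one small point cloud
X = np.asarray(X_pc)                               # shape (1, 3, 2)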
Example 2
    def parallel_embed_(self, embedding):
        vr = VietorisRipsPersistence(
            metric='euclidean',
            homology_dimensions=self.homology_dimensions_,
            n_jobs=self.n_job)
        diagram_scaler = Scaler(n_jobs=self.n_job)
        persistence_diagrams = diagram_scaler.fit_transform(
            vr.fit_transform([embedding]))
        if self.filtering_:
            diagram_filter = Filtering(
                epsilon=0.1, homology_dimensions=self.filtering_dimensions_)
            persistence_diagrams = diagram_filter.fit_transform(
                persistence_diagrams)
        return persistence_diagrams[0]
Example 3
def vr_persistent_homology(patch_pc):
    homology_dimensions = (0, 1, 2)
    VR = VietorisRipsPersistence(
        metric="euclidean",
        max_edge_length=5,
        homology_dimensions=homology_dimensions,
        n_jobs=N_JOBS,
    )
    diagrams_VietorisRips = VR.fit_transform(np.asarray(patch_pc))
    VR.plot(diagrams_VietorisRips).show()
    BC = BettiCurve()
    X_betti_curves = BC.fit_transform(diagrams_VietorisRips)
    BC.plot(X_betti_curves).show()
    return diagrams_VietorisRips
Example 4
def computing_persistence_diagram(G, t=np.inf, homology_dimensions=(0, 1, 2)):
    """
    INPUT:
        G: a graph
        t: persistence threshold (passed as max_edge_length)
        homology_dimensions: homology dimensions to consider
    OUTPUT:
        diagrams: persistence diagrams computed by giotto-tda
    """

    dist_mat = computing_distance_matrix(G)
    vr = VietorisRipsPersistence(metric='precomputed', max_edge_length=t,
                                 homology_dimensions=homology_dimensions,
                                 n_jobs=-1)
    diagrams = vr.fit_transform(dist_mat.reshape(1, *dist_mat.shape))
    return diagrams
Example 5
def get_persistent_entropy(point_clouds):
    ''' Creates a Vietoris-Rips filtration and calculates persistent entropy.

        Returns
        -------
        List with the persistent entropy of the 0th homology group for each
        series
    '''
    vietorisrips_tr = VietorisRipsPersistence(
        metric='manhattan',
        homology_dimensions=_homology_dimensions,
        max_edge_length=_max_edge_length,
        n_jobs=_n_jobs,
    )
    diagrams = vietorisrips_tr.fit_transform(point_clouds)

    entropy_tr = PersistenceEntropy()
    features = entropy_tr.fit_transform(diagrams)

    return features
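The underscored names are module-level settings that the excerpt omits. Values consistent with the docstring (0th homology only) might look like this; they are assumptions, not the original configuration:

# Assumed module-level configuration (not from the original source)
_homology_dimensions = (0,)   # docstring mentions only the 0th homology group
_max_edge_length = 10
_n_jobs = -1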
Example 6
def extract_top_features(X, filtrations, vectorizations):
    """
    Extracts topological features from a MNIST-like dataset. 
    
    For each specified filtration and vectorization, features are extracted
    according to the pipeline:
    Filtration -> Persistence diagram -> Rescaling -> Vectorization.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 28, 28)
        A collection of greyscale images.
        
    filtrations : list of tuples (string, filtration)
        A list of filtrations.
        Assumptions: 1) The first filtration is 'Voxel', the second is
                        'Binary', and for both of them the pipeline is
                        to be run on the original greyscale images. For all
                        subsequent filtrations, the pipeline is to be run on
                        binarized images.
                     2) For all filtrations except 'Vietoris-Rips', the
                        corresponding diagram is the cubical persistence
                        diagram. For 'Vietoris-Rips', it's the Vietoris-Rips
                        persistence diagram.
                    
    vectorizations : list of tuples (string, vectorization)
        A list of vectorizations.
        
    Returns
    -------
    X_f : ndarray of shape (n_samples, n_features)
        Topological features for all images in X
        
    """
    # Put all vectorizations together for convenience
    vect_union = FeatureUnion(vectorizations, n_jobs=num_jobs)

    X_bin = img.Binarizer(threshold=0.4, n_jobs=num_jobs).fit_transform(X)

    X_f = np.array([]).reshape(X.shape[0], 0)
    current_time = [time.perf_counter()]
    for filt in filtrations:
        filt_features = make_pipeline(
            filt[1],
            VietorisRipsPersistence(n_jobs=num_jobs) if filt[0] == 'Vietoris-Rips'
            else CubicalPersistence(n_jobs=num_jobs),
            Scaler(n_jobs=num_jobs),
            vect_union).fit_transform(X)
        X_f = np.hstack((X_f, filt_features))
        print("{} complete: {} seconds".format(filt[0],
                                               elapsed_time(current_time)))
        if filt[0] == 'Binary':
            X = X_bin  # From now on, we only work with binarized images

    return X_f
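For reference, filtrations and vectorizations are lists of (name, transformer) pairs. A hedged illustration of compatible arguments (the project's actual choices are not shown; 'Voxel' is modelled here as an identity transformer so the greyscale images pass straight to CubicalPersistence):

import numpy as np
from sklearn.preprocessing import FunctionTransformer
from gtda.images import Binarizer, HeightFiltration
from gtda.diagrams import PersistenceEntropy, Amplitude

# Assumed example arguments, respecting the conventions in the docstring:
filtrations = [
    ('Voxel', FunctionTransformer()),                 # greyscale, unchanged
    ('Binary', Binarizer(threshold=0.4)),             # greyscale -> binary
    ('Height', HeightFiltration(direction=np.array([1, 0]))),
]
vectorizations = [
    ('entropy', PersistenceEntropy()),
    ('amplitude', Amplitude()),
]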
Example 7
def fpd_cluster(data,
                c,
                hom_dimension,
                metric='wasserstein',
                verbose=False,
                max_iter=10,
                frand='no',
                fuzzy=True):
    # Compute topological fuzzy clusters of a collection of point clouds
    #
    # INPUTS
    # data - collection of datasets (one point cloud per dataset)
    # c - number of clusters
    # hom_dimension - dimension of the persistence diagram to use
    #                 (0=connected components, 1=holes, 2=voids, etc.)
    # metric - distance between persistence diagrams used for clustering
    # verbose - True or False to give iteration information
    # max_iter - max number of iterations to compute
    # frand - optional Fuzzy RAND reference matrix
    # fuzzy - fuzzy clustering if True, hard clustering if False
    #
    # OUTPUTS
    # r - membership values
    # M - list of cluster centres
    # frand_indices - Fuzzy RAND index at each iteration (if a reference
    #                 matrix is given)

    VR = VietorisRipsPersistence(homology_dimensions=[hom_dimension])
    diagrams = VR.fit_transform(data)
    # diagrams = np.delete(diagrams, axis=2, obj=2)
    r, M = pd_fuzzy(diagrams,
                    c,
                    verbose,
                    max_iter,
                    frand=frand,
                    fuzzy=fuzzy,
                    metric=metric)

    return r, M
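A hypothetical call, assuming the pd_fuzzy helper that fpd_cluster delegates to is available; data can be any list of point clouds:

import numpy as np

# Illustrative input (assumed): a noisy circle and a noisy blob.
rng = np.random.default_rng(0)
angles = rng.uniform(0, 2 * np.pi, 100)
circle = np.column_stack([np.cos(angles), np.sin(angles)])
blob = 0.1 * rng.standard_normal((100, 2))

r, M = fpd_cluster([circle, blob], c=2, hom_dimension=1, verbose=True)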
Example 8
def get_pd_from_molecule(molecule_name, structures):
    """
    INPUT:
        molecule_name: name of the molecule as given in the structures file
        structures: structures file containing information (x, y, z
            coordinates) for all molecules

    OUTPUT:
        X_scaled: scaled persistence diagrams
    """
    m = structures[structures['molecule_name'] == molecule_name][['x', 'y', 'z']].to_numpy()
    m = m.reshape((1, m.shape[0], m.shape[1]))
    homology_dimensions = [0, 1, 2]
    persistenceDiagram = VietorisRipsPersistence(metric='euclidean',
                                                 homology_dimensions=homology_dimensions,
                                                 n_jobs=1)
    persistenceDiagram.fit(m)
    X_diagrams = persistenceDiagram.transform(m)

    diagram_scaler = diag.Scaler()
    diagram_scaler.fit(X_diagrams)
    X_scaled = diagram_scaler.transform(X_diagrams)

    return X_scaled
Example 9
def get_diagrams_torch(point_clouds, maxdim=1):
    # Calculates persistence diagrams from point clouds. The cost of the
    # calculation increases with the maximum homology dimension taken into
    # account.
    #
    # point_clouds - pytorch tensor of shape (n_samples, n_points, dim)
    # maxdim - maximum homology dimension
    #
    # Returns the tuple (diagrams_torch, diagrams_np, VR_persistence):
    # diagrams_torch - pytorch tensor of shape
    #   (n_samples, maxdim + 1, n_features, 2), where n_features is the
    #   maximum number of topological features across samples and the last
    #   axis holds [birth_scale, death_scale].
    # The last two elements of the tuple are needed only for plotting.
    homology_dimensions = tuple(range(maxdim + 1))
    VR_persistence = VietorisRipsPersistence(homology_dimensions=homology_dimensions)
    point_clouds_np = point_clouds.numpy()
    diagrams_np = VR_persistence.fit_transform(point_clouds_np)
    # Column of homology dimensions, used below to mask features by dimension
    hom_dims = diagrams_np[:, :, 2, np.newaxis]
    diagrams_torch = []
    for i in range(maxdim + 1):
        # Keep only the [birth, death] pairs whose homology dimension is i
        diagrams_fixed_Hdim = np.select([hom_dims == i], [diagrams_np[:, :, :2]])
        diagrams_torch.append(torch.FloatTensor(diagrams_fixed_Hdim[:, np.newaxis, :, :]))
    diagrams_torch = torch.cat(tuple(diagrams_torch), dim=1)
    return diagrams_torch, diagrams_np, VR_persistence
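An illustrative call (not from the original source):

import torch

# Eight random point clouds of 100 points in 3D, homology dimensions 0 and 1.
point_clouds = torch.randn(8, 100, 3)
diagrams_torch, diagrams_np, vr = get_diagrams_torch(point_clouds, maxdim=1)
print(diagrams_torch.shape)  # torch.Size([8, 2, n_features, 2])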
Example 10
    def _validate_k_fold_top(self, model, x_train, y_train, x_test, y_test):
        validation_quantities = []

        for k_min in self.k_mins:
            for k_max in self.k_maxs:
                for dist_percentage in self.dist_percentages:
                    print(
                        f"k_min, k_max, dist_percentage: {k_min}, {k_max}, {dist_percentage}"
                    )
                    pipeline_list = [
                        ('extract_subspaces',
                         SubSpaceExtraction(dist_percentage=dist_percentage,
                                            k_min=k_min,
                                            k_max=k_max,
                                            metric="euclidean",
                                            n_jobs=-1)),
                        ('compute_diagrams',
                         VietorisRipsPersistence(n_jobs=-1))
                    ]
                    top_pipeline = Pipeline(pipeline_list)

                    diagrams_train, _ = top_pipeline.fit_transform_resample(
                        x_train, y_train)

                    top_features_train = extract_topological_features(
                        diagrams_train)

                    x_train_model = np.concatenate(
                        [x_train, top_features_train], axis=1)
                    model.fit(x_train_model, y_train)

                    x_test_model = extract_features_for_prediction(
                        x_train, y_train, x_test, y_test, top_pipeline)

                    score = model.score(x_test_model, y_test)
                    output_dictionary = {
                        "k_min": k_min,
                        "k_max": k_max,
                        "dist_percentage": dist_percentage,
                        "score": score
                    }
                    validation_quantities.append(output_dictionary)

        return validation_quantities
Example 11
def test_vrp_transform():
    vrp = VietorisRipsPersistence()

    assert_almost_equal(vrp.fit_transform(X), X_vrp_res)
Example 12
def test_vrp_not_fitted():
    vrp = VietorisRipsPersistence()

    with pytest.raises(NotFittedError):
        vrp.transform(X)
Example 13
def test_vrp_params():
    metric = 'not_defined'
    vrp = VietorisRipsPersistence(metric=metric)

    with pytest.raises(ValueError):
        vrp.fit_transform(X)
Example 14
def test_vrp_low_infinity_values(X, metric):
    vrp = VietorisRipsPersistence(max_edge_length=0.001,
                                  metric=metric,
                                  infinity_values=-1)
    assert_almost_equal(vrp.fit_transform(X)[:, :, :2], np.zeros((1, 2, 2)))
Example 15
def test_vrp_list_of_arrays_different_size():
    X_2 = np.array([[0., 1.], [1., 2.]])
    vrp = VietorisRipsPersistence()
    assert_almost_equal(vrp.fit_transform([X_pc[0], X_2])[0], X_vrp_exp[0])
Example 16
from biopandas.mol2 import PandasMol2
import numpy as np
import pandas as pd
import warnings
import os
warnings.filterwarnings('ignore')
from concurrent import futures
from gtda.homology import VietorisRipsPersistence


npoints = 15

persistence = VietorisRipsPersistence(
    metric="euclidean",
    homology_dimensions=[0, 1, 2],
    collapse_edges=True,
    n_jobs=None,
)


def get_local_cloud(prot_res):
    # `df` is assumed to be a module-level DataFrame, indexed by protein,
    # holding residue rows with 'x', 'y', 'z' coordinate columns.
    prot, res = prot_res
    tempdf = df.loc[prot]
    center = tempdf.loc[res, ['x', 'y', 'z']].to_numpy()

    tempdf['dist'] = np.sqrt((tempdf['x'] - center[0])**2
                             + (tempdf['y'] - center[1])**2
                             + (tempdf['z'] - center[2])**2)

    localcloud = tempdf.nsmallest(npoints, 'dist')[['x', 'y', 'z']].to_numpy()
    return localcloud

get_local_cloud = np.vectorize(get_local_cloud, otypes=[np.ndarray])
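The excerpt ends before the transformer is applied; presumably the local clouds are stacked and fed to persistence.fit_transform. A minimal sketch with synthetic stand-in clouds (the real script builds them from df via get_local_cloud):

# Synthetic clouds with the expected shape (n_clouds, npoints, 3); the real
# script would obtain these from get_local_cloud over (protein, residue) pairs.
clouds = np.random.default_rng(0).standard_normal((4, npoints, 3))
diagrams = persistence.fit_transform(clouds)
print(diagrams.shape)  # (4, n_features, 3)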
Example 17
    def cross_validate(self, full_x, full_y, splitting_dates):
        train_split_date = splitting_dates[0]
        val_split_date = splitting_dates[1]
        end_date = splitting_dates[2]

        train_x = full_x[(full_x.date < train_split_date) |
                         (full_x.date >= end_date)]
        train_y = full_y[(full_x.date < train_split_date) |
                         (full_x.date >= end_date)]

        val_x = full_x[(full_x.date >= train_split_date)
                       & (full_x.date < val_split_date)]
        val_y = full_y[(full_x.date >= train_split_date)
                       & (full_x.date < val_split_date)]

        test_x = full_x[(full_x.date >= val_split_date)
                        & (full_x.date < end_date)]
        test_y = full_y[(full_x.date >= val_split_date)
                        & (full_x.date < end_date)]

        train_x.pop("date")
        val_x.pop("date")
        test_x.pop("date")

        train_x = train_x.values
        train_y = train_y.values
        val_x = val_x.values
        val_y = val_y.values
        test_x = test_x.values
        test_y = test_y.values

        print("START VALIDATING MODEL")
        models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
        best_model_params = best_combination(models_cv)
        best_model_params.pop("score")
        best_model = RandomForestClassifier(**best_model_params)

        best_model.fit(train_x, train_y)

        score = best_model.score(test_x, test_y)
        print(f'score no_top {score}')
        print(f'best model parameters no_top {best_model_params}')

        print("START VALIDATING PARAMS")
        topo_cv = self._validate_k_fold_top(best_model, train_x, train_y,
                                            val_x, val_y)
        best_topo = best_combination(topo_cv)
        best_topo.pop("score")
        best_topo_pipeline_list = [
            ('extract_subspaces', SubSpaceExtraction(**best_topo)),
            ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1))
        ]
        best_topo_pipeline = Pipeline(best_topo_pipeline_list)

        train_x_for_test = np.concatenate([train_x, val_x], axis=0)
        train_y_for_test = np.concatenate([train_y, val_y], axis=0)

        diagrams_train, _ = best_topo_pipeline.fit_transform_resample(
            train_x_for_test, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
        top_features_train = extract_topological_features(diagrams_train)

        x_train_model = np.concatenate([train_x_for_test, top_features_train],
                                       axis=1)
        best_model.fit(x_train_model, train_y_for_test)

        print("EXTRACTING TOPOLOGICAL FEATURES TEST")
        x_test_model = extract_features_for_prediction(x_train_model,
                                                       train_y_for_test,
                                                       test_x, test_y,
                                                       best_topo_pipeline)

        score_top = best_model.score(x_test_model, test_y)

        val_x_with_topo = extract_features_for_prediction(
            train_x, train_y, val_x, val_y, best_topo_pipeline)

        print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
        model_config_with_topo = self._validate_k_fold_model(
            x_train_model, train_y, val_x_with_topo, val_y)
        best_model_config_with_topo = best_combination(model_config_with_topo)
        best_model_config_with_topo.pop('score')

        best_model_with_topo = RandomForestClassifier(
            **best_model_config_with_topo)
        best_model_with_topo.fit(x_train_model, train_y_for_test)

        score_best_topo_and_model = best_model_with_topo.score(
            x_test_model, test_y)
        print(f'score best model and topo_feat {score_best_topo_and_model}')

        return best_model_params, best_topo, best_model_config_with_topo, score, score_top, score_best_topo_and_model
Example 18
# Representing the circle in 3d with parametric equations.
circle = np.asarray([[np.sin(t), np.cos(t), 0] for t in range(400)])
plot_point_cloud(circle)

# Representing the sphere in 3d with parametric equations
sphere = np.asarray([[np.cos(s) * np.cos(t),
                      np.cos(s) * np.sin(t),
                      np.sin(s)] for t in range(20) for s in range(20)])
plot_point_cloud(sphere)

# Representing the torus in 3d with parametric equations
torus = np.asarray([[(2 + np.cos(s)) * np.cos(t), (2 + np.cos(s)) * np.sin(t),
                     np.sin(s)] for t in range(20) for s in range(20)])
plot_point_cloud(torus)

# Saving the results into an array
topological_spaces = np.asarray([circle, sphere, torus])

# The homology ranks we choose to consider
homology_dimensions = (0, 1, 2)
VR = VietorisRipsPersistence(metric='euclidean',
                             max_edge_length=10,
                             homology_dimensions=homology_dimensions)

# Array of persistence diagrams, one per point cloud in the input
diagrams = VR.fit_transform(topological_spaces)
print(f'diagrams.shape = {diagrams.shape}')

# Plotting the persistence diagram of the circle
plot_diagram(diagrams[0])
Example 19
def test_vrp_list_of_arrays():
    X_2 = np.array([[0., 1.], [1., 2.]])
    X_list = [X[0].copy(), X_2]
    vrp = VietorisRipsPersistence()
    vrp.fit(X_list)
Example 20
# ``X_sw`` is now a complicated-looking array, but it has a simple interpretation. Again, ``X_sw[i]`` is the ``i``-th window on ``X``, and it contains ``window_size`` samples from the original time series. This time, the samples are not scalars but 1D arrays.
#
# What if we suspect that the way in which the **correlations** between the variables evolve over time can help forecast the target ``y``? This is a common situation in neuroscience, where each variable could be data from a single EEG sensor, for instance.
#
# ``giotto-tda`` exposes a ``PearsonDissimilarity`` transformer which creates a 2D dissimilarity matrix from each window in ``X_sw``, and stacks them together into a single 3D object. This is the correct format (and information content!) for a typical topological transformer in ``gtda.homology``. See also [Topological feature extraction from graphs](https://github.com/giotto-ai/giotto-tda/blob/master/examples/persistent_homology_graphs.ipynb) for an in-depth look. Finally, we can extract simple scalar features using a selection of transformers in ``gtda.diagrams``.

# In[6]:

from gtda.time_series import PearsonDissimilarity
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import Amplitude

PD = PearsonDissimilarity()
X_pd = PD.fit_transform(X_sw)
VR = VietorisRipsPersistence(metric="precomputed")
X_vr = VR.fit_transform(X_pd)  # "precomputed" required on dissimilarity data
Ampl = Amplitude()
X_a = Ampl.fit_transform(X_vr)
X_vr

# Notice that we are not acting on ``y`` above. We are simply creating features from each window using topology! *Note*: it's two features per window because we used the default value for ``homology_dimensions`` in ``VietorisRipsPersistence``, not because we had two variables in the time series initially!
#
# We can now put this all together into a ``giotto-tda`` ``Pipeline`` which combines both the sliding window transformation on ``X`` and resampling of ``y`` with the feature extraction from the windows on ``X``.
#
# *Note*: while we could import the ``Pipeline`` class and use its constructor, we use the convenience function ``make_pipeline`` instead, which is a drop-in replacement for [scikit-learn's](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html).
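# The excerpt cuts off before the combined pipeline is built. Below is a
# sketch of what the note above describes, assuming ``X_sw`` came from a
# ``gtda.time_series.SlidingWindow`` transformer (the ``size`` and ``stride``
# values are placeholders, not the notebook's own):

from gtda.pipeline import make_pipeline
from gtda.time_series import SlidingWindow

SW = SlidingWindow(size=10, stride=1)  # placeholder window parameters
pipe = make_pipeline(SW, PD, VR, Ampl)
# gtda's Pipeline resamples ``y`` alongside the sliding-window transformation
X_feat, y_res = pipe.fit_transform_resample(X, y)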
Example 21
# Consider a specific example: a twisted circle built from two halves.
twistedcircle1 = np.asarray(
    [[np.sin(t),
      np.cos(t),
      np.cos(((np.arctan(np.cos(t) / np.sin(t)))) / 2)]
     for t in range(1, 400)])
twistedcircle2 = np.asarray(
    [[np.sin(t),
      np.cos(t), -np.cos(((np.arctan(np.cos(t) / np.sin(t)))) / 2)]
     for t in range(1, 400)])
twistedcircle = np.concatenate((twistedcircle1, twistedcircle2))
plot_point_cloud(twistedcircle)

# The homology ranks we choose to consider
homology_dimensions = (0, 1)
VR = VietorisRipsPersistence(metric='euclidean',
                             max_edge_length=10,
                             homology_dimensions=homology_dimensions)

# Creating persistence diagrams, one per point cloud in the input.
diagrams = VR.fit_transform([twistedcircle])
print(f'diagrams.shape = {diagrams.shape}')

# Plotting the persistence diagram of the twisted circle.
plot_diagram(diagrams[0])
Example 22
def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    return plt.cm.get_cmap(name, n)


#cmap = get_cmap(1000000)
knots = []
for index, i in enumerate([4000, 5000, 6000, 7000]):
    for j in range(5):
        X = np.random.randn(i, 3) / 1000
        X[:, 0] += np.cos(np.arange(i) * 2 * np.pi / i)
        X[:, 1] += np.sin(np.arange(i) * 2 * np.pi / i)
        Z = TSNE(n_jobs=-1, init='random',
                 random_state=np.random.randint(5, 42)).fit_transform(X)
        plt.scatter(Z[:, 0], Z[:, 1], c=np.random.rand(3,))
        plt.show()
        knots.append(Z)
        
homology_dimensions = (0, 1)
VR = VietorisRipsPersistence(
    metric='euclidean', homology_dimensions=homology_dimensions)

# Array of persistence diagrams, one per point cloud in the input
diagrams = VR.fit_transform(knots)

PE = PersistenceEntropy()
F = PE.fit_transform(diagrams)

# AgglomerativeClustering has no fit_transform; cluster the entropy features
C = AgglomerativeClustering(n_clusters=5).fit(F)
print(C.labels_)


Example 23
def test_vrp_fit_transform_plot(hom_dims):
    VietorisRipsPersistence().fit_transform_plot(
        X, sample=0, homology_dimensions=hom_dims)
Example 24
def get_pipeline(top_feat_params):
    pipeline = Pipeline([('extract_point_clouds', SubSpaceExtraction(**top_feat_params)),
                         ('create_diagrams', VietorisRipsPersistence(n_jobs=-1))])
    return pipeline
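An illustrative call, with placeholder values mirroring SubSpaceExtraction's parameters from the cross-validation example above:

# Hypothetical parameter values; in the project they come from validation.
top_feat_params = {'dist_percentage': 0.1, 'k_min': 5, 'k_max': 15}
pipeline = get_pipeline(top_feat_params)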