Example #1
    import tempfile

    import numpy as np
    from graspologic.simulations import er_nm, sbm
    from graspologic.utils import (largest_connected_component, remove_loops,
                                   symmetrize)

    def _gen_mat_data(n: int = 20,
                      m: int = 20,
                      p: float = 0.50,
                      mat_type: str = 'sb',
                      binary: bool = False,
                      asfile: bool = True,
                      n_graphs: int = 1):
        if binary is True:
            wt = 1
        else:
            wt = np.random.uniform

        mat_list = []
        mat_file_list = []
        for nm in range(n_graphs):
            if mat_type == 'er':
                # Use `wt` here too so the `binary` flag is honored
                mat = largest_connected_component(
                    symmetrize(
                        remove_loops(
                            er_nm(n,
                                  m,
                                  wt=wt,
                                  wtargs=dict(low=0, high=1)))))
            elif mat_type == 'sb':
                if p is None:
                    raise ValueError(
                        f"for mat_type {mat_type}, p cannot be None")
                mat = largest_connected_component(
                    symmetrize(
                        remove_loops(
                            sbm(np.array([n]),
                                np.array([[p]]),
                                wt=wt,
                                wtargs=dict(low=0, high=1)))))
            else:
                raise ValueError(f"mat_type {mat_type} not recognized!")

            mat_list.append(mat)

            if asfile is True:
                mat_path_tmp = tempfile.NamedTemporaryFile(mode='w+',
                                                           suffix='.npy',
                                                           delete=False)
                mat_path = str(mat_path_tmp.name)
                np.save(mat_path, mat)
                mat_file_list.append(mat_path)
                mat_path_tmp.close()

        return {'mat_list': mat_list, 'mat_file_list': mat_file_list}
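A minimal usage sketch of the helper above (illustrative parameter values; in-memory only, so no temp files are written):

data = _gen_mat_data(n=25, p=0.5, mat_type='sb', binary=True, asfile=False)
mat = data['mat_list'][0]
print(mat.shape)              # square 0/1 adjacency of the largest connected component
print(data['mat_file_list'])  # [] because asfile=False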
Example #2
    def prune_graph(self):
        import numpy as np
        import networkx as nx
        import graspologic.utils as gu
        from pynets.core import utils
        from pynets.statistics.individual.algorithms import defragment, \
            prune_small_components, most_important

        hardcoded_params = utils.load_runconfig()

        if int(self.prune) not in range(0, 4):
            raise ValueError(f"Pruning option {self.prune} invalid!")

        if self.prune != 0:
            # Remove isolates
            G_tmp = self.G.copy()
            self.G = defragment(G_tmp)[0]
            del G_tmp

        if int(self.prune) == 1:
            try:
                self.G = prune_small_components(
                    self.G, min_nodes=hardcoded_params["min_nodes"][0])
            except Exception:
                print(
                    UserWarning(f"Warning: pruning {self.est_path} "
                                f"failed..."))
        elif int(self.prune) == 2:
            try:
                hub_detection_method = \
                    hardcoded_params["hub_detection_method"][0]
                print(f"Filtering for hubs on the basis of "
                      f"{hub_detection_method}...\n")
                self.G = most_important(self.G, method=hub_detection_method)[0]
            except FileNotFoundError as e:
                print(e, "Failed to parse advanced.yaml")

        elif int(self.prune) == 3:
            print("Pruning all but the largest connected "
                  "component subgraph...")
            self.G = gu.largest_connected_component(self.G)
        else:
            print("No graph defragmentation applied...")

        # Synchronize the matrix representation with the (possibly pruned) graph
        self.in_mat = nx.to_numpy_array(self.G)

        if nx.is_empty(self.G) is True or \
            (np.abs(self.in_mat) < 0.0000001).all() or \
                self.G.number_of_edges() == 0:
            print(
                UserWarning(f"Warning: {self.est_path} "
                            f"empty after pruning!"))
            return self.in_mat, None

        # Save the pruned matrix
        final_mat_path = None
        if (self.prune is not None) and (self.prune != 0):
            final_mat_path = f"{self.est_path.split('.npy')[0]}_pruned"
            utils.save_mat(self.in_mat, final_mat_path, self.out_fmt)
            print(f"Source File: {final_mat_path}")

        return self.in_mat, final_mat_path
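The `prune == 3` branch defers entirely to graspologic. A standalone sketch of just that step, on a toy graph:

import networkx as nx
import graspologic.utils as gu

G = nx.karate_club_graph()
G.add_node(99)  # an isolate, dropped when pruning to the largest connected component
G_lcc = gu.largest_connected_component(G)
print(G.number_of_nodes(), "->", G_lcc.number_of_nodes())  # 35 -> 34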
Example #3
    import os
    import random
    import shutil
    import tempfile
    from pathlib import Path

    import numpy as np
    from graspologic.simulations import er_nm, sbm
    from graspologic.utils import (largest_connected_component, remove_loops,
                                   symmetrize)
    from pynets.core.thresholding import autofix  # assumed source of `autofix`

    def _gen_mat_data(n: int = 20, m: int = 20, p: float = 0.50,
                      mat_type: str = 'sb', binary: bool = False,
                      asfile: bool = True, n_graphs: int = 1,
                      lcc: bool = False, modality: str = 'func'):
        if binary is True:
            wt = 1
        else:
            wt = np.random.uniform

        mat_list = []
        mat_file_list = []

        if n_graphs > 0:
            for nm in range(n_graphs):
                if mat_type == 'er':
                    # Use `wt` here too so the `binary` flag is honored
                    mat = symmetrize(
                        remove_loops(er_nm(n, m, wt=wt,
                                           wtargs=dict(low=0, high=1))))
                elif mat_type == 'sb':
                    if p is None:
                        raise ValueError(
                            f"for mat_type {mat_type}, p cannot be None")
                    mat = symmetrize(
                        remove_loops(sbm(np.array([n]), np.array([[p]]),
                                         wt=wt, wtargs=dict(low=0,
                                                            high=1))))
                else:
                    raise ValueError(f"mat_type {mat_type} not recognized!")

                if lcc is True:
                    mat = largest_connected_component(mat)

                mat_list.append(autofix(mat))

                if asfile is True:
                    path_tmp = tempfile.NamedTemporaryFile(mode='w+',
                                                           suffix='.npy',
                                                           delete=False)
                    mat_path_tmp = str(path_tmp.name)
                    out_folder = f"{str(Path.home())}/test_mats"
                    os.makedirs(out_folder, exist_ok=True)

                    if modality == 'func':
                        mat_path = (f"{out_folder}/graph_sub-999_modality-func_"
                                    f"model-corr_template-MNI152_2mm_"
                                    f"parc_tol-6fwhm_hpass-0Hz_"
                                    f"signal-mean_thrtype-prop_thr-"
                                    f"{round(random.uniform(0, 1), 2)}.npy")
                    elif modality == 'dwi':
                        mat_path = (f"{out_folder}/graph_sub-999_modality-dwi_"
                                    f"model-csa_template-MNI152_2mm_"
                                    f"tracktype-local_traversal-det_"
                                    f"minlength-30_tol-5_thrtype-prop_thr-"
                                    f"{round(random.uniform(0, 1), 2)}.npy")
                    else:
                        raise ValueError(f"modality {modality} not recognized!")

                    shutil.copyfile(mat_path_tmp, mat_path)
                    np.save(mat_path, mat)
                    mat_file_list.append(mat_path)
                    path_tmp.close()

        return {'mat_list': mat_list, 'mat_file_list': mat_file_list}
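As above, a quick usage sketch (in-memory only, so nothing is written to ~/test_mats; values are illustrative):

out = _gen_mat_data(n=30, p=0.3, mat_type='sb', asfile=False,
                    n_graphs=2, lcc=True)
print(len(out['mat_list']))   # 2 symmetric weighted matrices
print(out['mat_file_list'])   # [] because asfile=False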
Example #4
def omnibus_embedding_pairwise(
    graphs: List[Union[nx.Graph, nx.OrderedGraph, nx.DiGraph,
                       nx.OrderedDiGraph]],
    dimensions: int = 100,
    elbow_cut: Optional[int] = None,
    svd_solver_algorithm: SvdAlgorithmType = "randomized",
    svd_solver_iterations: int = 5,
    svd_seed: Optional[int] = None,
    weight_attribute: str = "weight",
    use_laplacian: bool = False,
) -> List[Tuple[Embeddings, Embeddings]]:
    """
    Generates a pairwise omnibus embedding for each pair of graphs in a list
    of graphs using the adjacency matrix. If given graphs A, B, and C, the
    embeddings will be computed for (A, B) and (B, C).

    If the node labels differ between each pair of graphs, then those nodes will only be found in the resulting embedding
    if they exist in the largest connected component of the union of all edges across all graphs in the time series.

    Graphs will always have their diagonal augmented. In other words, a self-loop
    will be created for each node with a weight corresponding to the weighted degree.

    Lastly, all weights will be rescaled based on their relative rank in the graph,
    which is beneficial in minimizing anomalous results if some edge weights are
    extremely atypical of the rest of the graph.

    Parameters
    ----------
    graphs : List[Union[nx.Graph, nx.OrderedGraph, nx.DiGraph, nx.OrderedDiGraph]]
          A list of undirected or directed graphs. The graphs **must**:

          - be fully numerically weighted (every edge must have a real, numeric weight
            or else it will be treated as an unweighted graph)
          - be a basic graph (meaning it should not be a multigraph; if you have a
            multigraph you must first decide how you want to handle the weights of the
            edges between two nodes, whether summed, averaged, last-wins,
            maximum-weight-only, etc)
    dimensions : int (default=100)
          Dimensions to use for the svd solver.
          For undirected graphs, if ``elbow_cut==None``, you will receive an embedding
          that has ``nodes`` rows and ``dimensions`` columns.
          For directed graphs, if ``elbow_cut==None``, you will receive an embedding that
          has ``nodes`` rows and ``2*dimensions`` columns.
          If ``elbow_cut`` is specified to be not ``None``, we will cut the embedding at
          ``elbow_cut`` elbow, but the provided ``dimensions`` will be used in the
          creation of the SVD.
    elbow_cut : Optional[int] (default=None)
          Using a process described by Zhu & Ghodsi in their paper "Automatic
          dimensionality selection from the scree plot via the use of profile likelihood",
          truncate the dimensionality of the return on the ``elbow_cut``-th elbow.
          By default this value is ``None`` but can be used to reduce the dimensionality
          of the returned tensors.
    svd_solver_algorithm : str (default="randomized")
          allowed values: {'randomized', 'full', 'truncated'}

          SVD solver to use:

              - 'randomized'
                  Computes randomized svd using
                  :func:`sklearn.utils.extmath.randomized_svd`
              - 'full'
                  Computes full svd using :func:`scipy.linalg.svd`
                  Does not support ``graph`` input of type scipy.sparse.csr_matrix
              - 'truncated'
                  Computes truncated svd using :func:`scipy.sparse.linalg.svds`
    svd_solver_iterations : int (default=5)
          Number of iterations for randomized SVD solver. Not used by 'full' or
          'truncated'. The default is larger than the default in randomized_svd
          to handle sparse matrices that may have large slowly decaying spectrum.
    svd_seed : Optional[int] (default=None)
          Used to seed the PRNG used in the ``randomized`` svd solver algorithm.
    weight_attribute : str (default="weight")
          The edge dictionary key that contains the weight of the edge.
    use_laplacian : bool (default=False)
          Determine whether to use the Laplacian matrix of each graph in order to
          calculate the omnibus embedding using the Laplacian spectral embedding
          technique.

    Returns
    -------
    List[Tuple[Embeddings, Embeddings]]

    Raises
    ------
    beartype.roar.BeartypeCallHintPepParamException if parameters do not match type hints
    ValueError if values are not within appropriate ranges or allowed values

    See Also
    --------
    graspologic.pipeline.embed.Embeddings
    graspologic.embed.OmnibusEmbed
    graspologic.embed.AdjacencySpectralEmbed
    graspologic.embed.select_svd

    References
    ----------
    .. [1] Levin, K., Athreya, A., Tang, M., Lyzinski, V., & Priebe, C. E. (2017,
         November). A central limit theorem for an omnibus embedding of multiple random
         dot product graphs. In Data Mining Workshops (ICDMW), 2017 IEEE International
         Conference on (pp. 964-967). IEEE.

    .. [2] Sussman, D.L., Tang, M., Fishkind, D.E., Priebe, C.E.  "A
         Consistent Adjacency Spectral Embedding for Stochastic Blockmodel Graphs,"
         Journal of the American Statistical Association, Vol. 107(499), 2012

    .. [3] Levin, K., Roosta-Khorasani, F., Mahoney, M. W., & Priebe, C. E. (2018).
          Out-of-sample extension of graph adjacency spectral embedding. PMLR: Proceedings
          of Machine Learning Research, 80, 2975-2984.

    .. [4] Zhu, M. and Ghodsi, A. (2006). Automatic dimensionality selection from the
          scree plot via the use of profile likelihood. Computational Statistics & Data
          Analysis, 51(2), pp.918-930.
    """
    check_argument(len(graphs) > 1, "more than one graph is required")

    check_argument(dimensions >= 1, "dimensions must be positive")

    check_argument(elbow_cut is None or elbow_cut >= 1,
                   "elbow_cut must be positive")

    check_argument(
        svd_solver_algorithm in __SVD_SOLVER_TYPES,
        f"svd_solver_algorithm must be one of the values in {','.join(__SVD_SOLVER_TYPES)}",
    )

    check_argument(svd_solver_iterations >= 1,
                   "svd_solver_iterations must be positive")

    check_argument(
        svd_seed is None or 0 <= svd_seed <= 2**32 - 1,
        "svd_seed must be a nonnegative, 32-bit integer",
    )

    used_weight_attribute = _graphs_precondition_checks(
        graphs, weight_attribute)
    perform_augment_diagonal = not use_laplacian

    graph_embeddings = []

    # create a graph that contains all nodes and edges across the entire corpus
    union_graph = graphs[0].copy()
    for graph in graphs[1:]:
        union_graph.add_edges_from(graph.edges())

    union_graph_lcc: Union[nx.Graph, nx.DiGraph, nx.OrderedGraph,
                           nx.OrderedDiGraph] = largest_connected_component(
                               union_graph)
    union_graph_lcc_nodes: Set[Any] = set(union_graph_lcc.nodes())

    union_node_ids = np.array(list(union_graph_lcc_nodes))

    previous_graph = graphs[0].copy()

    for graph in graphs[1:]:
        current_graph = graph.copy()

        # assure both graphs contain the exact same node set
        # by removing nodes or adding isolates as needed
        _sync_nodes(previous_graph, union_graph_lcc_nodes)
        _sync_nodes(current_graph, union_graph_lcc_nodes)

        # remove self loops, run pass to ranks and diagonal augmentation
        previous_graph_augmented = _augment_graph(
            previous_graph,
            union_graph_lcc_nodes,
            used_weight_attribute,
            perform_augment_diagonal=perform_augment_diagonal,
        )
        current_graph_augmented = _augment_graph(
            current_graph,
            union_graph_lcc_nodes,
            used_weight_attribute,
            perform_augment_diagonal=perform_augment_diagonal,
        )

        model = OmnibusEmbed(
            n_components=dimensions,
            n_elbows=None,  # we will do elbow cuts
            algorithm=svd_solver_algorithm,
            n_iter=svd_solver_iterations,
            check_lcc=False,
            diag_aug=False,
            concat=False,
            svd_seed=svd_seed,
            lse=use_laplacian,
        )

        previous_embedding, current_embedding = model.fit_transform(
            graphs=[previous_graph_augmented, current_graph_augmented])

        previous_embedding_cut = _elbow_cut_if_needed(elbow_cut,
                                                      graph.is_directed(),
                                                      model.singular_values_,
                                                      previous_embedding)

        current_embedding_cut = _elbow_cut_if_needed(elbow_cut,
                                                     graph.is_directed(),
                                                     model.singular_values_,
                                                     current_embedding)

        graph_embeddings.append((
            Embeddings(union_node_ids, previous_embedding_cut),
            Embeddings(union_node_ids, current_embedding_cut),
        ))

    return graph_embeddings
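For reference, a minimal usage sketch of the function above (toy graphs, small dimensions; not part of the library source):

import networkx as nx
from graspologic.pipeline.embed import omnibus_embedding_pairwise

# A toy "time series" of three graphs over the same node set
graphs = [nx.erdos_renyi_graph(50, 0.2, seed=s) for s in range(3)]
for g in graphs:
    nx.set_edge_attributes(g, 1.0, "weight")

# One (previous, current) pair of Embeddings per consecutive pair of graphs
pairs = omnibus_embedding_pairwise(graphs, dimensions=8)
print(len(pairs))  # 2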
Example #5
# `mg`, `edge_types`, and `p_upper_tstat` are defined earlier in the
# (not shown) analysis script.
import time

import numpy as np
import pandas as pd
from graspologic.models import DCEREstimator
from graspologic.utils import binarize, largest_connected_component


def to_largest_connected_component(adj, meta=None):
    # Restrict `adj` (and any row-aligned metadata) to the largest connected component
    adj, lcc_inds = largest_connected_component(adj, return_inds=True)
    if meta is not None:
        return adj, meta.iloc[lcc_inds]
    else:
        return adj
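A quick check of the helper above (toy inputs):

adj = np.array([[0, 1, 0],
                [1, 0, 0],
                [0, 0, 0]])   # node 2 is an isolate
meta = pd.DataFrame({"label": ["a", "b", "c"]})
adj_lcc, meta_lcc = to_largest_connected_component(adj, meta)
print(adj_lcc.shape, list(meta_lcc["label"]))  # (2, 2) ['a', 'b']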
    "DCER": DCEREstimator(directed=True, loops=False, degree_directed=False),
}

rerun_test = False

if rerun_test:
    currtime = time.time()

    n_null_samples = 100
    statistics = []

    for edge_type in edge_types:
        print(f"Edge type = {edge_type}")
        edge_type_adj = mg.to_edge_type_graph(edge_type).adj
        edge_type_adj = binarize(edge_type_adj)
        edge_type_adj = largest_connected_component(edge_type_adj)
        tstat = p_upper_tstat(edge_type_adj)

        observed = pd.DataFrame(index=[0])
        observed["estimated_p_upper"] = tstat
        observed["edge_type"] = edge_type
        observed["null_model"] = "Observed"
        statistics.append(observed)

        # estimate null distribution via bootstrap sampling
        for null_name, NullEstimator in null_estimators.items():
            ne = NullEstimator.fit(edge_type_adj)

            def sampler():
                return np.squeeze(ne.sample())