Python Bunch.adjacencyの例、sknetwork.utils.Bunch.adjacency Pythonの例

コード例 #1

0

ファイルを表示

def load_adjacency_list(
    file: str,
    bipartite: bool = False,
    comment: str = '%#',
    delimiter: str = None,
) -> Bunch:
    """Load a graph stored as adjacency lists in a delimited text file (TSV, CSV, space-separated, ...).

    After the header, the k-th line of the file lists the neighbors of node k. Neighbor indices are shifted so
    that the smallest index found in the file becomes 0.

    Parameters
    ----------
    file : str
        Path to the dataset.
    bipartite : bool
        If ``True``, build a biadjacency matrix of shape (n1, n2) where n1 is the number of parsed lines;
        otherwise build a square adjacency matrix.
    comment : str
        Set of characters denoting lines to ignore (forwarded to ``scan_header``).
    delimiter : str
        Delimiter used in the file. If ``None``, the delimiter guessed by ``scan_header`` is used.

    Returns
    -------
    graph: :class:`Bunch`
        Bunch with attribute ``adjacency`` (or ``biadjacency`` if ``bipartite``).
    """
    header_len, guess_delimiter, _, _, _, _ = scan_header(file, comment)
    separator = guess_delimiter if delimiter is None else delimiter

    # CSR structure built incrementally: one offset per parsed line
    offsets, flat_neighbors = [0], []
    with open(file, 'r', encoding='utf-8') as stream:
        for _ in range(header_len):
            stream.readline()
        for line in stream:
            line_neighbors = [int(token) for token in line.split(separator)]
            flat_neighbors.extend(line_neighbors)
            offsets.append(offsets[-1] + len(line_neighbors))

    indices = np.array(flat_neighbors)
    n_rows = len(offsets) - 1
    lowest = np.min(indices)
    n_cols = np.max(indices) + 1 - lowest
    indices -= lowest
    weights = np.ones_like(indices, dtype=int)

    graph = Bunch()
    if bipartite:
        matrix = sparse.csr_matrix((weights, indices, np.array(offsets)),
                                   shape=(n_rows, n_cols))
        if max(matrix.data) == 1:
            matrix = matrix.astype(bool)
        graph.biadjacency = matrix
        return graph

    # Square matrix: nodes without their own line get an empty row (offset repeated)
    size = max(n_rows, n_cols)
    padded_offsets = np.full(size + 1, offsets[-1])
    padded_offsets[:len(offsets)] = offsets
    matrix = sparse.csr_matrix((weights, indices, padded_offsets),
                               shape=(size, size))
    if max(matrix.data) == 1:
        matrix = matrix.astype(bool)
    graph.adjacency = matrix
    return graph

コード例 #2

0

ファイルを表示

ファイル: models.py プロジェクト: vishalbelsare/scikit-network

def star(n_branches: int = 3,
         metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Star (undirected).

    Node 0 is the center; nodes 1, ..., n_branches are the branches.

    Parameters
    ----------
    n_branches : int
        Number of branches.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata (positions).

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).
        Positions have one row per node: the center at the origin, the branches on the unit circle.

    Example
    -------
    >>> from sknetwork.data import star
    >>> adjacency = star()
    >>> adjacency.shape
    (4, 4)
    """
    edges = [(0, i + 1) for i in range(n_branches)]
    adjacency = edgelist2adjacency(edges, undirected=True)
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    angles = 2 * np.pi * np.arange(n_branches) / n_branches
    branch_position = np.vstack([np.cos(angles), np.sin(angles)]).T
    # The graph has n_branches + 1 nodes: prepend the center (node 0) so that
    # row k of the positions matches node k of the adjacency matrix.
    graph.position = np.vstack([[0., 0.], branch_position])
    return graph

コード例 #3

0

ファイルを表示

ファイル: toy_graphs.py プロジェクト: vishalbelsare/scikit-network

def painters(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Wikipedia links between a few famous painters.

    * Directed graph
    * 14 nodes, 50 edges
    * Names of painters

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (names, positions).

    Example
    -------
    >>> from sknetwork.data import painters
    >>> adjacency = painters()
    >>> adjacency.shape
    (14, 14)
    """
    # Edge k goes from sources[k] to targets[k]
    sources = np.array([
        0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
        12, 12, 12, 12, 13, 13
    ])
    targets = np.array([
        3, 10, 3, 12, 9, 0, 1, 7, 11, 12, 2, 5, 9, 2, 4, 8, 9, 0, 13, 1, 2, 3,
        8, 11, 12, 0, 1, 4, 5, 7, 10, 11, 2, 4, 0, 3, 8, 11, 12, 0, 1, 3, 10,
        12, 1, 3, 4, 7, 6, 8
    ])
    links = np.ones(len(sources), dtype=bool)
    adjacency = sparse.csr_matrix((links, (sources, targets)), shape=(14, 14))

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.names = np.array([
        'Pablo Picasso', 'Claude Monet', 'Michel Angelo', 'Edouard Manet',
        'Peter Paul Rubens', 'Rembrandt', 'Gustav Klimt', 'Edgar Degas',
        'Vincent van Gogh', 'Leonardo da Vinci', 'Henri Matisse',
        'Paul Cezanne', 'Pierre-Auguste Renoir', 'Egon Schiele'
    ])
    abscissa = np.array([
        0.24, -0.47, -0.3, -0.31, -0.08, 0.12, 0.78, -0.36, 0.11, -0.06,
        -0.02, -0.12, -0.24, 0.73
    ])
    ordinate = np.array([
        0.53, 0.19, -0.71, 0.44, -0.48, -0.65, 0.69, -0.11, 0.01, -1.,
        0.49, 0.28, 0.06, 0.27
    ])
    # One (x, y) row per node
    graph.position = np.vstack((abscissa, ordinate)).T
    graph.name = 'painters'
    return graph

コード例 #4

0

ファイルを表示

def cyclic_digraph(n: int = 3,
                   metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Directed cycle: node i points to node i + 1, and the last node points back to node 0.

    Parameters
    ----------
    n : int
        Number of nodes.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import cyclic_digraph
    >>> adjacency = cyclic_digraph(5)
    >>> adjacency.shape
    (5, 5)
    """
    sources = np.arange(n)
    # Successor of each node; the trailing 0 closes the cycle
    targets = np.append(np.arange(1, n), 0)
    weights = np.ones(sources.size, dtype=int)
    adjacency = sparse.csr_matrix((weights, (sources, targets)), shape=(n, n))

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = cyclic_position(n)
    return graph

コード例 #5

0

ファイルを表示

ファイル: load.py プロジェクト: brunoasouza/scikit-network

def save(folder: str, data: Union[sparse.csr_matrix, Bunch]):
    """Store a Bunch or a CSR matrix as a bundle of Numpy and Pickle files, for faster subsequent loads.

    An existing folder with the same name is deleted first. A bare CSR matrix is wrapped in a `Bunch`, as
    ``adjacency`` if it is square and as ``biadjacency`` otherwise.

    Parameters
    ----------
    folder : str
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    target = expanduser(folder)
    if exists(target):
        shutil.rmtree(target)

    if isinstance(data, sparse.csr_matrix):
        attribute = 'adjacency' if is_square(data) else 'biadjacency'
        wrapper = Bunch()
        setattr(wrapper, attribute, data)
        data = wrapper

    # Relative folders are resolved against the current directory
    prefix = '' if isabs(target) else './'
    save_to_numpy_bundle(data, target, prefix)

コード例 #6

0

ファイルを表示

def watts_strogatz(n: int = 100, degree: int = 6, prob: float = 0.05, metadata: bool = False) \
    -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Start from a ring where each node is linked to its ``degree // 2`` successors (and hence predecessors),
    then visit each node and, for each of its edges, replace it with probability ``prob`` by an edge towards
    a node that is not yet a neighbor. The number of edges is preserved by the rewiring.

    Parameters
    ----------
    n : int
        Number of nodes.
    degree : int
        Initial degree of nodes.
    prob : float
        Probability of edge modification.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    edges = np.array([(i, (i + j + 1) % n) for i in range(n)
                      for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)),
                                  shape=(n, n))
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    all_nodes = set(np.arange(n))
    for i in range(n):
        # Snapshot of the neighbors: the assignments below edit adjacency.rows[i] in place,
        # so iterating over the live list would skip neighbors or revisit new ones.
        neighbors = list(adjacency.rows[i])
        candidates = list(all_nodes - set(neighbors) - {i})
        for j in neighbors:
            if not candidates:
                # No admissible target left for node i
                break
            if np.random.random() < prob:
                node = np.random.choice(candidates)
                # A target can be used only once, otherwise an edge would be lost
                candidates.remove(node)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency)
    if not metadata:
        return adjacency

    # Nodes evenly spaced on the unit circle
    t = 2 * np.pi * np.arange(n).astype(float) / n
    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = np.array((np.cos(t), np.sin(t))).T
    return graph

コード例 #7

0

ファイルを表示

ファイル: toy_graphs.py プロジェクト: vishalbelsare/scikit-network

def house(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """House graph.

    * Undirected graph
    * 5 nodes, 6 edges

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> adjacency.shape
    (5, 5)

    """
    # Each edge listed once; symmetrized below
    edges = np.array([(0, 1), (0, 4), (1, 2), (1, 4), (2, 3), (3, 4)])
    sources, targets = edges[:, 0], edges[:, 1]
    half = sparse.csr_matrix((np.ones(len(sources), dtype=int), (sources, targets)),
                             shape=(5, 5))
    adjacency = (half + half.T).astype(bool)

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    abscissa = np.array([0, -1, -1, 1, 1])
    ordinate = np.array([2, 1, -1, -1, 1])
    graph.position = np.array([abscissa, ordinate]).T
    graph.name = 'house'
    return graph

コード例 #8

0

ファイルを表示

def grid(n1: int = 10,
         n2: int = 10,
         metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Two-dimensional grid (undirected).

    Parameters
    ----------
    n1, n2 : int
        Grid dimension.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import grid
    >>> adjacency = grid(10, 5)
    >>> adjacency.shape
    (50, 50)
    """
    # Nodes are numbered along the second coordinate first: (i1, i2) -> i1 * n2 + i2
    coordinates = [(i1, i2) for i1 in range(n1) for i2 in range(n2)]
    along_first_axis = [(i1 * n2 + i2, (i1 + 1) * n2 + i2)
                        for i1 in range(n1 - 1) for i2 in range(n2)]
    along_second_axis = [(i1 * n2 + i2, i1 * n2 + i2 + 1)
                         for i1 in range(n1) for i2 in range(n2 - 1)]
    adjacency = edgelist2adjacency(along_first_axis + along_second_axis, undirected=True)

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = np.array(coordinates)
    return graph

コード例 #9

0

ファイルを表示

def save(folder: Union[str, Path], data: Union[sparse.csr_matrix, Bunch]):
    """Store a Bunch or a CSR matrix as a bundle of Numpy and Pickle files, for faster subsequent loads.
    Supported attribute types include sparse matrices, NumPy arrays, strings and Bunch.

    An existing folder with the same name is deleted first. A bare CSR matrix is wrapped in a `Bunch`, as
    ``adjacency`` if it is square and as ``biadjacency`` otherwise.

    Parameters
    ----------
    folder : str or :class:`pathlib.Path`
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    folder = Path(folder).expanduser()
    if folder.exists():
        shutil.rmtree(folder)

    if isinstance(data, sparse.csr_matrix):
        attribute = 'adjacency' if is_square(data) else 'biadjacency'
        wrapper = Bunch()
        setattr(wrapper, attribute, data)
        data = wrapper

    # Base location handed to the bundle writer: filesystem root or current directory
    base = '/' if folder.is_absolute() else '.'
    save_to_numpy_bundle(data, folder, base)

コード例 #10

0

ファイルを表示

ファイル: loading.py プロジェクト: Zhao-hangtian/scikit-network

def load_konect_dataset(dataset_name: str, data_home: Optional[str] = None, auto_numpy_bundle: bool = True):
    """
    Loads a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    The archive is downloaded and unpacked on first use. On later calls the local copy is used, through the
    Numpy bundle if one was saved.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home: str
        The folder to be used for dataset storage
    auto_numpy_bundle: bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster subsequent
        access to the dataset (True).

    Returns
    -------
    data: :class:`Bunch`
        An object with the following attributes:

         * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
         * `meta`: a dictionary containing the metadata as specified by Konect
         * any attribute described in an ent.* file

    Raises
    ------
    ValueError
        If the download fails with an HTTP error (e.g. unknown dataset name).
    """
    from shutil import rmtree  # local import, only needed to clean up after a failed download

    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    archive = data_home + '/' + dataset_name + '.tar.bz2'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.uni-koblenz.de/downloads/tsv/' + dataset_name + '.tar.bz2', archive)
        except HTTPError as error:
            # Remove the folder created above: if it were left behind, the next call would
            # consider the dataset as already downloaded and return an empty Bunch.
            rmtree(data_path, ignore_errors=True)
            raise ValueError('Invalid dataset ' + dataset_name) from error
        # NOTE(review): extractall trusts the member paths of a downloaded archive;
        # consider validating them (or using an extraction filter) before unpacking.
        with tarfile.open(archive, 'r:bz2') as tar_ref:
            tar_ref.extractall(data_home)
        remove(archive)
    elif exists(data_path + '/' + dataset_name + '_bundle'):
        return load_from_numpy_bundle(dataset_name + '_bundle', data_path)

    data = Bunch()
    files = [file for file in listdir(data_path) if dataset_name in file]

    # Edge list (out.* file): only the first match is used
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        parsed = parse_tsv(data_path + file, directed=directed, bipartite=bipartite, weighted=weighted)[0]
        if bipartite:
            data.biadjacency = parsed
        else:
            data.adjacency = parsed

    # Metadata (meta.* file): only the first match is used
    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        data.meta = parse_metadata(data_path + metadata[0])

    # Node attributes (ent.* files): the attribute name is the file extension
    attributes = [file for file in files if 'ent.' + dataset_name in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = parse_labels(data_path + file)

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset_name + '_bundle', data_path)

    return data

コード例 #11

0

ファイルを表示

def watts_strogatz(n: int = 100,
                   degree: int = 6,
                   prob: float = 0.05,
                   seed: Optional[int] = None,
                   metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Start from a ring where each node is linked to its ``degree // 2`` successors (and hence predecessors),
    then visit each node and, for each of its edges, replace it with probability ``prob`` by an edge towards
    a node that is not yet a neighbor. The number of edges is preserved by the rewiring.

    Parameters
    ----------
    n :
        Number of nodes.
    degree :
        Initial degree of nodes.
    prob :
        Probability of edge modification.
    seed :
        Seed of the random generator (optional). It is passed to ``np.random.seed``, i.e. it sets the
        global NumPy random state.
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    np.random.seed(seed)
    edges = np.array([(i, (i + j + 1) % n) for i in range(n)
                      for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)),
                                  shape=(n, n))
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    all_nodes = set(np.arange(n))
    for i in range(n):
        # Snapshot of the neighbors: the assignments below edit adjacency.rows[i] in place,
        # so iterating over the live list would skip neighbors or revisit new ones.
        neighbors = list(adjacency.rows[i])
        candidates = list(all_nodes - set(neighbors) - {i})
        for j in neighbors:
            if not candidates:
                # No admissible target left for node i
                break
            if np.random.random() < prob:
                node = np.random.choice(candidates)
                # A target can be used only once, otherwise an edge would be lost
                candidates.remove(node)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency, shape=adjacency.shape)
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = cyclic_position(n)
    return graph

コード例 #12

0

ファイルを表示

def load_konect(dataset: str, data_home: Optional[Union[str, Path]] = None, auto_numpy_bundle: bool = True,
                verbose: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.cc/networks/>`_.

    The archive is downloaded and unpacked on first use. On later calls the local copy is used, through the
    Numpy bundle if one was saved.

    Parameters
    ----------
    dataset : str
        The internal name of the dataset as specified on the Konect website (e.g. for the Zachary Karate club dataset,
        the corresponding name is ``'ucidata-zachary'``).
    data_home : str or :class:`pathlib.Path`
        The folder to be used for dataset storage
    auto_numpy_bundle : bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster
        subsequent access to the dataset (True).
    verbose : bool
        Enable verbosity.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Raises
    ------
    ValueError
        If no dataset name is given, or if the download fails with an HTTP error or the archive is unreadable.
    RuntimeError
        If Konect cannot be reached.

    Notes
    -----
    An attribute `meta` of the `Bunch` class is used to store information about the dataset if present. In any case,
    `meta` has the attribute `name` which, if not given, is equal to the name of the dataset as passed to this function.

    References
    ----------
    Kunegis, J. (2013, May).
    `Konect: the Koblenz network collection.
    <https://dl.acm.org/doi/abs/10.1145/2487788.2488173>`_
    In Proceedings of the 22nd International Conference on World Wide Web (pp. 1343-1350).
    """
    logger = Log(verbose)
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         "\nExamples include 'actor-movie' and 'ego-facebook'."
                         "\n See 'http://konect.cc/networks/' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    archive = data_home / (dataset + '.tar.bz2')
    bundle_name = dataset + '_bundle'

    if not data_path.exists():
        logger.print('Downloading', dataset, 'from Konect...')
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.cc/files/download.tsv.' + dataset + '.tar.bz2', archive)
            with tarfile.open(archive, 'r:bz2') as tar_ref:
                logger.print('Unpacking archive...')
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             "\nExamples include 'actor-movie' and 'ego-facebook'."
                             "\n See 'http://konect.cc/networks/' for the full list.")
        except (URLError, ConnectionResetError):  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Konect.")
        finally:
            # The archive is removed whether or not the download succeeded
            if exists(archive):
                remove(archive)
    elif exists(data_path / bundle_name):
        logger.print('Loading from local bundle...')
        return load_from_numpy_bundle(bundle_name, data_path)

    data = Bunch()

    candidates = [name for name in listdir(data_path) if dataset in name]
    logger.print('Parsing files...')

    # Edge list (out.* file): only the first match is used
    edge_files = [name for name in candidates if 'out.' in name]
    if edge_files:
        edge_path = data_path / edge_files[0]
        directed, bipartite, weighted = load_header(edge_path)
        graph = load_edge_list(edge_path, directed=directed, bipartite=bipartite, weighted_input=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    # Metadata (meta.* file): only the first match is used
    meta_files = [name for name in candidates if 'meta.' in name]
    if meta_files:
        data.meta = load_metadata(data_path / meta_files[0])

    # Node attributes (ent.* files): the attribute name is the file extension
    for name in [name for name in candidates if 'ent.' + dataset in name]:
        data[name.split('.')[-1]] = load_labels(data_path / name)

    # Make sure that meta.name is always available
    if hasattr(data, 'meta'):
        if not hasattr(data.meta, 'name'):
            data.meta.name = dataset
    else:
        data.meta = Bunch()
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, bundle_name, data_path)

    return data

コード例 #13

0

ファイルを表示

ファイル: models.py プロジェクト: brunoasouza/scikit-network

def block_model(sizes: np.ndarray,
                p_in: Union[float, list, np.ndarray] = .2,
                p_out: float = .05,
                seed: Optional[int] = None,
                metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Stochastic block model.

    Parameters
    ----------
    sizes :
         Block sizes.
    p_in :
        Probability of connection within blocks (one value for all blocks, or one value per block).
    p_out :
        Probability of connection across blocks (must not exceed **p_in**).
    seed : Optional[int]
        Random seed. It is passed to ``np.random.seed``, i.e. it sets the global NumPy random state.
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels).

    Raises
    ------
    ValueError
        If **p_out** is larger than the smallest value of **p_in**.

    Example
    -------
    >>> from sknetwork.data import block_model
    >>> sizes = np.array([4, 5])
    >>> adjacency = block_model(sizes)
    >>> adjacency.shape
    (9, 9)

    References
    ----------
    Airoldi, E.,  Blei, D., Feinberg, S., Xing, E. (2007).
    `Mixed membership stochastic blockmodels. <https://arxiv.org/abs/0803.0476>`_
    Journal of Machine Learning Research.
    """
    np.random.seed(seed)
    sizes = np.array(sizes)

    # A scalar (or list) is expanded to one probability per block
    if not isinstance(p_in, np.ndarray):
        p_in = p_in * np.ones_like(sizes)
    if p_in.min() < p_out:
        raise ValueError(
            'The probability of connection across blocks p_out must be less than the probability of '
            'connection within a block p_in.')

    # each edge is considered twice
    p_in = p_in / 2
    p_out = p_out / 2

    # Within-block entries are drawn on top of the background drawn with density p_out
    p_diff = p_in - p_out
    blocks_in = [(sparse.random(s, s, p_diff[k]) > 0)
                 for k, s in enumerate(sizes)]
    adjacency_in = sparse.block_diag(blocks_in)
    n = sizes.sum()
    adjacency_out = sparse.random(n, n, p_out) > 0
    adjacency = sparse.lil_matrix(adjacency_in + adjacency_out)
    # No self-loops; then symmetrize to get an undirected graph
    adjacency.setdiag(0)
    adjacency = adjacency + adjacency.T
    adjacency = sparse.csr_matrix(adjacency).astype(int)

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    # Label of a node = index of its block
    graph.labels = np.repeat(np.arange(len(sizes)), sizes)
    return graph

コード例 #14

0

ファイルを表示

def load_netset(dataset: str, data_home: Optional[str] = None) -> Bunch:
    """Load a dataset from the `NetSets database
    <https://graphs.telecom-paristech.fr/>`_.

    The archive is downloaded and unpacked on first use; every known ``.npz`` / ``.npy`` file found in the
    dataset folder becomes an attribute of the returned object.

    Parameters
    ----------
    dataset : str
        The name of the dataset (all low-case). Examples include 'openflights', 'cinema' and 'wikivitals'.
    data_home : str
        The folder to be used for dataset storage.

    Returns
    -------
    graph : :class:`Bunch`

    Raises
    ------
    ValueError
        If no dataset name is given, or if the download fails with an HTTP error.

    Example
    -------
    >>> from sknetwork.data import load_netset
    >>> graph = load_netset('openflights')
    >>> graph.adjacency.shape
    (3097, 3097)
    """
    if dataset == '':
        raise ValueError(
            "Please specify the dataset (e.g., 'openflights' or 'wikivitals')."
        )
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'

    if not exists(data_path):
        archive = data_home + '/' + dataset + '_npz.tar.gz'
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve("https://graphs.telecom-paristech.fr/datasets_npz/" + dataset + '_npz.tar.gz',
                        archive)
        except HTTPError:
            rmdir(data_home + '/' + dataset)
            raise ValueError(
                'Invalid dataset ' + dataset + '.'
                "\nAvailable datasets include 'openflights' and 'wikivitals'."
                "\nSee <https://graphs.telecom-paristech.fr/>")
        with tarfile.open(archive, 'r:gz') as tar_ref:
            tar_ref.extractall(data_home)
        remove(archive)

    graph = Bunch()
    available = listdir(data_path)

    # Sparse matrices (.npz)
    for attribute in ('adjacency', 'biadjacency'):
        if attribute + '.npz' in available:
            setattr(graph, attribute, sparse.load_npz(data_path + '/' + attribute + '.npz'))

    # Dense arrays (.npy)
    array_attributes = (
        'names', 'names_row', 'names_col',
        'labels', 'labels_row', 'labels_col', 'labels_hierarchy',
        'names_labels', 'names_labels_hierarchy',
        'position',
    )
    for attribute in array_attributes:
        if attribute + '.npy' in available:
            setattr(graph, attribute, np.load(data_path + '/' + attribute + '.npy'))

    meta = Bunch()
    meta.name = dataset
    graph.meta = meta

    return graph

コード例 #15

0

ファイルを表示

ファイル: parse.py プロジェクト: vintasoftware/scikit-network

def load_graphml(file: str,
                 weight_key: str = 'weight',
                 max_string_size: int = 512) -> Bunch:
    """Parse GraphML datasets.

    Hyperedges and nested graphs are not supported.

    Parameters
    ----------
    file: str
        The path to the dataset
    weight_key: str
        The ``attr.name`` of the key whose values are used as edge weights
    max_string_size: int
        The maximum size for string features of the data

    Returns
    -------
    data: :class:`Bunch`
        The dataset in a bunch with the adjacency as a CSR matrix. Depending on the file content, the bunch
        also holds ``names`` (node ids, unless ``parse.nodeids`` is ``canonical``), ``node_attribute``,
        ``edge_attribute`` and ``meta`` (file and attribute descriptions).

    Raises
    ------
    ValueError
        If the file does not contain any ``graph`` element.
    """
    # see http://graphml.graphdrawing.org/primer/graphml-primer.html
    # and http://graphml.graphdrawing.org/specification/dtd.html#top
    root = ElementTree.parse(file).getroot()
    n_nodes = 0
    n_edges = 0
    symmetrize = None
    naming_nodes = True
    default_weight = 1
    weight_type = bool
    weight_id = None
    # positions of the node and edge elements among the children of the graph element
    node_indices = []
    edge_indices = []
    data = Bunch()
    graph = None
    file_description = None
    attribute_descriptions = Bunch()
    attribute_descriptions.node = Bunch()
    attribute_descriptions.edge = Bunch()
    # key id -> [attribute name, Python type] for every non-weight key
    keys = {}

    # First pass: locate the graph and count nodes and edges so that arrays can be pre-allocated.
    # Tags are matched with ``endswith`` because ElementTree prefixes them with '{namespace}'.
    for file_element in root:
        if not file_element.tag.endswith('graph'):
            continue
        graph = file_element
        symmetrize = (graph.attrib['edgedefault'] == 'undirected')
        for index, element in enumerate(graph):
            if element.tag.endswith('node'):
                node_indices.append(index)
                n_nodes += 1
            elif element.tag.endswith('edge'):
                edge_indices.append(index)
                # an explicit 'directed' attribute overrides the graph-level default
                if 'directed' in element.attrib:
                    undirected = element.attrib['directed'] != 'true'
                else:
                    undirected = symmetrize
                # undirected edges are stored once per direction
                n_edges += 2 if undirected else 1
        if 'parse.nodeids' in graph.attrib:
            naming_nodes = graph.attrib['parse.nodeids'] != 'canonical'

    # Second pass: read the key declarations (attribute schema) and the file description.
    for file_element in root:
        if file_element.tag.endswith('key'):
            attribute_name = file_element.attrib['attr.name']
            attribute_type = java_type_to_python_type(
                file_element.attrib['attr.type'])
            if attribute_name == weight_key:
                weight_type = attribute_type
                weight_id = file_element.attrib['id']
                for key_element in file_element:
                    # endswith (not ==) so that a namespaced '{...}default' tag is recognised,
                    # as for the other attributes below
                    if key_element.tag.endswith('default'):
                        default_weight = attribute_type(key_element.text)
            else:
                default_value = None
                domain = file_element.attrib['for']
                if domain in ('node', 'edge'):
                    size = n_nodes if domain == 'node' else n_edges
                    container = domain + '_attribute'
                    if container not in data:
                        data[container] = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions[domain][
                                attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        # fixed-width unicode dtype
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value is not None:
                        data[container][attribute_name] = np.full(
                            size, default_value, dtype=local_type)
                    else:
                        data[container][attribute_name] = np.zeros(
                            size, dtype=local_type)
                keys[file_element.attrib['id']] = [
                    attribute_name, attribute_type
                ]
        elif file_element.tag.endswith('desc'):
            file_description = file_element.text

    has_attribute_descriptions = bool(attribute_descriptions.node
                                      or attribute_descriptions.edge)
    if file_description or has_attribute_descriptions:
        data.meta = Bunch()
        if file_description:
            data.meta['description'] = file_description
        if has_attribute_descriptions:
            data.meta['attributes'] = attribute_descriptions

    if graph is None:
        raise ValueError(f'No graph defined in {file}.')

    row = np.zeros(n_edges, dtype=int)
    col = np.zeros(n_edges, dtype=int)
    dat = np.full(n_edges, default_weight, dtype=weight_type)
    data.names = None
    if naming_nodes:
        data.names = np.zeros(n_nodes, dtype='<U512')

    node_map = {}
    # deal with nodes first
    for number, index in enumerate(node_indices):
        node = graph[index]
        if naming_nodes:
            name = node.attrib['id']
            data.names[number] = name
            node_map[name] = number
        for node_attribute in node:
            if node_attribute.tag.endswith('data'):
                attribute_name, cast = keys[node_attribute.attrib['key']]
                data.node_attribute[attribute_name][number] = cast(
                    node_attribute.text)

    def store_edge(position, source, target, edge_element):
        """Write one directed edge and its data elements at the given position."""
        row[position] = source
        col[position] = target
        for edge_attribute in edge_element:
            if edge_attribute.tag.endswith('data'):
                key_id = edge_attribute.attrib['key']
                if key_id == weight_id:
                    dat[position] = weight_type(edge_attribute.text)
                else:
                    attribute_name, cast = keys[key_id]
                    data.edge_attribute[attribute_name][position] = cast(
                        edge_attribute.text)

    # deal with edges
    edge_index = 0
    for index in edge_indices:
        edge = graph[index]
        if naming_nodes:
            node1 = node_map[edge.attrib['source']]
            node2 = node_map[edge.attrib['target']]
        else:
            # canonical ids: the number follows a one-character prefix
            node1 = int(edge.attrib['source'][1:])
            node2 = int(edge.attrib['target'][1:])
        store_edge(edge_index, node1, node2, edge)
        edge_index += 1
        if 'directed' in edge.attrib:
            duplicate = edge.attrib['directed'] != 'true'
        else:
            duplicate = bool(symmetrize)
        if duplicate:
            # undirected edge: also store the reverse direction with the same attributes
            store_edge(edge_index, node2, node1, edge)
            edge_index += 1

    data.adjacency = sparse.csr_matrix((dat, (row, col)),
                                       shape=(n_nodes, n_nodes))
    if data.names is None:
        data.pop('names')
    return data

コード例 #16

0

ファイルを表示

def block_model(sizes: Iterable, p_in: Union[float, list, np.ndarray] = .2, p_out: float = .05,
                random_state: Optional[int] = None, metadata: bool = False) \
                -> Union[sparse.csr_matrix, Bunch]:
    """Stochastic block model.

    Parameters
    ----------
    sizes :
         Block sizes.
    p_in :
        Probability of connection within blocks. Either a single value shared by all blocks
        or one value per block.
    p_out :
        Probability of connection across blocks.
    random_state :
        Seed of the random generator (optional). Note that this seeds NumPy's global generator.
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels).

    Example
    -------
    >>> from sknetwork.data import block_model
    >>> sizes = np.array([4, 5])
    >>> adjacency = block_model(sizes)
    >>> adjacency.shape
    (9, 9)

    References
    ----------
    Airoldi, E.,  Blei, D., Feinberg, S., Xing, E. (2007).
    `Mixed membership stochastic blockmodels. <https://arxiv.org/pdf/0705.4485.pdf>`_
    Journal of Machine Learning Research.
    """
    np.random.seed(random_state)
    sizes = np.array(sizes)

    if np.ndim(p_in) == 0:
        # any scalar (Python or NumPy, int or float) is shared by all blocks
        p_in = np.full(sizes.shape, p_in, dtype=float)
    else:
        p_in = np.array(p_in)

    # each edge is considered twice
    p_in = p_in / 2

    # Only the upper block triangle is sampled; the lower part is left empty (None)
    # and the matrix is made undirected afterwards.
    blocks = []
    for i, n_row in enumerate(sizes):
        block_row = []
        for j, n_col in enumerate(sizes):
            if j < i:
                block_row.append(None)
            elif j > i:
                block_row.append(sparse.random(n_row, n_col, p_out, dtype=bool))
            else:
                block_row.append(sparse.random(n_row, n_row, p_in[i], dtype=bool))
        blocks.append(block_row)
    adjacency = sparse.bmat(blocks)
    # no self-loops
    adjacency.setdiag(0)
    adjacency = directed2undirected(adjacency.tocsr(), weighted=False)

    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    # label of a node = index of its block
    graph.labels = np.repeat(np.arange(len(sizes)), sizes)
    return graph

コード例 #17

0

ファイルを表示

def parse_tsv(file: str,
              directed: bool = False,
              bipartite: bool = False,
              weighted: Optional[bool] = None,
              named: Optional[bool] = None,
              comment: str = '%#',
              delimiter: str = None,
              reindex: bool = True) -> Bunch:
    """Parser for Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets.

    Parameters
    ----------
    file : str
        The path to the dataset in TSV format
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : Optional[bool]
        Retrieves the weights in the third field of the file. None makes a guess based on the first lines.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        delimiter used in the file. None makes a guess
    reindex : bool
        If ``True``, numeric node ids that are not exactly ``0, ..., n-1`` are renumbered and the size of the
        adjacency is the number of distinct nodes. If ``False``, the size is determined by the maximum node id.
        Does not work for bipartite graphs.

    Returns
    -------
    graph: :class:`Bunch`
        Holds ``adjacency`` (or ``biadjacency``) and, when nodes are named or renumbered, the ``names`` arrays.

    Raises
    ------
    ValueError
        If the file has no data line right after its comment header.
    """
    reindexed = False
    possible_delimiters = ['\t', ',', ' ']
    del_count = np.zeros(len(possible_delimiters), dtype=int)
    lines = []
    with open(file, 'r', encoding='utf-8') as f:
        # skip the header, i.e. the leading lines starting with a comment character;
        # ``row`` is empty at end of file, which also ends the scan
        header_len = 0
        row = f.readline()
        while row and row[0] in comment:
            row = f.readline()
            header_len += 1
        # sample up to three lines to guess the format
        for _ in range(3):
            for i, poss_del in enumerate(possible_delimiters):
                if poss_del in row:
                    del_count[i] += 1
            lines.append(row.rstrip())
            row = f.readline()
    lines = [line for line in lines if line != '']
    if not lines:
        raise ValueError(f'No data found in {file}.')
    guess_delimiter = possible_delimiters[int(np.argmax(del_count))]
    # exactly one delimiter per line means two fields, hence no weight
    guess_weighted = bool(
        min(line.count(guess_delimiter) for line in lines) - 1)
    # nodes are considered named as soon as one of the first two fields is not a number
    guess_named = not all(
        all(el.strip().isdigit() for el in line.split(guess_delimiter)[:2])
        for line in lines)
    if weighted is None:
        weighted = guess_weighted
    if named is None:
        named = guess_named
    if delimiter is None:
        delimiter = guess_delimiter

    row, col, data = [], [], []
    with open(file, 'r', encoding='utf-8') as f:
        for _ in range(header_len):
            f.readline()
        for line in reader(f, delimiter=delimiter):
            # csv yields an empty list for a blank line; skip it along with comment rows
            if not line or line[0] in comment:
                continue
            if named:
                row.append(line[0])
                col.append(line[1])
            else:
                row.append(int(line[0]))
                col.append(int(line[1]))
            if weighted:
                data.append(float(line[2]))
    n_edges = len(row)

    graph = Bunch()
    if bipartite:
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = max(names_row) + 1
            n_col = max(names_col) + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)),
                                        shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # rows and columns share the same node set
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = max(names) + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # renumber only when the ids are not already 0, ..., n-1
            if not all(names == range(len(names))) and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=int)
        adjacency = sparse.csr_matrix((data, (row, col)),
                                      shape=(n_nodes, n_nodes))
        if not directed:
            adjacency += adjacency.T
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph

コード例 #18

0

ファイルを表示

ファイル: parse.py プロジェクト: vintasoftware/scikit-network

def from_edge_list(row: np.ndarray,
                   col: np.ndarray,
                   data: np.ndarray,
                   directed: bool = False,
                   bipartite: bool = False,
                   reindex: bool = True,
                   named: Optional[bool] = None) -> Bunch:
    """Turn an edge list given as a triplet of NumPy arrays into a :class:`Bunch`.

    Parameters
    ----------
    row : np.ndarray
        The array of sources in the graph.
    col : np.ndarray
        The array of targets in the graph.
    data : np.ndarray
        The array of weights in the graph. Pass an empty array for unweighted graphs.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    reindex : bool
        If ``True``, numeric node ids that do not span ``0, ..., n-1`` are renumbered and the size of the
        adjacency is the number of distinct nodes. If ``False``, the size is determined by the maximum node id.
        Does not work for bipartite graphs.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the dtype of ``row`` and ``col``: nodes are named unless both arrays have an integer dtype.

    Returns
    -------
    graph: :class:`Bunch`
        Holds ``adjacency`` (or ``biadjacency``) and, when nodes are named or renumbered, the ``names`` arrays.
    """
    reindexed = False
    if named is None:
        # accept every integer dtype (int32, uint8, ...), not only the platform default int
        integer_ids = (np.issubdtype(row.dtype, np.integer)
                       and np.issubdtype(col.dtype, np.integer))
        named = not integer_ids
    weighted = bool(len(data))
    n_edges = len(row)
    graph = Bunch()
    if bipartite:
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = names_row.max() + 1
            n_col = names_col.max() + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)),
                                        shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # sources and targets share the same node set
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = names.max() + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # names is sorted and unique, so checking both ends is enough to detect 0, ..., n-1
            should_reindex = not (names[0] == 0 and names[-1] == n_nodes - 1)
            if should_reindex and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        adjacency = sparse.csr_matrix((data, (row, col)),
                                      shape=(n_nodes, n_nodes))
        if not directed:
            adjacency = directed2undirected(adjacency, weighted=weighted)
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph

コード例 #19

0

ファイルを表示

ファイル: load.py プロジェクト: brunoasouza/scikit-network

def load_konect(dataset: str,
                data_home: Optional[str] = None,
                auto_numpy_bundle: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home : str
        The folder to be used for dataset storage
    auto_numpy_bundle : bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster
        subsequent access to the dataset (True).

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Raises
    ------
    ValueError
        If ``dataset`` is empty, or if the archive cannot be downloaded or read.

    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)

    Notes
    -----
    An attribute `meta` of the `Bunch` class is used to store information about the dataset if present. In any case,
    `meta` has the attribute `name` which, if not given, is equal to the name of the dataset as passed to this function.
    """
    if dataset == '':
        raise ValueError(
            "Please specify the dataset. " +
            "\nExamples include 'actor-movie' and 'ego-facebook'." +
            "\n See 'http://konect.uni-koblenz.de' for the full list.")
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        archive = data_home + '/' + dataset + '.tar.bz2'
        try:
            urlretrieve(
                'http://konect.uni-koblenz.de/downloads/tsv/' + dataset +
                '.tar.bz2', archive)
            with tarfile.open(archive, 'r:bz2') as tar_ref:
                # NOTE(review): extractall trusts the member paths of a downloaded archive
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError) as error:
            # drop the directory created above so that a later call retries the download
            rmdir(data_home + '/' + dataset)
            raise ValueError(
                'Invalid dataset ' + dataset + '.' +
                "\nExamples include 'actor-movie' and 'ego-facebook'." +
                "\n See 'http://konect.uni-koblenz.de' for the full list.") from error
        finally:
            # the archive does not exist when the download itself failed; an unconditional
            # remove would raise FileNotFoundError and hide the ValueError above
            if exists(archive):
                remove(archive)
    elif exists(data_path + '/' + dataset + '_bundle'):
        # a NumPy bundle saved by a previous call is available
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]

    # 'out.*' file: the edge list
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        graph = parse_tsv(data_path + file,
                          directed=directed,
                          bipartite=bipartite,
                          weighted=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    # 'meta.*' file: the metadata
    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = parse_metadata(data_path + file)

    # 'ent.<dataset>.*' files: one attribute per file, named after the file extension
    attributes = [file for file in files if 'ent.' + dataset in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = parse_labels(data_path + file)

    # make sure meta.name is always defined
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data

コード例 #20

0

ファイルを表示

ファイル: loading.py プロジェクト: Zhao-hangtian/scikit-network

def load_wikilinks_dataset(dataset_name: str, data_home: Optional[str] = None,
                           max_depth: int = 1, full_path: bool = True):
    """
    Loads a dataset from the `WikiLinks database
    <https://graphs.telecom-paristech.fr/Home_page.html#wikilinks-section>`_.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset (all lowercase). Currently, 'wikivitals' and 'wikihumans' are available.
    data_home: str
        The folder to be used for dataset storage
    max_depth: int
        Denotes the maximum depth to use for the categories (if relevant)
    full_path: bool
        Denotes if only the deepest label possible should be returned or if all super categories should
        be considered (if relevant)

    Returns
    -------
    data: :class:`Bunch`
        An object with some of the following attributes (depending on the dataset):

         * `adjacency`: the adjacency matrix of the graph in CSR format
         * `biadjacency`: the biadjacency matrix of the graph in CSR format
         * `feature_names`: the array of the names for the features
         * `names`: the titles of the articles
         * `target_names`: the categories of the articles as specified with `max_depth` and `full_path`
         * `target`: the index for `target_names`

    Raises
    ------
    ValueError
        If the dataset cannot be downloaded.
    """
    # local import: only needed to clean up after a failed download
    from os import rmdir

    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        archive = data_home + '/' + dataset_name + '_npz.tar.gz'
        try:
            urlretrieve("https://graphs.telecom-paristech.fr/npz_datasets/" + dataset_name + '_npz.tar.gz',
                        archive)
        except HTTPError:
            # Remove the directory created above: otherwise the next call with the same name
            # would find it, skip the download and silently return an empty Bunch.
            rmdir(data_home + '/' + dataset_name)
            raise ValueError('Invalid dataset ' + dataset_name)
        with tarfile.open(archive, 'r:gz') as tar_ref:
            # NOTE(review): extractall trusts the member paths of a downloaded archive
            tar_ref.extractall(data_home)
        remove(archive)

    data = Bunch()
    files = listdir(data_path)

    if 'adjacency.npz' in files:
        data.adjacency = sparse.load_npz(data_path + '/adjacency.npz')
    if 'biadjacency.npz' in files:
        data.biadjacency = sparse.load_npz(data_path + '/biadjacency.npz')
    if 'names.npy' in files:
        data.names = np.load(data_path + '/names.npy')
    if 'feature_names.npy' in files:
        data.feature_names = np.load(data_path + '/feature_names.npy')
    if 'target_names.npy' in files:
        raw_target_names = np.load(data_path + '/target_names.npy')
        tags = []
        for tag in raw_target_names:
            # categories are dot-separated paths; keep at most max_depth levels
            # (slicing already clamps to the number of available levels)
            levels = tag.strip().split('.')[:max_depth]
            if full_path:
                tags.append(".".join(levels))
            else:
                tags.append(levels[-1])
        data.target_names = np.array(tags)
        _, data.target = np.unique(data.target_names, return_inverse=True)

    return data

コード例 #21

0

ファイルを表示

def load_edge_list(file: str, directed: bool = False, bipartite: bool = False, weighted: Optional[bool] = None,
                   named: Optional[bool] = None, comment: str = '%#', delimiter: str = None, reindex: bool = True,
                   fast_format: bool = True) -> Bunch:
    """Parser for Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets in the form of
    edge lists.

    Parameters
    ----------
    file : str
        The path to the dataset in TSV format
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : Optional[bool]
        Retrieves the weights in the third field of the file. None makes a guess based on the first lines.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        delimiter used in the file. None makes a guess
    reindex : bool
        If ``True``, numeric node ids that do not span ``0, ..., n-1`` are renumbered and the size of the
        adjacency is the number of distinct nodes. If ``False``, the size is determined by the maximum node id.
        Does not work for bipartite graphs.
    fast_format : bool
        If True, assumes that the file is well-formatted:

        * no comments except for the header
        * only 2 or 3 columns
        * only int or float values

    Returns
    -------
    graph: :class:`Bunch`
        Holds ``adjacency`` (or ``biadjacency``) and, when nodes are named or renumbered, the ``names`` arrays.

    Raises
    ------
    ValueError
        If ``fast_format`` is ``True`` but the file cannot be parsed by the fast parser.
    """
    reindexed = False
    (header_len, guess_delimiter, guess_weighted, guess_named,
     guess_string_present, guess_type) = scan_header(file, comment)

    if weighted is None:
        weighted = guess_weighted
    if named is None:
        named = guess_named
    if delimiter is None:
        delimiter = guess_delimiter

    with open(file, 'r', encoding='utf-8') as f:
        for _ in range(header_len):
            f.readline()
        if fast_format and not guess_string_present:
            # fromfile raises a DeprecationWarning on fail. This should be changed to ValueError in the future.
            # The warning is turned into an error only within this block, so that the warning
            # filters of the process are left untouched, including when parsing fails.
            # NOTE(review): the fast parser uses the guessed delimiter, not the ``delimiter`` argument.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("error")
                    parsed = np.fromfile(f, sep=guess_delimiter, dtype=guess_type)
            except (DeprecationWarning, ValueError) as error:
                raise ValueError('File not suitable for fast parsing. Set fast_format to False.') from error
            n_entries = len(parsed)
            # the flat array of values is reshaped into one row per edge
            if weighted:
                parsed.resize((n_entries//3, 3))
                row, col, data = parsed[:, 0], parsed[:, 1], parsed[:, 2]
            else:
                parsed.resize((n_entries//2, 2))
                row, col = parsed[:, 0], parsed[:, 1]
                data = np.ones(row.shape[0], dtype=bool)
        else:
            row, col, data = [], [], []
            for line in reader(f, delimiter=delimiter):
                # csv yields an empty list for a blank line; skip it along with comment rows
                if not line or line[0] in comment:
                    continue
                if named:
                    row.append(line[0])
                    col.append(line[1])
                else:
                    row.append(int(line[0]))
                    col.append(int(line[1]))
                if weighted:
                    data.append(float(line[2]))
    n_edges = len(row)

    graph = Bunch()
    if bipartite:
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = names_row.max() + 1
            n_col = names_col.max() + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)), shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # sources and targets share the same node set
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = names.max() + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # names is sorted and unique, so checking both ends is enough to detect 0, ..., n-1
            should_reindex = not (names[0] == 0 and names[-1] == n_nodes - 1)
            if should_reindex and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        adjacency = sparse.csr_matrix((data, (row, col)), shape=(n_nodes, n_nodes))
        if not directed:
            adjacency = directed2undirected(adjacency, weighted=weighted)
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph

コード例 #22

0

ファイルを表示

def karate_club(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Build Zachary's karate club graph.

    * Undirected graph
    * 34 nodes, 78 edges
    * 2 labels

    Parameters
    ----------
    metadata :
        When ``True``, the result is a `Bunch` holding the adjacency matrix
        together with the labels, the node positions and the dataset name.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Boolean adjacency matrix alone, or graph with metadata (labels, positions).

    Example
    -------
    >>> from sknetwork.data import karate_club
    >>> adjacency = karate_club()
    >>> adjacency.shape
    (34, 34)

    References
    ----------
    Zachary's karate club graph
    https://en.wikipedia.org/wiki/Zachary%27s_karate_club
    """
    n_nodes = 34
    # Every edge appears once below, as a (source, target) pair of node indices.
    sources = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
         3, 4, 4, 5, 5, 5, 6, 8, 8, 8, 9, 13, 14, 14, 15, 15, 18,
         18, 19, 20, 20, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26,
         27, 28, 28, 29, 29, 30, 30, 31, 31, 32])
    targets = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2,
         3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12,
         13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32, 33, 32,
         33, 33, 32, 33, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 31, 29, 33,
         33, 31, 33, 32, 33, 32, 33, 32, 33, 33])
    edge_flags = np.ones(len(sources), dtype=bool)
    one_way = sparse.csr_matrix((edge_flags, (sources, targets)), shape=(n_nodes, n_nodes))
    # Adding the transpose stores each edge in both directions.
    adjacency = sparse.csr_matrix(one_way + one_way.T, dtype=bool)

    if not metadata:
        return adjacency

    labels = np.array(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    x = np.array(
        [0.04,  0.24,  0.01,  0.13,  0.02, -0.08,  0.04,  0.21,  0.08, -0.11, -0.13, -0.28,  0.2,  0.08,
         0.23,  0.06, -0.06,  0.32, 0.15,  0.19,  0.27,  0.39, -0.04, -0.26, -0.51, -0.49, -0.19, -0.28,
         -0.11, -0.17,  0.22, -0.21,  0.03, 0])
    y = np.array(
        [-0.33, -0.15, -0.01, -0.28, -0.64, -0.75, -0.76, -0.25,  0.09, 0.23, -0.62, -0.4, -0.53, -0.07,
         0.55,  0.64, -1., -0.42, 0.6, -0.01,  0.45, -0.34,  0.61,  0.41,  0.14,  0.28,  0.68, 0.21,
         0.12,  0.54,  0.19,  0.09,  0.38,  0.33])
    # One (x, y) row per node.
    coordinates = np.vstack((x, y)).T

    graph = Bunch()
    graph.adjacency = adjacency
    graph.labels = labels
    graph.position = coordinates
    graph.name = 'karate_club'
    return graph

コード例 #23

0

ファイルを表示

def miserables(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Build the co-occurrence graph of the characters in the novel Les miserables by Victor Hugo.

    * Undirected graph
    * 77 nodes, 508 edges
    * Names of characters

    Parameters
    ----------
    metadata :
        When ``True``, the result is a `Bunch` holding the adjacency matrix
        together with the character names, the node positions and the dataset name.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Weighted adjacency matrix alone, or graph with metadata (names, positions).

    Example
    -------
    >>> from sknetwork.data import miserables
    >>> adjacency = miserables()
    >>> adjacency.shape
    (77, 77)
    """
    n_nodes = 77
    # Three parallel arrays: source index, target index and weight of each listed pair.
    sources = np.array(
        [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  3, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
         16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19,
         20, 20, 20, 21, 21, 22, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
         25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
         27, 27, 27, 27, 27, 28, 28, 29, 29, 29, 29, 29, 30, 34, 34, 34, 34, 35, 35, 35, 36, 36, 37, 39,
         39, 41, 41, 41, 41, 41, 41, 41, 41, 41, 46, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
         48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 51, 51, 51, 51, 54, 55, 55, 55, 55, 55, 55, 55, 55,
         55, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59,
         59, 59, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64,
         64, 65, 65, 66, 68, 68, 68, 68, 69, 69, 69, 70, 70, 71, 73])
    targets = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 3, 11, 11, 11, 12, 13, 14,
         15, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 43,
         44, 48, 49, 51, 55, 58, 64, 68, 69, 70, 71, 72, 23, 17, 18, 19, 20,
         21, 22, 23, 26, 55, 18, 19, 20, 21, 22, 23, 19, 20, 21, 22, 23, 20,
         21, 22, 23, 21, 22, 23, 22, 23, 23, 24, 25, 27, 29, 30, 31, 25, 26,
         27, 41, 42, 50, 68, 69, 70, 26, 27, 39, 40, 41, 42, 48, 55, 68, 69,
         70, 71, 75, 27, 43, 49, 51, 54, 55, 72, 28, 29, 31, 33, 43, 48, 58,
         68, 69, 70, 71, 72, 44, 45, 34, 35, 36, 37, 38, 31, 35, 36, 37, 38,
         36, 37, 38, 37, 38, 38, 52, 55, 42, 55, 57, 62, 68, 69, 70, 71, 75,
         47, 48, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 71, 73,
         74, 75, 76, 50, 51, 54, 55, 56, 52, 53, 54, 55, 55, 56, 57, 58, 59,
         61, 62, 63, 64, 65, 58, 59, 61, 62, 63, 64, 65, 67, 59, 60, 61, 62,
         63, 64, 65, 66, 70, 76, 60, 61, 62, 63, 64, 65, 66, 61, 62, 63, 64,
         65, 66, 62, 63, 64, 65, 66, 63, 64, 65, 66, 76, 64, 65, 66, 76, 65,
         66, 76, 66, 76, 76, 69, 70, 71, 75, 70, 71, 75, 71, 75, 75, 74])
    weights = np.array(
        [1, 8, 10, 1, 1, 1, 1, 2, 1, 5, 6, 3, 3, 1, 1, 1, 1,
         1, 9, 7, 12, 31, 17, 8, 2, 3, 1, 2, 3, 3, 2, 2, 2, 3,
         1, 1, 2, 2, 19, 4, 1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 3,
         3, 3, 3, 1, 1, 4, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4,
         3, 3, 3, 5, 4, 4, 4, 4, 4, 2, 1, 5, 1, 1, 2, 13, 4,
         1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 3, 2, 1, 2, 5, 6,
         4, 1, 3, 1, 1, 3, 2, 1, 21, 2, 1, 1, 1, 1, 1, 1, 6,
         1, 2, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 2, 3, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1,
         1, 2, 4, 1, 7, 6, 1, 2, 7, 5, 5, 3, 1, 1, 1, 1, 2,
         2, 1, 1, 1, 9, 1, 12, 1, 1, 1, 2, 6, 1, 1, 1, 7, 5,
         1, 9, 1, 5, 2, 1, 2, 1, 2, 2, 1, 1, 3, 15, 4, 6, 17,
         4, 10, 5, 3, 1, 1, 2, 5, 13, 5, 9, 5, 1, 2, 3, 2, 2,
         2, 1, 6, 3, 6, 5, 1, 6, 12, 5, 2, 1, 4, 5, 1, 1, 7,
         3, 1, 2, 1, 1, 6, 4, 2, 3, 4, 2, 3, 2, 1, 1, 3])
    one_way = sparse.csr_matrix((weights, (sources, targets)), shape=(n_nodes, n_nodes))
    # Adding the transpose stores each weight in both directions.
    adjacency = one_way + one_way.T

    if not metadata:
        return adjacency

    # Character names, in node-index order.
    names = ['Myriel', 'Napoleon', 'Mlle Baptistine', 'Mme Magloire', 'Countess de Lo', 'Geborand',
             'Champtercier', 'Cravatte', 'Count', 'Old man', 'Labarre', 'Valjean', 'Marguerite', 'Mme Der',
             'Isabeau', 'Gervais', 'Tholomyes', 'Listolier', 'Fameuil', 'Blacheville', 'Favourite', 'Dahlia',
             'Zephine', 'Fantine', 'Mme Thenardier', 'Thenardier', 'Cosette', 'Javert', 'Fauchelevent',
             'Bamatabois', 'Perpetue', 'Simplice', 'Scaufflaire', 'Woman1', 'Judge', 'Champmathieu', 'Brevet',
             'Chenildieu', 'Cochepaille', 'Pontmercy', 'Boulatruelle', 'Eponine', 'Anzelma', 'Woman2',
             'Mother Innocent', 'Gribier', 'Jondrette', 'Mme Burgon', 'Gavroche', 'Gillenormand', 'Magnon',
             'Mlle Gillenormand', 'Mme Pontmercy', 'Mlle Vaubois', 'Lt Gillenormand', 'Marius', 'Baroness',
             'Mabeuf', 'Enjolras', 'Combeferre', 'Prouvaire', 'Feuilly', 'Courfeyrac', 'Bahorel', 'Bossuet',
             'Joly', 'Grantaire', 'MotherPlutarch', 'Gueulemer', 'Babet', 'Claquesous', 'Montparnasse',
             'Toussaint', 'Child1', 'Child2', 'Brujon', 'Mme Hucheloup']
    x = np.array(
        [0.53,  0.98,  0.41,  0.4,  1.,  0.92,  0.84,  0.74,  0.78, 1.,  0.51,  0.09, -0.,  0.29,  0.37,
         0.41, -0.35, -0.46, -0.42, -0.46, -0.41, -0.37, -0.36, -0.2, -0.06, -0.04, -0.01, -0.02,  0.33,
         0.17, -0.29, -0.1,  0.58,  0.29,  0.29,  0.26, 0.29,  0.37,  0.35,  0.04, -0.01, -0.18, -0.09,
         0.2,  0.51, 0.7, -0.95, -0.7, -0.37, -0.08, -0.18, -0.05,  0.04, -0.12, -0.06, -0.13, -0.24, -0.48,
         -0.25, -0.33, -0.43, -0.39, -0.33, -0.42, -0.31, -0.38, -0.48, -0.74, -0.08, -0.1, -0.02, -0.1,
         0.14, -0.76, -0.75, -0.18, -0.58])
    y = np.array(
        [-0.23, -0.42, -0.14, -0.18, -0.31, -0.52, -0.6, -0.65, -0.38, -0.19,  0.39,  0.03,  0.44, -0.44,
         0.51, -0.36,  0.27,  0.37, 0.4,  0.32,  0.32,  0.36,  0.4,  0.2,  0.07,  0.14, -0.05, 0.06,  0.06,
         0.24, -0.26, -0.1,  0.24, -0.04,  0.17,  0.23, 0.31,  0.21,  0.27, -0.36,  0.69,  0.11,  0.38, -0.09,
         0.05, 0.12,  0.82,  0.44,  0.06, -0.2, -0.4, -0.28, -0.68, -0.79, -0.4, -0.07, -0.51, -0.17, -0.03,
         -0.09, -0.14, -0.04, -0.04, -0.07, -0.06, -0.11, -0.06, -0.35,  0.24,  0.19,  0.22,  0.29, -0.2,
         0.06,  0.14,  0.3, -0.1])
    # One (x, y) row per node.
    coordinates = np.vstack((x, y)).T

    graph = Bunch()
    graph.adjacency = adjacency
    graph.names = np.array(names)
    graph.position = coordinates
    graph.name = 'miserables'
    return graph