Example #1
def star(n_branches: int = 3,
         metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Star (undirected).

    Parameters
    ----------
    n_branches : int
        Number of branches.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata (positions).

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import star
    >>> adjacency = star()
    >>> adjacency.shape
    (4, 4)
    """
    edges = [(0, i + 1) for i in range(n_branches)]
    adjacency = edgelist2adjacency(edges, undirected=True)
    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        angles = 2 * np.pi * np.arange(n_branches) / n_branches
        graph.position = np.vstack([np.cos(angles), np.sin(angles)]).T
        return graph
    else:
        return adjacency
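
A quick usage sketch, assuming the same module-level imports as above (np, sparse, Bunch): with ``metadata=True``, the Bunch carries one (x, y) position per branch node, as built in the code above.

graph = star(5, metadata=True)
print(graph.adjacency.shape)  # (6, 6): the center node plus 5 branch nodes
print(graph.position.shape)   # (5, 2): one position per branch, per the code above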
Example #2
def save(folder: str, data: Union[sparse.csr_matrix, Bunch]):
    """Save a Bunch or a CSR matrix in the current directory to a collection of Numpy and Pickle files for faster
    subsequent loads.

    Parameters
    ----------
    folder : str
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    folder = expanduser(folder)
    if exists(folder):
        shutil.rmtree(folder)
    if isinstance(data, sparse.csr_matrix):
        bunch = Bunch()
        if is_square(data):
            bunch.adjacency = data
        else:
            bunch.biadjacency = data
        data = bunch
    if isabs(folder):
        save_to_numpy_bundle(data, folder, '')
    else:
        save_to_numpy_bundle(data, folder, './')
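
A hedged sketch of the CSR branch above: a bare matrix is wrapped in a Bunch before saving, stored as ``adjacency`` if square and as ``biadjacency`` otherwise (the folder name is illustrative).

biadjacency = sparse.csr_matrix(np.random.random((4, 3)) < 0.5)
save('random_bipartite', biadjacency)  # stored under the key 'biadjacency', since 4 x 3 is not square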
Example #3
def cyclic_digraph(n: int = 3,
                   metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Cyclic graph (directed).

    Parameters
    ----------
    n : int
        Number of nodes.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import cyclic_digraph
    >>> adjacency = cyclic_digraph(5)
    >>> adjacency.shape
    (5, 5)
    """
    row = np.arange(n)
    col = np.array(list(np.arange(1, n)) + [0])
    adjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)),
                                  shape=(n, n))

    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = cyclic_position(n)
        return graph
    else:
        return adjacency
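
For a small ``n``, the construction above is easy to check by eye: each node points to its successor, and the last node closes the cycle.

print(cyclic_digraph(3).toarray())
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]]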
Example #4
def hourglass(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Hourglass graph.

    * Bipartite graph
    * 4 nodes, 4 edges

    Parameters
    ----------
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    biadjacency or graph : Union[sparse.csr_matrix, Bunch]
        Biadjacency matrix or graph.

    Example
    -------
    >>> from sknetwork.data import hourglass
    >>> biadjacency = hourglass()
    >>> biadjacency.shape
    (2, 2)
    """
    biadjacency = sparse.csr_matrix(np.ones((2, 2), dtype=bool))
    if metadata:
        graph = Bunch()
        graph.biadjacency = biadjacency
        return graph
    else:
        return biadjacency
Example #5
def watts_strogatz(n: int = 100, degree: int = 6, prob: float = 0.05, metadata: bool = False) \
    -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Parameters
    ----------
    n : int
        Number of nodes.
    degree : int
        Initial degree of nodes.
    prob : float
        Probability of edge modification.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    edges = np.array([(i, (i + j + 1) % n) for i in range(n)
                      for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)),
                                  shape=(n, n))
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    set_reference = set(np.arange(n))
    for i in range(n):
        candidates = list(set_reference - set(adjacency.rows[i]) - {i})
        for j in adjacency.rows[i]:
            if np.random.random() < prob:
                node = np.random.choice(candidates)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency)
    if metadata:
        t = 2 * np.pi * np.arange(n).astype(float) / n
        x = np.cos(t)
        y = np.sin(t)
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = np.array((x, y)).T
        return graph
    else:
        return adjacency
Example #6
def load_adjacency_list(
    file: str,
    bipartite: bool = False,
    comment: str = '%#',
    delimiter: str = None,
) -> Bunch:
    """Parse Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets in the form of
    adjacency lists.

    Parameters
    ----------
    file : str
        The path to the dataset in TSV format
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        Delimiter used in the file. ``None`` makes a guess.

    Returns
    -------
    graph: :class:`Bunch`
    """
    header_len, guess_delimiter, _, _, _, _ = scan_header(file, comment)
    if delimiter is None:
        delimiter = guess_delimiter
    indptr, indices = [0], []
    with open(file, 'r', encoding='utf-8') as f:
        for i in range(header_len):
            f.readline()
        for row in f:
            neighbors = [int(el) for el in row.split(delimiter)]
            indices += neighbors
            indptr.append(indptr[-1] + len(neighbors))
    indices = np.array(indices)
    n_rows = len(indptr) - 1
    min_index = indices.min()
    n_cols = indices.max() + 1 - min_index
    indices -= min_index
    graph = Bunch()
    if not bipartite:
        max_dim = max(n_rows, n_cols)
        new_indptr = np.full(max_dim + 1, indptr[-1])
        new_indptr[:len(indptr)] = indptr
        graph.adjacency = sparse.csr_matrix(
            (np.ones_like(indices, dtype=bool), indices, new_indptr),
            shape=(max_dim, max_dim))
    else:
        indptr = np.array(indptr)
        graph.biadjacency = sparse.csr_matrix(
            (np.ones_like(indices, dtype=bool), indices, indptr),
            shape=(n_rows, n_cols))
    return graph
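
A hedged round-trip sketch (the file name is illustrative, and the space delimiter is assumed to be guessed by ``scan_header``): each line of the file lists the neighbors of one node, so a triangle takes three lines.

with open('triangle.adj', 'w', encoding='utf-8') as f:
    f.write('1 2\n')  # neighbors of node 0
    f.write('0 2\n')  # neighbors of node 1
    f.write('0 1\n')  # neighbors of node 2
graph = load_adjacency_list('triangle.adj')
print(graph.adjacency.shape)  # (3, 3)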
Example #7
def load_netset(dataset: Optional[str] = None,
                data_home: Optional[Union[str, Path]] = None) -> Bunch:
    """Load a dataset from the `NetSet database
    <https://netset.telecom-paris.fr/>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset (all lowercase). Examples include 'openflights', 'cinema' and 'wikivitals'.
    data_home : str or :class:`pathlib.Path`
        The folder to be used for dataset storage.

    Returns
    -------
    graph : :class:`Bunch`
    """
    graph = Bunch()
    npz_folder = NETSET_URL + '/datasets_npz/'

    if dataset is None:
        print(
            "Please specify the dataset (e.g., 'openflights' or 'wikivitals').\n"
            + f"Complete list available here: <{npz_folder}>")
        return graph
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve(npz_folder + dataset + '_npz.tar.gz',
                        data_home / (dataset + '_npz.tar.gz'))
        except HTTPError:
            rmdir(data_path)
            raise ValueError(
                'Invalid dataset: ' + dataset + '.' +
                "\nAvailable datasets include 'openflights' and 'wikivitals'."
                + f"\nSee <{NETSET_URL}>")
        except ConnectionResetError:  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Netset.")
        with tarfile.open(data_home / (dataset + '_npz.tar.gz'),
                          'r:gz') as tar_ref:
            tar_ref.extractall(data_home)
        remove(data_home / (dataset + '_npz.tar.gz'))

    files = [file for file in listdir(data_path)]

    for file in files:
        file_components = file.split('.')
        if len(file_components) == 2:
            file_name, file_extension = tuple(file_components)
            if file_extension == 'npz':
                graph[file_name] = sparse.load_npz(data_path / file)
            elif file_extension == 'npy':
                graph[file_name] = np.load(data_path / file)
            elif file_extension == 'p':
                with open(data_path / file, 'rb') as f:
                    graph[file_name] = pickle.load(f)

    return graph
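
Usage sketch (requires network access on the first call; later calls read from the local cache):

graph = load_netset('openflights')
print(graph.adjacency.shape)  # (3097, 3097)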
Example #8
def load_from_numpy_bundle(bundle_name: str, data_home: Optional[Union[str, Path]] = None):
    """Load a Bunch from a collection of Numpy and Pickle files (inverse function of ``save_to_numpy_bundle``).

    Parameters
    ----------
    bundle_name: str
        The name used for the bundle folder
    data_home: str or :class:`pathlib.Path`
        The folder used for dataset storage

    Returns
    -------
    data: Bunch
        The original data
    """
    data_home = get_data_home(data_home)
    data_path = data_home / bundle_name
    if not data_path.exists():
        raise FileNotFoundError('No bundle at ' + str(data_path))
    else:
        files = listdir(data_path)
        data = Bunch()
        for file in files:
            if len(file.split('.')) == 2:
                file_name, file_extension = file.split('.')
                if file_extension == 'npz':
                    data[file_name] = sparse.load_npz(data_path / file)
                elif file_extension == 'npy':
                    data[file_name] = np.load(data_path / file)
                elif file_extension == 'p':
                    with open(data_path / file, 'rb') as f:
                        data[file_name] = pickle.load(f)
        return data
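
A minimal sketch of the error path, assuming no bundle of that name exists in the default data folder:

try:
    load_from_numpy_bundle('no_such_bundle')
except FileNotFoundError as error:
    print(error)  # No bundle at <data_home>/no_such_bundle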
Example #9
def load_from_numpy_bundle(bundle_name: str, data_home: Optional[str] = None):
    """
    Loads a Bunch from a collection of Numpy and Pickle files (inverse function of ``save_to_numpy_bundle``).

    Parameters
    ----------
    bundle_name: str
        The name used for the bundle folder
    data_home: str
        The folder used for dataset storage

    Returns
    -------
    data: Bunch
        The original data
    """
    data_path = data_home + bundle_name
    if not exists(data_path):
        raise FileNotFoundError('No bundle at ' + data_path)
    else:
        files = listdir(data_path)
        data = Bunch()
        for file in files:
            file_name, file_extension = file.split('.')
            if file_extension == 'npz':
                data[file_name] = sparse.load_npz(data_path + '/' + file)
            elif file_extension == 'npy':
                data[file_name] = np.load(data_path + '/' + file)
            elif file_extension == 'p':
                with open(data_path + '/' + file, 'rb') as f:
                    data[file_name] = pickle.load(f)
        return data
Example #10
def parse_metadata(file: str, delimiter: str = ': ') -> 'Bunch':
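    """Extract metadata from the file (one 'key: value' pair per line)."""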
    metadata = Bunch()
    with open(file, 'r', encoding='utf-8') as f:
        for row in f:
            parts = row.split(delimiter)
            key, value = parts[0], ': '.join(parts[1:]).strip('\n')
            metadata[key] = value
    return metadata
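
A short sketch of the expected file format (the file name is illustrative). Note that extra ``': '`` separators inside a value survive, thanks to the ``join`` above.

with open('meta.txt', 'w', encoding='utf-8') as f:
    f.write('name: my_graph\n')
    f.write('description: toy: example\n')
metadata = parse_metadata('meta.txt')
print(metadata.name)         # my_graph
print(metadata.description)  # toy: example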
Example #11
def load_metadata(file: str, delimiter: str = ': ') -> Bunch:
    """Extract metadata from the file."""
    metadata = Bunch()
    with open(file, 'r', encoding='utf-8') as f:
        for row in f:
            parts = row.split(delimiter)
            key, value = parts[0], ': '.join(parts[1:]).strip('\n')
            metadata[key] = value
    return metadata
Example #12
def painters(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Graph of links between some famous painters on Wikipedia.

    * Directed graph
    * 14 nodes, 50 edges
    * Names of painters

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (names, positions).

    Example
    -------
    >>> from sknetwork.data import painters
    >>> adjacency = painters()
    >>> adjacency.shape
    (14, 14)
    """
    row = np.array([
        0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7,
        7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,
        12, 12, 12, 12, 13, 13
    ])
    col = np.array([
        3, 10, 3, 12, 9, 0, 1, 7, 11, 12, 2, 5, 9, 2, 4, 8, 9, 0, 13, 1, 2, 3,
        8, 11, 12, 0, 1, 4, 5, 7, 10, 11, 2, 4, 0, 3, 8, 11, 12, 0, 1, 3, 10,
        12, 1, 3, 4, 7, 6, 8
    ])
    adjacency = sparse.csr_matrix((np.ones(len(row), dtype=bool), (row, col)),
                                  shape=(14, 14))

    if metadata:
        names = np.array([
            'Pablo Picasso', 'Claude Monet', 'Michel Angelo', 'Edouard Manet',
            'Peter Paul Rubens', 'Rembrandt', 'Gustav Klimt', 'Edgar Degas',
            'Vincent van Gogh', 'Leonardo da Vinci', 'Henri Matisse',
            'Paul Cezanne', 'Pierre-Auguste Renoir', 'Egon Schiele'
        ])
        x = np.array([
            0.24, -0.47, -0.3, -0.31, -0.08, 0.12, 0.78, -0.36, 0.11, -0.06,
            -0.02, -0.12, -0.24, 0.73
        ])
        y = np.array([
            0.53, 0.19, -0.71, 0.44, -0.48, -0.65, 0.69, -0.11, 0.01, -1.,
            0.49, 0.28, 0.06, 0.27
        ])
        graph = Bunch()
        graph.adjacency = adjacency
        graph.names = names
        graph.position = np.stack((x, y)).T
        graph.name = 'painters'
        return graph
    else:
        return adjacency
Example #13
def house(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """House graph.

    * Undirected graph
    * 5 nodes, 6 edges

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> adjacency.shape
    (5, 5)

    """
    row = np.array([0, 0, 1, 1, 2, 3])
    col = np.array([1, 4, 2, 4, 3, 4])
    adjacency = sparse.csr_matrix((np.ones(len(row), dtype=int), (row, col)),
                                  shape=(5, 5))
    adjacency = (adjacency + adjacency.T).astype(bool)

    if metadata:
        x = np.array([0, -1, -1, 1, 1])
        y = np.array([2, 1, -1, -1, 1])
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = np.vstack((x, y)).T
        graph.name = 'house'
        return graph
    else:
        return adjacency
Example #14
def grid(n1: int = 10,
         n2: int = 10,
         metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Grid (undirected).

    Parameters
    ----------
    n1, n2 : int
        Grid dimension.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import grid
    >>> adjacency = grid(10, 5)
    >>> adjacency.shape
    (50, 50)
    """
    nodes = [(i1, i2) for i1 in range(n1) for i2 in range(n2)]
    edges = [((i1, i2), (i1 + 1, i2)) for i1 in range(n1 - 1)
             for i2 in range(n2)]
    edges += [((i1, i2), (i1, i2 + 1)) for i1 in range(n1)
              for i2 in range(n2 - 1)]
    node_id = {u: i for i, u in enumerate(nodes)}
    edges = list(map(lambda edge: (node_id[edge[0]], node_id[edge[1]]), edges))
    adjacency = edgelist2adjacency(edges, undirected=True)
    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = np.array(nodes)
        return graph
    else:
        return adjacency
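
With ``metadata=True``, the positions are simply the (i1, i2) grid coordinates built above.

graph = grid(3, 2, metadata=True)
print(graph.position[:3])
# [[0 0]
#  [0 1]
#  [1 0]]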
Example #15
def save(folder: Union[str, Path], data: Union[sparse.csr_matrix, Bunch]):
    """Save a Bunch or a CSR matrix in the current directory to a collection of Numpy and Pickle files for faster
    subsequent loads. Supported attribute types include sparse matrices, NumPy arrays, strings and Bunch.

    Parameters
    ----------
    folder : str or :class:`pathlib.Path`
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    folder = Path(folder)
    folder = folder.expanduser()
    if folder.exists():
        shutil.rmtree(folder)
    if isinstance(data, sparse.csr_matrix):
        bunch = Bunch()
        if is_square(data):
            bunch.adjacency = data
        else:
            bunch.biadjacency = data
        data = bunch
    if folder.is_absolute():
        save_to_numpy_bundle(data, folder, '/')
    else:
        save_to_numpy_bundle(data, folder, '.')
Example #16
def star_wars(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Bipartite graph connecting some Star Wars villains to the movies in which they appear.

    * Bipartite graph
    * 7 nodes (4 villains, 3 movies), 8 edges
    * Names of villains and movies

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    biadjacency or graph : Union[sparse.csr_matrix, Bunch]
        Biadjacency matrix or graph with metadata (names).

    Example
    -------
    >>> from sknetwork.data import star_wars
    >>> biadjacency = star_wars()
    >>> biadjacency.shape
    (4, 3)
   """
    row = np.array([0, 0, 1, 2, 2, 2, 3, 3])
    col = np.array([0, 2, 0, 0, 1, 2, 1, 2])
    biadjacency = sparse.csr_matrix(
        (np.ones(len(row), dtype=bool), (row, col)), shape=(4, 3))

    if metadata:
        villains = np.array(['Jabba', 'Greedo', 'Vader', 'Boba'])
        movies = np.array(
            ['A New Hope', 'The Empire Strikes Back', 'Return Of The Jedi'])
        graph = Bunch()
        graph.biadjacency = biadjacency
        graph.names = villains
        graph.names_row = villains
        graph.names_col = movies
        graph.name = 'star_wars'
        return graph
    else:
        return biadjacency
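
With metadata, the row names are the villains and the column names the movies.

graph = star_wars(metadata=True)
print(graph.names_row)     # ['Jabba' 'Greedo' 'Vader' 'Boba']
print(graph.names_col[0])  # A New Hope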
Example #17
def movie_actor(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Bipartite graph connecting movies to some actors starring in them.

    * Bipartite graph
    * 31 nodes (15 movies, 16 actors), 42 edges
    * 9 labels (rows)
    * Names of movies (rows) and actors (columns)
    * Names of the movies' production companies (row labels)

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    biadjacency or graph : Union[sparse.csr_matrix, Bunch]
        Biadjacency matrix or graph with metadata (names).

    Example
    -------
    >>> from sknetwork.data import movie_actor
    >>> biadjacency = movie_actor()
    >>> biadjacency.shape
    (15, 16)
    """
    row = np.array(
        [0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6,
         6, 6, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11,
         12, 12, 12, 13, 13, 14, 14])
    col = np.array(
        [0, 1, 2, 1, 2, 3, 3, 4, 5, 8, 4, 6, 0, 6, 4, 7, 4,
         7, 8, 3, 8, 9, 10, 11, 12, 15, 0, 11, 12, 9, 10, 13, 5, 9, 13,
         1, 9, 15, 12, 14, 11, 14])
    biadjacency = sparse.csr_matrix((np.ones(len(row), dtype=bool), (row, col)), shape=(15, 16))

    if metadata:
        movies = np.array(
            ['Inception', 'The Dark Knight Rises', 'The Big Short', 'Drive', 'The Great Gatsby', 'La La Land',
             'Crazy Stupid Love', 'Vice', 'The Grand Budapest Hotel', 'Aviator', '007 Spectre', 'Inglourious Basterds',
             'Midnight In Paris', 'Murder on the Orient Express', 'Fantastic Beasts 2'])
        actors = np.array(
            ['Leonardo DiCaprio', 'Marion Cotillard', 'Joseph Gordon Lewitt', 'Christian Bale', 'Ryan Gosling',
             'Brad Pitt', 'Carey Mulligan', 'Emma Stone', 'Steve Carell', 'Lea Seydoux', 'Ralph Fiennes', 'Jude Law',
             'Willem Dafoe', 'Christophe Waltz', 'Johnny Depp', 'Owen Wilson'])
        graph = Bunch()
        graph.biadjacency = biadjacency
        graph.names = movies
        graph.names_row = movies
        graph.names_col = actors
        graph.labels = np.array([0, 0, 1, 2, 3, 2, 4, 1, 5, 0, 6, 5, 7, 8, 0])
        graph.labels_name = np.array(['Warner Bros', 'Plan B Entertainment', 'Marc Platt Productions', 'Bazmark Films',
                                      'Carousel Productions', 'Babelsberg Studios', 'MGM', 'Gravier Productions',
                                      'Genre Films'])
        graph.labels_row = graph.labels
        graph.labels_row_name = graph.labels_name
        graph.name = 'movie_actor'
        return graph
    else:
        return biadjacency
Example #18
def load_konect(dataset: str, data_home: Optional[Union[str, Path]] = None, auto_numpy_bundle: bool = True,
                verbose: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.cc/networks/>`_.

    Parameters
    ----------
    dataset : str
        The internal name of the dataset as specified on the Konect website (e.g. for the Zachary Karate club dataset,
        the corresponding name is ``'ucidata-zachary'``).
    data_home : str or :class:`pathlib.Path`
        The folder to be used for dataset storage
    auto_numpy_bundle : bool
        If ``True``, store the dataset as Numpy files for faster subsequent access; if ``False``, keep the
        default Konect format.
    verbose : bool
        Enable verbosity.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Notes
    -----
    An attribute `meta` of the `Bunch` class is used to store information about the dataset if present. In any case,
    `meta` has the attribute `name` which, if not given, is equal to the name of the dataset as passed to this function.

    References
    ----------
    Kunegis, J. (2013, May).
    `Konect: the Koblenz network collection.
    <https://dl.acm.org/doi/abs/10.1145/2487788.2488173>`_
    In Proceedings of the 22nd International Conference on World Wide Web (pp. 1343-1350).
    """
    logger = Log(verbose)
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         + "\nExamples include 'actor-movie' and 'ego-facebook'."
                         + "\n See 'http://konect.cc/networks/' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        logger.print('Downloading', dataset, 'from Konect...')
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.cc/files/download.tsv.' + dataset + '.tar.bz2',
                        data_home / (dataset + '.tar.bz2'))
            with tarfile.open(data_home / (dataset + '.tar.bz2'), 'r:bz2') as tar_ref:
                logger.print('Unpacking archive...')
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nExamples include 'actor-movie' and 'ego-facebook'."
                             + "\n See 'http://konect.cc/networks/' for the full list.")
        except (URLError, ConnectionResetError):  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Konect.")
        finally:
            if exists(data_home / (dataset + '.tar.bz2')):
                remove(data_home / (dataset + '.tar.bz2'))
    elif exists(data_path / (dataset + '_bundle')):
        logger.print('Loading from local bundle...')
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]
    logger.print('Parsing files...')
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = load_header(data_path / file)
        if bipartite:
            graph = load_edge_list(data_path / file, directed=directed, bipartite=bipartite, weighted_input=weighted)
            data.biadjacency = graph.biadjacency
        else:
            graph = load_edge_list(data_path / file, directed=directed, bipartite=bipartite, weighted_input=weighted)
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = load_metadata(data_path / file)

    attributes = [file for file in files if 'ent.' + dataset in file]
    if attributes:
        for file in attributes:
            attribute_name = file.split('.')[-1]
            data[attribute_name] = load_labels(data_path / file)

    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
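
Usage sketch (requires network access on the first call), using the dataset named in the docstring above:

graph = load_konect('ucidata-zachary')
print(graph.adjacency.shape)  # (34, 34): Zachary's karate club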
Example #19
def test_key_error(self):
    bunch = Bunch(a=1, b=2)
    with self.assertRaises(AttributeError):
        # noinspection PyStatementEffect
        bunch.c
Example #20
def miserables(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Co-occurrence graph of the characters in the novel Les miserables by Victor Hugo.

    * Undirected graph
    * 77 nodes, 508 edges
    * Names of characters

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (names, positions).

    Example
    -------
    >>> from sknetwork.data import miserables
    >>> adjacency = miserables()
    >>> adjacency.shape
    (77, 77)
    """
    row = np.array(
        [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  3, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
         16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19,
         20, 20, 20, 21, 21, 22, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
         25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
         27, 27, 27, 27, 27, 28, 28, 29, 29, 29, 29, 29, 30, 34, 34, 34, 34, 35, 35, 35, 36, 36, 37, 39,
         39, 41, 41, 41, 41, 41, 41, 41, 41, 41, 46, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
         48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 51, 51, 51, 51, 54, 55, 55, 55, 55, 55, 55, 55, 55,
         55, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59,
         59, 59, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64,
         64, 65, 65, 66, 68, 68, 68, 68, 69, 69, 69, 70, 70, 71, 73])
    col = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 3, 11, 11, 11, 12, 13, 14,
         15, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 43,
         44, 48, 49, 51, 55, 58, 64, 68, 69, 70, 71, 72, 23, 17, 18, 19, 20,
         21, 22, 23, 26, 55, 18, 19, 20, 21, 22, 23, 19, 20, 21, 22, 23, 20,
         21, 22, 23, 21, 22, 23, 22, 23, 23, 24, 25, 27, 29, 30, 31, 25, 26,
         27, 41, 42, 50, 68, 69, 70, 26, 27, 39, 40, 41, 42, 48, 55, 68, 69,
         70, 71, 75, 27, 43, 49, 51, 54, 55, 72, 28, 29, 31, 33, 43, 48, 58,
         68, 69, 70, 71, 72, 44, 45, 34, 35, 36, 37, 38, 31, 35, 36, 37, 38,
         36, 37, 38, 37, 38, 38, 52, 55, 42, 55, 57, 62, 68, 69, 70, 71, 75,
         47, 48, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 71, 73,
         74, 75, 76, 50, 51, 54, 55, 56, 52, 53, 54, 55, 55, 56, 57, 58, 59,
         61, 62, 63, 64, 65, 58, 59, 61, 62, 63, 64, 65, 67, 59, 60, 61, 62,
         63, 64, 65, 66, 70, 76, 60, 61, 62, 63, 64, 65, 66, 61, 62, 63, 64,
         65, 66, 62, 63, 64, 65, 66, 63, 64, 65, 66, 76, 64, 65, 66, 76, 65,
         66, 76, 66, 76, 76, 69, 70, 71, 75, 70, 71, 75, 71, 75, 75, 74])
    data = np.array(
        [1, 8, 10, 1, 1, 1, 1, 2, 1, 5, 6, 3, 3, 1, 1, 1, 1,
         1, 9, 7, 12, 31, 17, 8, 2, 3, 1, 2, 3, 3, 2, 2, 2, 3,
         1, 1, 2, 2, 19, 4, 1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 3,
         3, 3, 3, 1, 1, 4, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4,
         3, 3, 3, 5, 4, 4, 4, 4, 4, 2, 1, 5, 1, 1, 2, 13, 4,
         1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 3, 2, 1, 2, 5, 6,
         4, 1, 3, 1, 1, 3, 2, 1, 21, 2, 1, 1, 1, 1, 1, 1, 6,
         1, 2, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 2, 3, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1,
         1, 2, 4, 1, 7, 6, 1, 2, 7, 5, 5, 3, 1, 1, 1, 1, 2,
         2, 1, 1, 1, 9, 1, 12, 1, 1, 1, 2, 6, 1, 1, 1, 7, 5,
         1, 9, 1, 5, 2, 1, 2, 1, 2, 2, 1, 1, 3, 15, 4, 6, 17,
         4, 10, 5, 3, 1, 1, 2, 5, 13, 5, 9, 5, 1, 2, 3, 2, 2,
         2, 1, 6, 3, 6, 5, 1, 6, 12, 5, 2, 1, 4, 5, 1, 1, 7,
         3, 1, 2, 1, 1, 6, 4, 2, 3, 4, 2, 3, 2, 1, 1, 3])
    adjacency = sparse.csr_matrix((data, (row, col)), shape=(77, 77))
    adjacency = adjacency + adjacency.T

    if metadata:
        names = ['Myriel', 'Napoleon', 'Mlle Baptistine', 'Mme Magloire', 'Countess de Lo', 'Geborand',
                 'Champtercier', 'Cravatte', 'Count', 'Old man', 'Labarre', 'Valjean', 'Marguerite', 'Mme Der',
                 'Isabeau', 'Gervais', 'Tholomyes', 'Listolier', 'Fameuil', 'Blacheville', 'Favourite', 'Dahlia',
                 'Zephine', 'Fantine', 'Mme Thenardier', 'Thenardier', 'Cosette', 'Javert', 'Fauchelevent',
                 'Bamatabois', 'Perpetue', 'Simplice', 'Scaufflaire', 'Woman1', 'Judge', 'Champmathieu', 'Brevet',
                 'Chenildieu', 'Cochepaille', 'Pontmercy', 'Boulatruelle', 'Eponine', 'Anzelma', 'Woman2',
                 'Mother Innocent', 'Gribier', 'Jondrette', 'Mme Burgon', 'Gavroche', 'Gillenormand', 'Magnon',
                 'Mlle Gillenormand', 'Mme Pontmercy', 'Mlle Vaubois', 'Lt Gillenormand', 'Marius', 'Baroness',
                 'Mabeuf', 'Enjolras', 'Combeferre', 'Prouvaire', 'Feuilly', 'Courfeyrac', 'Bahorel', 'Bossuet',
                 'Joly', 'Grantaire', 'MotherPlutarch', 'Gueulemer', 'Babet', 'Claquesous', 'Montparnasse',
                 'Toussaint', 'Child1', 'Child2', 'Brujon', 'Mme Hucheloup']
        x = np.array(
            [0.53,  0.98,  0.41,  0.4,  1.,  0.92,  0.84,  0.74,  0.78, 1.,  0.51,  0.09, -0.,  0.29,  0.37,
             0.41, -0.35, -0.46, -0.42, -0.46, -0.41, -0.37, -0.36, -0.2, -0.06, -0.04, -0.01, -0.02,  0.33,
             0.17, -0.29, -0.1,  0.58,  0.29,  0.29,  0.26, 0.29,  0.37,  0.35,  0.04, -0.01, -0.18, -0.09,
             0.2,  0.51, 0.7, -0.95, -0.7, -0.37, -0.08, -0.18, -0.05,  0.04, -0.12, -0.06, -0.13, -0.24, -0.48,
             -0.25, -0.33, -0.43, -0.39, -0.33, -0.42, -0.31, -0.38, -0.48, -0.74, -0.08, -0.1, -0.02, -0.1,
             0.14, -0.76, -0.75, -0.18, -0.58])
        y = np.array(
            [-0.23, -0.42, -0.14, -0.18, -0.31, -0.52, -0.6, -0.65, -0.38, -0.19,  0.39,  0.03,  0.44, -0.44,
             0.51, -0.36,  0.27,  0.37, 0.4,  0.32,  0.32,  0.36,  0.4,  0.2,  0.07,  0.14, -0.05, 0.06,  0.06,
             0.24, -0.26, -0.1,  0.24, -0.04,  0.17,  0.23, 0.31,  0.21,  0.27, -0.36,  0.69,  0.11,  0.38, -0.09,
             0.05, 0.12,  0.82,  0.44,  0.06, -0.2, -0.4, -0.28, -0.68, -0.79, -0.4, -0.07, -0.51, -0.17, -0.03,
             -0.09, -0.14, -0.04, -0.04, -0.07, -0.06, -0.11, -0.06, -0.35,  0.24,  0.19,  0.22,  0.29, -0.2,
             0.06,  0.14,  0.3, -0.1])
        graph = Bunch()
        graph.adjacency = adjacency
        graph.names = np.array(names)
        graph.position = np.vstack((x, y)).T
        graph.name = 'miserables'
        return graph
    else:
        return adjacency
Example #21
def watts_strogatz(n: int = 100,
                   degree: int = 6,
                   prob: float = 0.05,
                   seed: Optional[int] = None,
                   metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Parameters
    ----------
    n :
        Number of nodes.
    degree :
        Initial degree of nodes.
    prob :
        Probability of edge modification.
    seed :
        Seed of the random generator (optional).
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    np.random.seed(seed)
    edges = np.array([(i, (i + j + 1) % n) for i in range(n)
                      for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)),
                                  shape=(n, n))
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    nodes = np.arange(n)
    for i in range(n):
        neighbors = adjacency.rows[i]
        candidates = list(set(nodes) - set(neighbors) - {i})
        for j in neighbors:
            if np.random.random() < prob:
                node = np.random.choice(candidates)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency, shape=adjacency.shape)
    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = cyclic_position(n)
        return graph
    else:
        return adjacency
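
Since the generator is re-seeded with ``np.random.seed`` at the top of the function, the same seed reproduces the same graph:

adjacency_1 = watts_strogatz(20, 4, 0.1, seed=42)
adjacency_2 = watts_strogatz(20, 4, 0.1, seed=42)
print((adjacency_1 != adjacency_2).nnz)  # 0: identical graphs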
Example #22
def block_model(sizes: Iterable, p_in: Union[float, list, np.ndarray] = .2, p_out: float = .05,
                random_state: Optional[int] = None, metadata: bool = False) \
                -> Union[sparse.csr_matrix, Bunch]:
    """Stochastic block model.

    Parameters
    ----------
    sizes :
         Block sizes.
    p_in :
        Probability of connection within blocks.
    p_out :
        Probability of connection across blocks.
    random_state :
        Seed of the random generator (optional).
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels).

    Example
    -------
    >>> from sknetwork.data import block_model
    >>> sizes = np.array([4, 5])
    >>> adjacency = block_model(sizes)
    >>> adjacency.shape
    (9, 9)

    References
    ----------
    Airoldi, E., Blei, D., Fienberg, S., Xing, E. (2007).
    `Mixed membership stochastic blockmodels. <https://arxiv.org/pdf/0705.4485.pdf>`_
    Journal of Machine Learning Research.
    """
    np.random.seed(random_state)
    sizes = np.array(sizes)

    if isinstance(p_in, (np.floating, float)):
        p_in = p_in * np.ones_like(sizes)
    else:
        p_in = np.array(p_in)

    # each edge is considered twice
    p_in = p_in / 2

    matrix = []
    for i, a in enumerate(sizes):
        row = []
        for j, b in enumerate(sizes):
            if j < i:
                row.append(None)
            elif j > i:
                row.append(sparse.random(a, b, p_out, dtype=bool))
            else:
                row.append(sparse.random(a, a, p_in[i], dtype=bool))
        matrix.append(row)
    adjacency = sparse.bmat(matrix)
    adjacency.setdiag(0)
    adjacency = directed2undirected(adjacency.tocsr(), weighted=False)

    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        labels = np.repeat(np.arange(len(sizes)), sizes)
        graph.labels = labels
        return graph
    else:
        return adjacency
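
With metadata, the labels follow the block sizes via ``np.repeat``:

graph = block_model([4, 5], metadata=True)
print(graph.labels)  # [0 0 0 0 1 1 1 1 1]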
Example #23
def load_konect_dataset(dataset_name: str, data_home: Optional[str] = None, auto_numpy_bundle: bool = True):
    """
    Loads a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home: str
        The folder to be used for dataset storage
    auto_numpy_bundle: bool
        If ``True``, store the dataset as Numpy files for faster subsequent access; if ``False``, keep the
        default Konect format.

    Returns
    -------
    data: :class:`Bunch`
        An object with the following attributes:

         * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
         * `meta`: a dictionary containing the metadata as specified by Konect
         * any attribute described in an ent.* file

    """
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.uni-koblenz.de/downloads/tsv/' + dataset_name + '.tar.bz2',
                        data_home + '/' + dataset_name + '.tar.bz2')
        except HTTPError:
            raise ValueError('Invalid dataset ' + dataset_name)
        with tarfile.open(data_home + '/' + dataset_name + '.tar.bz2', 'r:bz2') as tar_ref:
            tar_ref.extractall(data_home)
        remove(data_home + '/' + dataset_name + '.tar.bz2')
    elif exists(data_path + '/' + dataset_name + '_bundle'):
        return load_from_numpy_bundle(dataset_name + '_bundle', data_path)

    data = Bunch()
    files = [file for file in listdir(data_path) if dataset_name in file]

    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        if bipartite:
            data.biadjacency = parse_tsv(data_path + file, directed=directed, bipartite=bipartite, weighted=weighted)[0]
        else:
            data.adjacency = parse_tsv(data_path + file, directed=directed, bipartite=bipartite, weighted=weighted)[0]

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = parse_metadata(data_path + file)

    attributes = [file for file in files if 'ent.' + dataset_name in file]
    if attributes:
        for file in attributes:
            attribute_name = file.split('.')[-1]
            data[attribute_name] = parse_labels(data_path + file)

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset_name + '_bundle', data_path)

    return data
Example #24
def load_netset(dataset: Optional[str] = None,
                data_home: Optional[str] = None) -> Bunch:
    """Load a dataset from the `NetSets database
    <https://graphs.telecom-paristech.fr/>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset (all lowercase). Examples include 'openflights', 'cinema' and 'wikivitals'.
    data_home : str
        The folder to be used for dataset storage.

    Returns
    -------
    graph : :class:`Bunch`

    Example
    -------
    >>> from sknetwork.data import load_netset
    >>> graph = load_netset('openflights')
    >>> graph.adjacency.shape
    (3097, 3097)
    """
    graph = Bunch()

    if dataset is None:
        print(
            "Please specify the dataset (e.g., 'openflights' or 'wikivitals').\n"
            +
            "Complete list available here: <https://graphs.telecom-paristech.fr/datasets_npz/>"
        )
        return graph
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve(
                "https://graphs.telecom-paristech.fr/datasets_npz/" + dataset +
                '_npz.tar.gz', data_home + '/' + dataset + '_npz.tar.gz')
        except HTTPError:
            rmdir(data_home + '/' + dataset)
            raise ValueError(
                'Invalid dataset: ' + dataset + '.' +
                "\nAvailable datasets include 'openflights' and 'wikivitals'."
                + "\nSee <https://graphs.telecom-paristech.fr/>")
        with tarfile.open(data_home + '/' + dataset + '_npz.tar.gz',
                          'r:gz') as tar_ref:
            tar_ref.extractall(data_home)
        remove(data_home + '/' + dataset + '_npz.tar.gz')

    files = [file for file in listdir(data_path)]

    for file in files:
        file_components = file.split('.')
        if len(file_components) == 2:
            file_name, file_extension = tuple(file_components)
            if file_extension == 'npz':
                graph[file_name] = sparse.load_npz(data_path + '/' + file)
            elif file_extension == 'npy':
                graph[file_name] = np.load(data_path + '/' + file)
            elif file_extension == 'p':
                with open(data_path + '/' + file, 'rb') as f:
                    graph[file_name] = pickle.load(f)

    return graph
Example #25
def load_graphml(file: str,
                 weight_key: str = 'weight',
                 max_string_size: int = 512) -> Bunch:
    """Parse GraphML datasets.

    Hyperedges and nested graphs are not supported.

    Parameters
    ----------
    file: str
        The path to the dataset
    weight_key: str
        The key to be used as a value for edge weights
    max_string_size: int
        The maximum size for string features of the data

    Returns
    -------
    data: :class:`Bunch`
        The dataset in a bunch with the adjacency as a CSR matrix.
    """
    # see http://graphml.graphdrawing.org/primer/graphml-primer.html
    # and http://graphml.graphdrawing.org/specification/dtd.html#top
    tree = ElementTree.parse(file)
    n_nodes = 0
    n_edges = 0
    symmetrize = None
    naming_nodes = True
    default_weight = 1
    weight_type = bool
    weight_id = None
    # indices in the graph tree
    node_indices = []
    edge_indices = []
    data = Bunch()
    graph = None
    file_description = None
    attribute_descriptions = Bunch()
    attribute_descriptions.node = Bunch()
    attribute_descriptions.edge = Bunch()
    keys = {}
    for file_element in tree.getroot():
        if file_element.tag.endswith('graph'):
            graph = file_element
            symmetrize = (graph.attrib['edgedefault'] == 'undirected')
            for index, element in enumerate(graph):
                if element.tag.endswith('node'):
                    node_indices.append(index)
                    n_nodes += 1
                elif element.tag.endswith('edge'):
                    edge_indices.append(index)
                    if 'directed' in element.attrib:
                        if element.attrib['directed'] == 'true':
                            n_edges += 1
                        else:
                            n_edges += 2
                    elif symmetrize:
                        n_edges += 2
                    else:
                        n_edges += 1
            if 'parse.nodeids' in graph.attrib:
                naming_nodes = not (graph.attrib['parse.nodeids']
                                    == 'canonical')
    for file_element in tree.getroot():
        if file_element.tag.endswith('key'):
            attribute_name = file_element.attrib['attr.name']
            attribute_type = java_type_to_python_type(
                file_element.attrib['attr.type'])
            if attribute_name == weight_key:
                weight_type = java_type_to_python_type(
                    file_element.attrib['attr.type'])
                weight_id = file_element.attrib['id']
                for key_element in file_element:
                    if key_element.tag == 'default':
                        default_weight = attribute_type(key_element.text)
            else:
                default_value = None
                if file_element.attrib['for'] == 'node':
                    size = n_nodes
                    if 'node_attribute' not in data:
                        data.node_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.node[
                                attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.node_attribute[attribute_name] = np.full(
                            size, default_value, dtype=local_type)
                    else:
                        data.node_attribute[attribute_name] = np.zeros(
                            size, dtype=local_type)
                elif file_element.attrib['for'] == 'edge':
                    size = n_edges
                    if 'edge_attribute' not in data:
                        data.edge_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.edge[
                                attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.edge_attribute[attribute_name] = np.full(
                            size, default_value, dtype=local_type)
                    else:
                        data.edge_attribute[attribute_name] = np.zeros(
                            size, dtype=local_type)
                keys[file_element.attrib['id']] = [
                    attribute_name, attribute_type
                ]
        elif file_element.tag.endswith('desc'):
            file_description = file_element.text
    if file_description or attribute_descriptions.node or attribute_descriptions.edge:
        data.meta = Bunch()
        if file_description:
            data.meta['description'] = file_description
        if attribute_descriptions.node or attribute_descriptions.edge:
            data.meta['attributes'] = attribute_descriptions
    if graph is not None:
        row = np.zeros(n_edges, dtype=int)
        col = np.zeros(n_edges, dtype=int)
        dat = np.full(n_edges, default_weight, dtype=weight_type)
        data.names = None
        if naming_nodes:
            data.names = np.zeros(n_nodes, dtype='<U512')

        node_map = {}
        # deal with nodes first
        for number, index in enumerate(node_indices):
            node = graph[index]
            if naming_nodes:
                name = node.attrib['id']
                data.names[number] = name
                node_map[name] = number
            for node_attribute in node:
                if node_attribute.tag.endswith('data'):
                    data.node_attribute[keys[node_attribute.attrib['key']][0]][number] = \
                        keys[node_attribute.attrib['key']][1](node_attribute.text)
        # deal with edges
        edge_index = -1
        for index in edge_indices:
            edge_index += 1
            duplicate = False
            edge = graph[index]
            if naming_nodes:
                node1 = node_map[edge.attrib['source']]
                node2 = node_map[edge.attrib['target']]
            else:
                node1 = int(edge.attrib['source'][1:])
                node2 = int(edge.attrib['target'][1:])
            row[edge_index] = node1
            col[edge_index] = node2
            for edge_attribute in edge:
                if edge_attribute.tag.endswith('data'):
                    if edge_attribute.attrib['key'] == weight_id:
                        dat[edge_index] = weight_type(edge_attribute.text)
                    else:
                        data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                            keys[edge_attribute.attrib['key']][1](edge_attribute.text)
            if 'directed' in edge.attrib:
                if edge.attrib['directed'] != 'true':
                    duplicate = True
            elif symmetrize:
                duplicate = True
            if duplicate:
                edge_index += 1
                row[edge_index] = node2
                col[edge_index] = node1
                for edge_attribute in edge:
                    if edge_attribute.tag.endswith('data'):
                        if edge_attribute.attrib['key'] == weight_id:
                            dat[edge_index] = weight_type(edge_attribute.text)
                        else:
                            data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                                keys[edge_attribute.attrib['key']][1](edge_attribute.text)
        data.adjacency = sparse.csr_matrix((dat, (row, col)),
                                           shape=(n_nodes, n_nodes))
        if data.names is None:
            data.pop('names')
        return data
    else:
        raise ValueError(f'No graph defined in {file}.')
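
A hedged end-to-end sketch on a minimal GraphML file (the file name and content are illustrative, following the GraphML primer linked above). With ``edgedefault="undirected"``, each edge is duplicated, so the adjacency comes out symmetric.

graphml = '''<?xml version="1.0" encoding="UTF-8"?>
<graphml xmlns="http://graphml.graphdrawing.org/xmlns">
  <graph id="G" edgedefault="undirected">
    <node id="n0"/>
    <node id="n1"/>
    <edge source="n0" target="n1"/>
  </graph>
</graphml>'''
with open('tiny.graphml', 'w', encoding='utf-8') as f:
    f.write(graphml)
data = load_graphml('tiny.graphml')
print(data.adjacency.toarray())  # [[False  True]
                                 #  [ True False]]
print(data.names)                # ['n0' 'n1']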
Example #26
def karate_club(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Karate club graph.

    * Undirected graph
    * 34 nodes, 78 edges
    * 2 labels

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels, positions).

    Example
    -------
    >>> from sknetwork.data import karate_club
    >>> adjacency = karate_club()
    >>> adjacency.shape
    (34, 34)

    References
    ----------
    Zachary's karate club graph
    https://en.wikipedia.org/wiki/Zachary%27s_karate_club
    """
    row = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
         3, 4, 4, 5, 5, 5, 6, 8, 8, 8, 9, 13, 14, 14, 15, 15, 18,
         18, 19, 20, 20, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26,
         27, 28, 28, 29, 29, 30, 30, 31, 31, 32])
    col = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2,
         3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12,
         13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32, 33, 32,
         33, 33, 32, 33, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 31, 29, 33,
         33, 31, 33, 32, 33, 32, 33, 32, 33, 33])
    adjacency = sparse.csr_matrix((np.ones(len(row), dtype=bool), (row, col)), shape=(34, 34))
    adjacency = sparse.csr_matrix(adjacency + adjacency.T, dtype=bool)

    if metadata:
        labels = np.array(
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        x = np.array(
            [0.04,  0.24,  0.01,  0.13,  0.02, -0.08,  0.04,  0.21,  0.08, -0.11, -0.13, -0.28,  0.2,  0.08,
             0.23,  0.06, -0.06,  0.32, 0.15,  0.19,  0.27,  0.39, -0.04, -0.26, -0.51, -0.49, -0.19, -0.28,
             -0.11, -0.17,  0.22, -0.21,  0.03, 0])
        y = np.array(
            [-0.33, -0.15, -0.01, -0.28, -0.64, -0.75, -0.76, -0.25,  0.09, 0.23, -0.62, -0.4, -0.53, -0.07,
             0.55,  0.64, -1., -0.42, 0.6, -0.01,  0.45, -0.34,  0.61,  0.41,  0.14,  0.28,  0.68, 0.21,
             0.12,  0.54,  0.19,  0.09,  0.38,  0.33])
        graph = Bunch()
        graph.adjacency = adjacency
        graph.labels = labels
        graph.position = np.vstack((x, y)).T
        graph.name = 'karate_club'
        return graph
    else:
        return adjacency
Example #27
def from_edge_list(row: np.ndarray,
                   col: np.ndarray,
                   data: np.ndarray,
                   directed: bool = False,
                   bipartite: bool = False,
                   reindex: bool = True,
                   named: Optional[bool] = None) -> Bunch:
    """Turn an edge list given as a triplet of NumPy arrays into a :class:`Bunch`.

    Parameters
    ----------
    row : np.ndarray
        The array of sources in the graph.
    col : np.ndarray
        The array of targets in the graph.
    data : np.ndarray
        The array of weights in the graph. Pass an empty array for unweighted graphs.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    reindex : bool
        If ``True`` and the graph nodes have numeric values, renumber them consecutively (the size of the returned
        adjacency is the number of distinct nodes); if ``False``, the size is determined by the maximum node value.
        Does not work for bipartite graphs.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.

    Returns
    -------
    graph: :class:`Bunch`
    """
    reindexed = False
    if named is None:
        named = (row.dtype != int) or (col.dtype != int)
    weighted = bool(len(data))
    n_edges = len(row)
    graph = Bunch()
    if bipartite:
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = names_row.max() + 1
            n_col = names_col.max() + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)),
                                        shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = names.max() + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            should_reindex = not (names[0] == 0 and names[-1] == n_nodes - 1)
            if should_reindex and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        adjacency = sparse.csr_matrix((data, (row, col)),
                                      shape=(n_nodes, n_nodes))
        if not directed:
            adjacency = directed2undirected(adjacency, weighted=weighted)
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph
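
A quick sketch of ``from_edge_list`` on a small named, unweighted graph (the arrays are illustrative):

import numpy as np

row = np.array(['a', 'b', 'c'])
col = np.array(['b', 'c', 'a'])
graph = from_edge_list(row, col, data=np.array([]))  # empty array: unweighted
print(graph.adjacency.shape)  # (3, 3), symmetrized since directed=False
print(graph.names)            # ['a' 'b' 'c']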
Example #28
def parse_tsv(file: str,
              directed: bool = False,
              bipartite: bool = False,
              weighted: Optional[bool] = None,
              named: Optional[bool] = None,
              comment: str = '%#',
              delimiter: Optional[str] = None,
              reindex: bool = True) -> Bunch:
    """Parser for Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets.

    Parameters
    ----------
    file : str
        The path to the dataset file (TSV, CSV or similar).
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : Optional[bool]
        Retrieves the weights in the third field of the file. None makes a guess based on the first lines.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        Delimiter used in the file. ``None`` makes a guess based on the first lines.
    reindex : bool
        If ``True``, nodes are renumbered so that the adjacency has one row per distinct node. If ``False`` and
        the nodes have numeric values, the size of the returned adjacency is determined by the maximum of those
        values. Does not work for bipartite graphs.

    Returns
    -------
    graph: :class:`Bunch`
    """
    reindexed = False
    header_len = -1
    possible_delimiters = ['\t', ',', ' ']
    del_count = np.zeros(3, dtype=int)
    lines = []
    row = comment
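    # Skip comment lines, then sniff the first three data lines to guess the
    # delimiter, whether edges are weighted and whether nodes are named.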
    with open(file, 'r', encoding='utf-8') as f:
        while row[0] in comment:
            row = f.readline()
            header_len += 1
        for _ in range(3):
            for i, poss_del in enumerate(possible_delimiters):
                if poss_del in row:
                    del_count[i] += 1
            lines.append(row.rstrip())
            row = f.readline()
        lines = [line for line in lines if line != '']
        guess_delimiter = possible_delimiters[int(np.argmax(del_count))]
        guess_weighted = bool(
            min([line.count(guess_delimiter) for line in lines]) - 1)
        guess_named = not all([
            all([el.strip().isdigit()
                 for el in line.split(guess_delimiter)][0:2]) for line in lines
        ])
    if weighted is None:
        weighted = guess_weighted
    if named is None:
        named = guess_named
    if delimiter is None:
        delimiter = guess_delimiter

    row, col, data = [], [], []
    with open(file, 'r', encoding='utf-8') as f:
        for i in range(header_len):
            f.readline()
        csv_reader = reader(f, delimiter=delimiter)
        for line in csv_reader:
            if line[0] not in comment:
                if named:
                    row.append(line[0])
                    col.append(line[1])
                else:
                    row.append(int(line[0]))
                    col.append(int(line[1]))
                if weighted:
                    data.append(float(line[2]))
    n_edges = len(row)

    graph = Bunch()
    if bipartite:
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = max(names_row) + 1
            n_col = max(names_col) + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)),
                                        shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = max(names) + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            if not all(names == range(len(names))) and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=int)
        adjacency = sparse.csr_matrix((data, (row, col)),
                                      shape=(n_nodes, n_nodes))
        if not directed:
            adjacency += adjacency.T
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph
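
A sketch of ``parse_tsv`` on a throwaway file (the name ``edges.tsv`` is illustrative); delimiter, weights and names are all guessed from the first lines:

with open('edges.tsv', 'w', encoding='utf-8') as f:
    f.write('0\t1\n1\t2\n2\t0\n')

graph = parse_tsv('edges.tsv')
print(graph.adjacency.shape)  # (3, 3), symmetric since directed=False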
Example #29
def load_konect(dataset: str,
                data_home: Optional[str] = None,
                auto_numpy_bundle: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home : str
        The folder to be used for dataset storage.
    auto_numpy_bundle : bool
        If ``True``, the dataset is also stored as a bundle of Numpy files for faster subsequent access;
        if ``False``, only its original format is kept.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)

    Notes
    -----
    An attribute `meta` of the `Bunch` object stores information about the dataset when available. In any case,
    `meta` has an attribute `name` which, if not provided by Konect, is set to the name of the dataset as passed
    to this function.
    """
    if dataset == '':
        raise ValueError(
            "Please specify the dataset. " +
            "\nExamples include 'actor-movie' and 'ego-facebook'." +
            "\n See 'http://konect.uni-koblenz.de' for the full list.")
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve(
                'http://konect.uni-koblenz.de/downloads/tsv/' + dataset +
                '.tar.bz2', data_home + '/' + dataset + '.tar.bz2')
            with tarfile.open(data_home + '/' + dataset + '.tar.bz2',
                              'r:bz2') as tar_ref:
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_home + '/' + dataset)
            raise ValueError(
                'Invalid dataset ' + dataset + '.' +
                "\nExamples include 'actor-movie' and 'ego-facebook'." +
                "\n See 'http://konect.uni-koblenz.de' for the full list.")
        finally:
            remove(data_home + '/' + dataset + '.tar.bz2')
    elif exists(data_path + dataset + '_bundle'):
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]
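    # A Konect archive ships an edge list ('out.' file), optional metadata
    # ('meta.' file) and optional node attributes ('ent.' files).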

    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        graph = parse_tsv(data_path + file,
                          directed=directed,
                          bipartite=bipartite,
                          weighted=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = parse_metadata(data_path + file)

    attributes = [file for file in files if 'ent.' + dataset in file]
    if attributes:
        for file in attributes:
            attribute_name = file.split('.')[-1]
            data[attribute_name] = parse_labels(data_path + file)

    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
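
A usage sketch: with ``auto_numpy_bundle`` left at ``True``, the second call is served from the cached Numpy bundle instead of re-parsing the TSV files:

graph = load_konect('dolphins')  # first call: download, parse, cache
graph = load_konect('dolphins')  # second call: loads the Numpy bundle
print(graph.meta.name)           # from the Konect metadata, or 'dolphins'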
Example #30
def load_wikilinks_dataset(dataset_name: str, data_home: Optional[str] = None,
                           max_depth: int = 1, full_path: bool = True):
    """
    Loads a dataset from the `WikiLinks database
    <https://graphs.telecom-paristech.fr/Home_page.html#wikilinks-section>`_.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset (all lowercase). Currently, 'wikivitals' and 'wikihumans' are available.
    data_home: str
        The folder to be used for dataset storage.
    max_depth: int
        The maximum depth to use for the category paths (if relevant)
    full_path: bool
        If True, keeps the full category path up to max_depth; if False, keeps only the deepest category
        within that depth (if relevant)

    Returns
    -------
    data: :class:`Bunch`
        An object with some of the following attributes (depending on the dataset):

         * `adjacency`: the adjacency matrix of the graph in CSR format
         * `biadjacency`: the biadjacency matrix of the graph in CSR format
         * `feature_names`: the array of the names for the features
         * `names`: the titles of the articles
         * `target_names`: the categories of the articles as specified with `max_depth` and `full_path`
         * `target`: the index for `target_names`

    """
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve("https://graphs.telecom-paristech.fr/npz_datasets/" + dataset_name + '_npz.tar.gz',
                        data_home + '/' + dataset_name + '_npz.tar.gz')
        except HTTPError:
            raise ValueError('Invalid dataset ' + dataset_name)
        with tarfile.open(data_home + '/' + dataset_name + '_npz.tar.gz', 'r:gz') as tar_ref:
            tar_ref.extractall(data_home)
        remove(data_home + '/' + dataset_name + '_npz.tar.gz')

    data = Bunch()
    files = listdir(data_path)

    if 'adjacency.npz' in files:
        data.adjacency = sparse.load_npz(data_path + '/adjacency.npz')
    if 'biadjacency.npz' in files:
        data.biadjacency = sparse.load_npz(data_path + '/biadjacency.npz')
    if 'names.npy' in files:
        data.names = np.load(data_path + '/names.npy')
    if 'feature_names.npy' in files:
        data.feature_names = np.load(data_path + '/feature_names.npy')
    if 'target_names.npy' in files:
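        # Each raw target name is a dotted category path; truncate it at
        # max_depth and keep either the truncated path or its deepest label.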
        tmp_target_names = np.load(data_path + '/target_names.npy')
        tags = []
        for tag in tmp_target_names:
            parts = tag.strip().split('.')
            if full_path:
                tags.append(".".join(parts[:min(max_depth, len(parts))]))
            else:
                tags.append(parts[:min(max_depth, len(parts))][-1])
        data.target_names = np.array(tags)
        _, data.target = np.unique(data.target_names, return_inverse=True)

    return data
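
A usage sketch (which attributes are present depends on the dataset, hence the guards):

data = load_wikilinks_dataset('wikivitals', max_depth=2)
if hasattr(data, 'adjacency'):
    print(data.adjacency.shape)
if hasattr(data, 'target_names'):
    print(data.target_names[:5])  # category paths truncated at depth 2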