def load_adjacency_list(file: str, bipartite: bool = False, comment: str = '%#',
                        delimiter: str = None) -> Bunch:
    """Load a graph stored as adjacency lists in a delimited text file (TSV, CSV, space-separated or other).

    After the header, each line of the file is read as the list of neighbors of one node,
    the nodes being taken in the order of the lines.

    Parameters
    ----------
    file : str
        Path to the file.
    bipartite : bool
        If ``True``, build a biadjacency matrix of shape (n1, n2) instead of a square adjacency matrix.
    comment : str
        Characters denoting lines to ignore (forwarded to ``scan_header``).
    delimiter : str
        Separator between the entries of a line. If ``None``, the delimiter returned by ``scan_header`` is used.

    Returns
    -------
    graph: :class:`Bunch`
        Bunch with attribute ``adjacency``, or ``biadjacency`` if ``bipartite`` is ``True``.
    """
    header_len, guessed_delimiter, _, _, _, _ = scan_header(file, comment)
    separator = guessed_delimiter if delimiter is None else delimiter

    # CSR structure built incrementally: offsets[k] is where the neighbors of line k start in targets.
    offsets = [0]
    targets = []
    with open(file, 'r', encoding='utf-8') as stream:
        for _skipped in range(header_len):
            stream.readline()
        for line in stream:
            entries = [int(token) for token in line.split(separator)]
            targets.extend(entries)
            offsets.append(offsets[-1] + len(entries))

    targets = np.array(targets)
    n_rows = len(offsets) - 1
    lowest = np.min(targets)
    n_cols = np.max(targets) + 1 - lowest
    # shift the column indices so that the smallest one becomes 0
    targets -= lowest

    graph = Bunch()
    weights = np.ones_like(targets, dtype=int)
    if bipartite:
        matrix = sparse.csr_matrix((weights, targets, np.array(offsets)), shape=(n_rows, n_cols))
        if max(matrix.data) == 1:
            matrix = matrix.astype(bool)
        graph.biadjacency = matrix
    else:
        # square matrix: rows missing from the file are padded as empty rows
        size = max(n_rows, n_cols)
        padded_offsets = np.full(size + 1, offsets[-1])
        padded_offsets[:len(offsets)] = offsets
        matrix = sparse.csr_matrix((weights, targets, padded_offsets), shape=(size, size))
        if max(matrix.data) == 1:
            matrix = matrix.astype(bool)
        graph.adjacency = matrix
    return graph
def star(n_branches: int = 3, metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Star (undirected).

    Node 0 is the center of the star; nodes 1, ..., ``n_branches`` are the leaves.

    Parameters
    ----------
    n_branches : int
        Number of branches.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata (positions).

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).
        The array of positions has one row per node, i.e., ``n_branches + 1`` rows.

    Example
    -------
    >>> from sknetwork.data import star
    >>> adjacency = star()
    >>> adjacency.shape
    (4, 4)
    """
    edges = [(0, i + 1) for i in range(n_branches)]
    adjacency = edgelist2adjacency(edges, undirected=True)
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    angles = 2 * np.pi * np.arange(n_branches) / n_branches
    # One position per node: the center (node 0) sits at the origin and the leaves are spread
    # on the unit circle. Without the center, positions would be misaligned with the node indices.
    x = np.concatenate(([0.], np.cos(angles)))
    y = np.concatenate(([0.], np.sin(angles)))
    graph.position = np.vstack([x, y]).T
    return graph
def painters(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Graph of links between some famous painters on Wikipedia.

    * Directed graph
    * 14 nodes, 50 edges
    * Names of painters

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (names, positions).

    Example
    -------
    >>> from sknetwork.data import painters
    >>> adjacency = painters()
    >>> adjacency.shape
    (14, 14)
    """
    # successors[i] lists the targets of the links going out of painter i
    successors = [
        [3, 10],
        [3, 12],
        [9],
        [0, 1, 7, 11, 12],
        [2, 5, 9],
        [2, 4, 8, 9],
        [0, 13],
        [1, 2, 3, 8, 11, 12],
        [0, 1, 4, 5, 7, 10, 11],
        [2, 4],
        [0, 3, 8, 11, 12],
        [0, 1, 3, 10, 12],
        [1, 3, 4, 7],
        [6, 8],
    ]
    row = np.array([source for source, targets in enumerate(successors) for _ in targets])
    col = np.array([target for targets in successors for target in targets])
    adjacency = sparse.csr_matrix((np.ones(len(row), dtype=bool), (row, col)), shape=(14, 14))
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.names = np.array([
        'Pablo Picasso', 'Claude Monet', 'Michel Angelo', 'Edouard Manet', 'Peter Paul Rubens', 'Rembrandt',
        'Gustav Klimt', 'Edgar Degas', 'Vincent van Gogh', 'Leonardo da Vinci', 'Henri Matisse', 'Paul Cezanne',
        'Pierre-Auguste Renoir', 'Egon Schiele'
    ])
    abscissas = np.array([0.24, -0.47, -0.3, -0.31, -0.08, 0.12, 0.78, -0.36, 0.11, -0.06, -0.02, -0.12, -0.24, 0.73])
    ordinates = np.array([0.53, 0.19, -0.71, 0.44, -0.48, -0.65, 0.69, -0.11, 0.01, -1., 0.49, 0.28, 0.06, 0.27])
    graph.position = np.array([abscissas, ordinates]).T
    graph.name = 'painters'
    return graph
def cyclic_digraph(n: int = 3, metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Cyclic graph (directed).

    Node ``i`` points to node ``i + 1``; the last node points back to node 0.

    Parameters
    ----------
    n : int
        Number of nodes.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import cyclic_digraph
    >>> adjacency = cyclic_digraph(5)
    >>> adjacency.shape
    (5, 5)
    """
    sources = np.arange(n)
    # successors 1, ..., n - 1 followed by 0 to close the cycle
    targets = np.append(np.arange(1, n), 0)
    weights = np.ones(len(sources), dtype=int)
    adjacency = sparse.csr_matrix((weights, (sources, targets)), shape=(n, n))
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = cyclic_position(n)
    return graph
def save(folder: str, data: Union[sparse.csr_matrix, Bunch]):
    """Save a Bunch or a CSR matrix in the current directory to a collection of Numpy and Pickle files for faster
    subsequent loads.

    An existing folder with the same name is deleted first. A CSR matrix is wrapped in a Bunch,
    as ``adjacency`` if it is square and as ``biadjacency`` otherwise.

    Parameters
    ----------
    folder : str
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    folder = expanduser(folder)
    if exists(folder):
        shutil.rmtree(folder)

    if isinstance(data, sparse.csr_matrix):
        attribute = 'adjacency' if is_square(data) else 'biadjacency'
        wrapper = Bunch()
        setattr(wrapper, attribute, data)
        data = wrapper

    # absolute folders are used as given, relative ones are resolved from the current directory
    prefix = '' if isabs(folder) else './'
    save_to_numpy_bundle(data, folder, prefix)
def watts_strogatz(n: int = 100, degree: int = 6, prob: float = 0.05, metadata: bool = False) \
        -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Start from a ring where each node is linked to its ``degree // 2`` next nodes (and, by symmetry,
    to its ``degree // 2`` previous nodes), then rewire each edge with probability ``prob``.
    Random draws use the global NumPy generator (``np.random``), which is not seeded here.

    Parameters
    ----------
    n : int
        Number of nodes.
    degree : int
        Initial degree of nodes.
    prob : float
        Probability of edge modification.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    # ring lattice: node i is linked to nodes i + 1, ..., i + degree // 2 (modulo n)
    edges = np.array([(i, (i + j + 1) % n) for i in range(n) for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)), shape=(n, n))
    # symmetrize; the LIL format allows assigning individual entries below
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    set_reference = set(np.arange(n))
    for i in range(n):
        # possible new neighbors of i: neither i itself nor a neighbor of i when the row is first visited
        candidates = list(set_reference - set(adjacency.rows[i]) - {i})
        # NOTE(review): entries of row i are assigned while adjacency.rows[i] is being iterated, and
        # candidates is not refreshed after a rewiring -- confirm this is the intended behavior.
        for j in adjacency.rows[i]:
            if np.random.random() < prob:
                # replace edge (i, j) by edge (i, node), keeping the matrix symmetric
                node = np.random.choice(candidates)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency)
    if metadata:
        # nodes evenly spread on the unit circle
        t = 2 * pi * np.arange(n).astype(float) / n
        x = np.cos(t)
        y = np.sin(t)
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = np.array((x, y)).T
        return graph
    else:
        return adjacency
def house(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """House graph.

    * Undirected graph
    * 5 nodes, 6 edges

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> adjacency.shape
    (5, 5)
    """
    # each undirected edge is listed once, then the matrix is symmetrized
    edges = np.array([(0, 1), (0, 4), (1, 2), (1, 4), (2, 3), (3, 4)])
    sources, targets = edges[:, 0], edges[:, 1]
    half = sparse.csr_matrix((np.ones(len(sources), dtype=int), (sources, targets)), shape=(5, 5))
    adjacency = (half + half.T).astype(bool)
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = np.array([[0, -1, -1, 1, 1],
                               [2, 1, -1, -1, 1]]).T
    graph.name = 'house'
    return graph
def grid(n1: int = 10, n2: int = 10, metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Grid (undirected).

    Nodes are the pairs ``(i1, i2)``, indexed in row-major order; each node is linked to its
    neighbors along both dimensions.

    Parameters
    ----------
    n1, n2 : int
        Grid dimension.
    metadata : bool
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import grid
    >>> adjacency = grid(10, 5)
    >>> adjacency.shape
    (50, 50)
    """
    nodes = [(i1, i2) for i1 in range(n1) for i2 in range(n2)]
    index = dict(zip(nodes, range(len(nodes))))

    # edges along the first dimension, then along the second one
    along_first = [(index[i1, i2], index[i1 + 1, i2]) for i1 in range(n1 - 1) for i2 in range(n2)]
    along_second = [(index[i1, i2], index[i1, i2 + 1]) for i1 in range(n1) for i2 in range(n2 - 1)]
    adjacency = edgelist2adjacency(along_first + along_second, undirected=True)
    if not metadata:
        return adjacency

    graph = Bunch()
    graph.adjacency = adjacency
    graph.position = np.array(nodes)
    return graph
def save(folder: Union[str, Path], data: Union[sparse.csr_matrix, Bunch]):
    """Save a Bunch or a CSR matrix in the current directory to a collection of Numpy and Pickle files for faster
    subsequent loads. Supported attribute types include sparse matrices, NumPy arrays, strings and Bunch.

    An existing folder with the same name is deleted first. A CSR matrix is wrapped in a Bunch,
    as ``adjacency`` if it is square and as ``biadjacency`` otherwise.

    Parameters
    ----------
    folder : str or :class:`pathlib.Path`
        The name to be used for the bundle folder
    data : Union[sparse.csr_matrix, Bunch]
        The data to save

    Example
    -------
    >>> from sknetwork.data import save
    >>> graph = Bunch()
    >>> graph.adjacency = sparse.csr_matrix(np.random.random((10, 10)) < 0.2)
    >>> graph.names = np.array(list('abcdefghij'))
    >>> save('random_data', graph)
    >>> 'random_data' in listdir('.')
    True
    """
    folder = Path(folder).expanduser()
    if folder.exists():
        shutil.rmtree(folder)

    if isinstance(data, sparse.csr_matrix):
        attribute = 'adjacency' if is_square(data) else 'biadjacency'
        wrapper = Bunch()
        setattr(wrapper, attribute, data)
        data = wrapper

    # root used by save_to_numpy_bundle: '/' for absolute folders, '.' for relative ones
    root = '/' if folder.is_absolute() else '.'
    save_to_numpy_bundle(data, folder, root)
def load_konect_dataset(dataset_name: str, data_home: Optional[str] = None, auto_numpy_bundle: bool = True):
    """Load a dataset from the `Konect database <http://konect.uni-koblenz.de>`_.

    The archive is downloaded and unpacked in ``data_home`` on the first call. On later calls, the dataset
    is read from the local folder, or from the Numpy bundle if one has been saved.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the
        corresponding name is ``'actor-movie'``).
    data_home: str
        The folder to be used for dataset storage
    auto_numpy_bundle: bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster
        subsequent access to the dataset (True).

    Returns
    -------
    data: :class:`Bunch`
        An object with the following attributes:

        * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
        * `meta`: a dictionary containing the metadata as specified by Konect
        * any attribute described in an ent.* file

    Raises
    ------
    ValueError
        If the download fails with an HTTP error (unknown dataset name).
    """
    from os import rmdir  # only needed to clean up after a failed download

    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    archive = data_home + '/' + dataset_name + '.tar.bz2'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.uni-koblenz.de/downloads/tsv/' + dataset_name + '.tar.bz2', archive)
            with tarfile.open(archive, 'r:bz2') as tar_ref:
                # NOTE(review): extractall trusts the member paths of the downloaded archive.
                tar_ref.extractall(data_home)
        except HTTPError as error:
            # Remove the empty dataset folder, otherwise the next call would skip the download
            # and silently return an empty Bunch.
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset_name) from error
        finally:
            # the archive is useless once unpacked, and must not be left behind on failure either
            if exists(archive):
                remove(archive)
    elif exists(data_path + '/' + dataset_name + '_bundle'):
        return load_from_numpy_bundle(dataset_name + '_bundle', data_path)

    data = Bunch()
    files = [file for file in listdir(data_path) if dataset_name in file]

    # edge list (out.* file)
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        # NOTE(review): the first element of the value returned by parse_tsv is used as the matrix --
        # confirm this matches the return type of parse_tsv.
        parsed = parse_tsv(data_path + file, directed=directed, bipartite=bipartite, weighted=weighted)[0]
        if bipartite:
            data.biadjacency = parsed
        else:
            data.adjacency = parsed

    # metadata (meta.* file)
    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        data.meta = parse_metadata(data_path + metadata[0])

    # node attributes (ent.* files), named after the file extension
    attributes = [file for file in files if 'ent.' + dataset_name in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = parse_labels(data_path + file)

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset_name + '_bundle', data_path)

    return data
def watts_strogatz(n: int = 100, degree: int = 6, prob: float = 0.05, seed: Optional[int] = None,
                   metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Watts-Strogatz model.

    Start from a ring where each node is linked to its ``degree // 2`` next nodes (and, by symmetry,
    to its ``degree // 2`` previous nodes), then rewire each edge with probability ``prob``.
    The global NumPy generator is seeded with ``seed`` at each call.

    Parameters
    ----------
    n :
        Number of nodes.
    degree :
        Initial degree of nodes.
    prob :
        Probability of edge modification.
    seed :
        Seed of the random generator (optional).
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (positions).

    Example
    -------
    >>> from sknetwork.data import watts_strogatz
    >>> adjacency = watts_strogatz(30, 4, 0.02)
    >>> adjacency.shape
    (30, 30)

    References
    ----------
    Watts, D., Strogatz, S. (1998). Collective dynamics of small-world networks, Nature.
    """
    # seeds the global NumPy random state, used by the draws below
    np.random.seed(seed)
    # ring lattice: node i is linked to nodes i + 1, ..., i + degree // 2 (modulo n)
    edges = np.array([(i, (i + j + 1) % n) for i in range(n) for j in range(degree // 2)])
    row, col = edges[:, 0], edges[:, 1]
    adjacency = sparse.coo_matrix((np.ones_like(row, int), (row, col)), shape=(n, n))
    # symmetrize; the LIL format allows assigning individual entries below
    adjacency = sparse.lil_matrix(adjacency + adjacency.T)
    nodes = np.arange(n)
    for i in range(n):
        neighbors = adjacency.rows[i]
        # possible new neighbors of i: neither i itself nor a neighbor of i when the row is first visited
        candidates = list(set(nodes) - set(neighbors) - {i})
        # NOTE(review): entries of row i are assigned while `neighbors` (adjacency.rows[i]) is being
        # iterated, and candidates is not refreshed after a rewiring -- confirm this is intended.
        for j in neighbors:
            if np.random.random() < prob:
                # replace edge (i, j) by edge (i, node), keeping the matrix symmetric
                node = np.random.choice(candidates)
                adjacency[i, node] = 1
                adjacency[node, i] = 1
                adjacency[i, j] = 0
                adjacency[j, i] = 0
    adjacency = sparse.csr_matrix(adjacency, shape=adjacency.shape)
    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        graph.position = cyclic_position(n)
        return graph
    else:
        return adjacency
def load_konect(dataset: str, data_home: Optional[Union[str, Path]] = None, auto_numpy_bundle: bool = True,
                verbose: bool = True) -> Bunch:
    """Load a dataset from the `Konect database <http://konect.cc/networks/>`_.

    The archive is downloaded and unpacked in ``data_home`` on the first call. On later calls, the dataset
    is read from the local folder, or from the Numpy bundle if one has been saved.

    Parameters
    ----------
    dataset : str
        The internal name of the dataset as specified on the Konect website (e.g. for the Zachary Karate club dataset,
        the corresponding name is ``'ucidata-zachary'``).
    data_home : str or :class:`pathlib.Path`
        The folder to be used for dataset storage
    auto_numpy_bundle : bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster
        subsequent access to the dataset (True).
    verbose : bool
        Enable verbosity.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

        * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
        * `meta`: a dictionary containing the metadata as specified by Konect
        * each attribute specified by Konect (ent.* file)

    Raises
    ------
    ValueError
        If the dataset name is empty, or if the archive cannot be retrieved or read.
    RuntimeError
        If Konect cannot be reached.

    Notes
    -----
    An attribute `meta` of the `Bunch` class is used to store information about the dataset if present. In any case,
    `meta` has the attribute `name` which, if not given, is equal to the name of the dataset as passed to this function.

    References
    ----------
    Kunegis, J. (2013, May).
    `Konect: the Koblenz network collection.
    <https://dl.acm.org/doi/abs/10.1145/2487788.2488173>`_
    In Proceedings of the 22nd International Conference on World Wide Web (pp. 1343-1350).
    """
    logger = Log(verbose)
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         + "\nExamples include 'actor-movie' and 'ego-facebook'."
                         + "\n See 'http://konect.cc/networks/' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    archive = data_home / (dataset + '.tar.bz2')
    bundle_name = dataset + '_bundle'

    if not data_path.exists():
        logger.print('Downloading', dataset, 'from Konect...')
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.cc/files/download.tsv.' + dataset + '.tar.bz2', archive)
            with tarfile.open(archive, 'r:bz2') as tar_ref:
                logger.print('Unpacking archive...')
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nExamples include 'actor-movie' and 'ego-facebook'."
                             + "\n See 'http://konect.cc/networks/' for the full list.")
        except (URLError, ConnectionResetError):  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Konect.")
        finally:
            # the archive is never kept, whether the download succeeded or not
            if exists(archive):
                remove(archive)
    elif exists(data_path / bundle_name):
        logger.print('Loading from local bundle...')
        return load_from_numpy_bundle(bundle_name, data_path)

    data = Bunch()
    files = [name for name in listdir(data_path) if dataset in name]
    logger.print('Parsing files...')

    # edge list (out.* file)
    edge_files = [name for name in files if 'out.' in name]
    if edge_files:
        edge_file = data_path / edge_files[0]
        directed, bipartite, weighted = load_header(edge_file)
        graph = load_edge_list(edge_file, directed=directed, bipartite=bipartite, weighted_input=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    # metadata (meta.* file)
    meta_files = [name for name in files if 'meta.' in name]
    if meta_files:
        data.meta = load_metadata(data_path / meta_files[0])

    # node attributes (ent.* files), named after the file extension
    for name in [name for name in files if 'ent.' + dataset in name]:
        data[name.split('.')[-1]] = load_labels(data_path / name)

    # the metadata always carry a name, defaulting to the name of the dataset
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
        data.meta.name = dataset
    elif not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, bundle_name, data_path)

    return data
def block_model(sizes: np.ndarray, p_in: Union[float, list, np.ndarray] = .2, p_out: float = .05,
                seed: Optional[int] = None, metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Stochastic block model.

    Parameters
    ----------
    sizes :
         Block sizes.
    p_in :
        Probability of connection within blocks (a single value, or one value per block).
    p_out :
        Probability of connection across blocks (must not exceed the smallest value of **p_in**).
    seed : Optional[int]
        Random seed; seeds the global NumPy generator.
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels).

    Raises
    ------
    ValueError
        If **p_out** is greater than the smallest value of **p_in**.

    Example
    -------
    >>> from sknetwork.data import block_model
    >>> sizes = np.array([4, 5])
    >>> adjacency = block_model(sizes)
    >>> adjacency.shape
    (9, 9)

    References
    ----------
    Airoldi, E., Blei, D., Feinberg, S., Xing, E. (2007).
    `Mixed membership stochastic blockmodels. <https://arxiv.org/pdf/0705.4485.pdf>`_
    Journal of Machine Learning Research.
    """
    np.random.seed(seed)
    sizes = np.array(sizes)
    if not isinstance(p_in, np.ndarray):
        # broadcast a scalar (or convert a list) to an array with one probability per block
        p_in = p_in * np.ones_like(sizes)
    if p_in.min() < p_out:
        raise ValueError('The probability of connection across blocks p_out must be less than the probability of '
                         'connection within a block p_in.')

    # each edge is considered twice
    p_in = p_in / 2
    p_out = p_out / 2

    # the links drawn with probability p_out cover the whole matrix, blocks included,
    # so only the difference is drawn within the blocks
    p_diff = p_in - p_out
    blocks_in = [(sparse.random(s, s, p_diff[k]) > 0) for k, s in enumerate(sizes)]
    adjacency_in = sparse.block_diag(blocks_in)
    n = sizes.sum()
    adjacency_out = sparse.random(n, n, p_out) > 0
    adjacency = sparse.lil_matrix(adjacency_in + adjacency_out)
    # no self-loop, then symmetrize
    adjacency.setdiag(0)
    adjacency = adjacency + adjacency.T
    adjacency = sparse.csr_matrix(adjacency).astype(int)
    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        # label of a node = index of its block
        labels = np.repeat(np.arange(len(sizes)), sizes)
        graph.labels = labels
        return graph
    else:
        return adjacency
def load_netset(dataset: str, data_home: Optional[str] = None) -> Bunch:
    """Load a dataset from the `NetSets database
    <https://graphs.telecom-paristech.fr/>`_.

    The archive is downloaded and unpacked in ``data_home`` if the folder of the dataset does not exist yet.
    Each known ``.npz`` / ``.npy`` file found in this folder becomes an attribute of the returned object.

    Parameters
    ----------
    dataset : str
        The name of the dataset (all low-case). Examples include 'openflights', 'cinema' and 'wikivitals'.
    data_home : str
        The folder to be used for dataset storage.

    Returns
    -------
    graph : :class:`Bunch`

    Raises
    ------
    ValueError
        If the dataset name is empty or if the download fails with an HTTP error.

    Example
    -------
    >>> from sknetwork.data import load_netset
    >>> graph = load_netset('openflights')
    >>> graph.adjacency.shape
    (3097, 3097)
    """
    if dataset == '':
        raise ValueError("Please specify the dataset (e.g., 'openflights' or 'wikivitals').")
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'

    if not exists(data_path):
        archive = data_home + '/' + dataset + '_npz.tar.gz'
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve("https://graphs.telecom-paristech.fr/datasets_npz/" + dataset + '_npz.tar.gz', archive)
        except HTTPError:
            rmdir(data_home + '/' + dataset)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nAvailable datasets include 'openflights' and 'wikivitals'."
                             + "\nSee <https://graphs.telecom-paristech.fr/>")
        with tarfile.open(archive, 'r:gz') as tar_ref:
            tar_ref.extractall(data_home)
        remove(archive)

    graph = Bunch()
    files = listdir(data_path)

    # sparse matrices
    for attribute in ('adjacency', 'biadjacency'):
        if attribute + '.npz' in files:
            setattr(graph, attribute, sparse.load_npz(data_path + '/' + attribute + '.npz'))

    # dense arrays, loaded in a fixed order
    array_attributes = ('names', 'names_row', 'names_col', 'labels', 'labels_row', 'labels_col',
                        'labels_hierarchy', 'names_labels', 'names_labels_hierarchy', 'position')
    for attribute in array_attributes:
        if attribute + '.npy' in files:
            setattr(graph, attribute, np.load(data_path + '/' + attribute + '.npy'))

    graph.meta = Bunch()
    graph.meta.name = dataset
    return graph
def load_graphml(file: str, weight_key: str = 'weight', max_string_size: int = 512) -> Bunch:
    """Parse GraphML datasets.

    Hyperedges and nested graphs are not supported.

    Parameters
    ----------
    file: str
        The path to the dataset
    weight_key: str
        The key to be used as a value for edge weights
    max_string_size: int
        The maximum size for string features of the data

    Returns
    -------
    data: :class:`Bunch`
        The dataset in a bunch with the adjacency as a CSR matrix.
        Depending on the file, the bunch also has the attributes ``names`` (node identifiers),
        ``node_attribute``, ``edge_attribute`` and ``meta`` (descriptions).

    Raises
    ------
    ValueError
        If the file does not define any graph.
    """
    # see http://graphml.graphdrawing.org/primer/graphml-primer.html
    # and http://graphml.graphdrawing.org/specification/dtd.html#top
    tree = ElementTree.parse(file)
    n_nodes = 0
    n_edges = 0
    symmetrize = None
    naming_nodes = True
    default_weight = 1
    weight_type = bool
    weight_id = None
    # indices in the graph tree
    node_indices = []
    edge_indices = []
    data = Bunch()
    graph = None
    file_description = None
    attribute_descriptions = Bunch()
    attribute_descriptions.node = Bunch()
    attribute_descriptions.edge = Bunch()
    keys = {}

    # first pass: locate the graph, count nodes and edges (an undirected edge is stored in both directions)
    for file_element in tree.getroot():
        if file_element.tag.endswith('graph'):
            graph = file_element
            symmetrize = (graph.attrib['edgedefault'] == 'undirected')
            for index, element in enumerate(graph):
                if element.tag.endswith('node'):
                    node_indices.append(index)
                    n_nodes += 1
                elif element.tag.endswith('edge'):
                    edge_indices.append(index)
                    if 'directed' in element.attrib:
                        if element.attrib['directed'] == 'true':
                            n_edges += 1
                        else:
                            n_edges += 2
                    elif symmetrize:
                        n_edges += 2
                    else:
                        n_edges += 1
            if 'parse.nodeids' in graph.attrib:
                # canonical node ids (a prefix character followed by the index) need no name mapping
                naming_nodes = not (graph.attrib['parse.nodeids'] == 'canonical')

    # second pass: attribute declarations (key elements) and description of the file
    for file_element in tree.getroot():
        if file_element.tag.endswith('key'):
            attribute_name = file_element.attrib['attr.name']
            attribute_type = java_type_to_python_type(file_element.attrib['attr.type'])
            if attribute_name == weight_key:
                weight_type = java_type_to_python_type(file_element.attrib['attr.type'])
                weight_id = file_element.attrib['id']
                for key_element in file_element:
                    # Tags are tested with endswith, as everywhere else in this function, so that the
                    # default weight is also found when tags carry an XML namespace prefix.
                    if key_element.tag.endswith('default'):
                        default_weight = attribute_type(key_element.text)
            else:
                default_value = None
                if file_element.attrib['for'] == 'node':
                    size = n_nodes
                    if 'node_attribute' not in data:
                        data.node_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.node[attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.node_attribute[attribute_name] = np.full(size, default_value, dtype=local_type)
                    else:
                        data.node_attribute[attribute_name] = np.zeros(size, dtype=local_type)
                elif file_element.attrib['for'] == 'edge':
                    size = n_edges
                    if 'edge_attribute' not in data:
                        data.edge_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.edge[attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.edge_attribute[attribute_name] = np.full(size, default_value, dtype=local_type)
                    else:
                        data.edge_attribute[attribute_name] = np.zeros(size, dtype=local_type)
                # key id -> [attribute name, attribute type], used to decode the data elements below
                keys[file_element.attrib['id']] = [attribute_name, attribute_type]
        elif file_element.tag.endswith('desc'):
            file_description = file_element.text

    if file_description or attribute_descriptions.node or attribute_descriptions.edge:
        data.meta = Bunch()
        if file_description:
            data.meta['description'] = file_description
        if attribute_descriptions.node or attribute_descriptions.edge:
            data.meta['attributes'] = attribute_descriptions

    if graph is None:
        raise ValueError(f'No graph defined in {file}.')

    row = np.zeros(n_edges, dtype=int)
    col = np.zeros(n_edges, dtype=int)
    dat = np.full(n_edges, default_weight, dtype=weight_type)
    data.names = None
    if naming_nodes:
        # Keep the historical width of 512 characters as a minimum, but honor a larger
        # max_string_size so that long node identifiers are not silently truncated.
        data.names = np.zeros(n_nodes, dtype='<U' + str(max(512, max_string_size)))

    node_map = {}
    # deal with nodes first
    for number, index in enumerate(node_indices):
        node = graph[index]
        if naming_nodes:
            name = node.attrib['id']
            data.names[number] = name
            node_map[name] = number
        for node_attribute in node:
            if node_attribute.tag.endswith('data'):
                data.node_attribute[keys[node_attribute.attrib['key']][0]][number] = \
                    keys[node_attribute.attrib['key']][1](node_attribute.text)

    # deal with edges
    edge_index = -1
    for index in edge_indices:
        edge_index += 1
        duplicate = False
        edge = graph[index]
        if naming_nodes:
            node1 = node_map[edge.attrib['source']]
            node2 = node_map[edge.attrib['target']]
        else:
            # canonical ids: drop the leading character and read the index
            node1 = int(edge.attrib['source'][1:])
            node2 = int(edge.attrib['target'][1:])
        row[edge_index] = node1
        col[edge_index] = node2
        for edge_attribute in edge:
            if edge_attribute.tag.endswith('data'):
                if edge_attribute.attrib['key'] == weight_id:
                    dat[edge_index] = weight_type(edge_attribute.text)
                else:
                    data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                        keys[edge_attribute.attrib['key']][1](edge_attribute.text)
        # the attribute of the edge, if any, overrides the default direction of the graph
        if 'directed' in edge.attrib:
            if edge.attrib['directed'] != 'true':
                duplicate = True
        elif symmetrize:
            duplicate = True
        if duplicate:
            # store the reverse edge, with the same weight and attributes
            edge_index += 1
            row[edge_index] = node2
            col[edge_index] = node1
            for edge_attribute in edge:
                if edge_attribute.tag.endswith('data'):
                    if edge_attribute.attrib['key'] == weight_id:
                        dat[edge_index] = weight_type(edge_attribute.text)
                    else:
                        data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                            keys[edge_attribute.attrib['key']][1](edge_attribute.text)

    data.adjacency = sparse.csr_matrix((dat, (row, col)), shape=(n_nodes, n_nodes))
    if data.names is None:
        data.pop('names')
    return data
def block_model(sizes: Iterable, p_in: Union[float, list, np.ndarray] = .2, p_out: float = .05,
                random_state: Optional[int] = None, metadata: bool = False) \
        -> Union[sparse.csr_matrix, Bunch]:
    """Stochastic block model.

    Parameters
    ----------
    sizes :
         Block sizes.
    p_in :
        Probability of connection within blocks (a single value, or one value per block).
    p_out :
        Probability of connection across blocks.
    random_state :
        Seed of the random generator (optional); seeds the global NumPy generator.
    metadata :
        If ``True``, return a `Bunch` object with metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Adjacency matrix or graph with metadata (labels).

    Example
    -------
    >>> from sknetwork.data import block_model
    >>> sizes = np.array([4, 5])
    >>> adjacency = block_model(sizes)
    >>> adjacency.shape
    (9, 9)

    References
    ----------
    Airoldi, E., Blei, D., Feinberg, S., Xing, E. (2007).
    `Mixed membership stochastic blockmodels. <https://arxiv.org/pdf/0705.4485.pdf>`_
    Journal of Machine Learning Research.
    """
    np.random.seed(random_state)
    sizes = np.array(sizes)

    if np.ndim(p_in) == 0:
        # Any scalar, including a Python int such as p_in=1, gives the same probability to all blocks.
        # Testing for float only would turn an int into a 0-d array that cannot be indexed below.
        p_in = p_in * np.ones_like(sizes)
    else:
        p_in = np.array(p_in)

    # each edge is considered twice
    p_in = p_in / 2

    # upper block-triangular matrix: diagonal blocks drawn with p_in, blocks above with p_out
    matrix = []
    for i, a in enumerate(sizes):
        row = []
        for j, b in enumerate(sizes):
            if j < i:
                row.append(None)
            elif j > i:
                row.append(sparse.random(a, b, p_out, dtype=bool))
            else:
                row.append(sparse.random(a, a, p_in[i], dtype=bool))
        matrix.append(row)
    adjacency = sparse.bmat(matrix)
    # no self-loop, then symmetrize
    adjacency.setdiag(0)
    adjacency = directed2undirected(adjacency.tocsr(), weighted=False)

    if metadata:
        graph = Bunch()
        graph.adjacency = adjacency
        # label of a node = index of its block
        labels = np.repeat(np.arange(len(sizes)), sizes)
        graph.labels = labels
        return graph
    else:
        return adjacency
def parse_tsv(file: str, directed: bool = False, bipartite: bool = False, weighted: Optional[bool] = None,
              named: Optional[bool] = None, comment: str = '%#', delimiter: str = None, reindex: bool = True) -> Bunch:
    """Parser for Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets.

    The file is read twice: once to skip the header and guess the format from the first lines,
    once to load the edges.

    Parameters
    ----------
    file : str
        The path to the dataset in TSV format
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : Optional[bool]
        Retrieves the weights in the third field of the file. None makes a guess based on the first lines.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        delimiter used in the file. None makes a guess
    reindex : bool
        If ``True``, the size of the returned matrix is the number of distinct nodes found in the file; nodes with
        numeric values that are not exactly 0, ..., n - 1 are renumbered and their original values are returned
        as names. If ``False``, the size is determined by the maximum of the node values (plus one).

    Returns
    -------
    graph: :class:`Bunch`
        Bunch with attribute ``adjacency`` (or ``biadjacency`` if ``bipartite`` is ``True``) and, depending on
        the options, ``names`` (and ``names_row``, ``names_col`` in the bipartite case).
    """
    reindexed = False
    # starts at -1 because the loop below also counts the first line that is not a comment
    header_len = -1
    possible_delimiters = ['\t', ',', ' ']
    del_count = np.zeros(3, dtype=int)
    lines = []
    # initialized with the comment characters so that the loop below is entered
    row = comment
    with open(file, 'r', encoding='utf-8') as f:
        # skip the header: leading lines whose first character is a comment character
        while row[0] in comment:
            row = f.readline()
            header_len += 1
        # sample three lines, counting for each candidate delimiter the lines where it appears
        for line in range(3):
            for i, poss_del in enumerate(possible_delimiters):
                if poss_del in row:
                    del_count[i] += 1
            lines.append(row.rstrip())
            row = f.readline()
        lines = [line for line in lines if line != '']
        guess_delimiter = possible_delimiters[int(np.argmax(del_count))]
        # weighted unless some sampled line has exactly one delimiter (i.e., two fields)
        guess_weighted = bool(min([line.count(guess_delimiter) for line in lines]) - 1)
        # named unless the first two fields of all sampled lines are made of digits
        guess_named = not all([all([el.strip().isdigit() for el in line.split(guess_delimiter)][0:2])
                               for line in lines])
    if weighted is None:
        weighted = guess_weighted
    if named is None:
        named = guess_named
    if delimiter is None:
        delimiter = guess_delimiter

    # second read: load the edges
    row, col, data = [], [], []
    with open(file, 'r', encoding='utf-8') as f:
        for i in range(header_len):
            f.readline()
        csv_reader = reader(f, delimiter=delimiter)
        for line in csv_reader:
            # rows whose first field is contained in the comment string are ignored
            if line[0] not in comment:
                if named:
                    row.append(line[0])
                    col.append(line[1])
                else:
                    row.append(int(line[0]))
                    col.append(int(line[1]))
                if weighted:
                    data.append(float(line[2]))
    n_edges = len(row)

    graph = Bunch()
    if bipartite:
        # rows and columns are indexed independently, by rank among the sorted distinct values
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = max(names_row) + 1
            n_col = max(names_col) + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        biadjacency = sparse.csr_matrix((data, (row, col)), shape=(n_row, n_col))
        graph.biadjacency = biadjacency
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # sources and targets share the same index space
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = max(names) + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # numeric nodes are renumbered only if they are not already 0, ..., n - 1
            if not all(names == range(len(names))) and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=int)
        adjacency = sparse.csr_matrix((data, (row, col)), shape=(n_nodes, n_nodes))
        if not directed:
            adjacency += adjacency.T
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph
def from_edge_list(row: np.ndarray, col: np.ndarray, data: np.ndarray, directed: bool = False,
                   bipartite: bool = False, reindex: bool = True, named: Optional[bool] = None) -> Bunch:
    """Turn an edge list given as a triplet of NumPy arrays into a :class:`Bunch`.

    Parameters
    ----------
    row : np.ndarray
        The array of sources in the graph.
    col : np.ndarray
        The array of targets in the graph.
    data : np.ndarray
        The array of weights in the graph. Pass an empty array for unweighted graphs.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    reindex : bool
        If ``True``, the matrix has one row/column per distinct node; numeric ids that are not already
        0, ..., n - 1 are renumbered and the original ids are returned in ``names``.
        If ``False``, the size of the matrix is the largest id plus one.
        For bipartite graphs, rows and columns are always renumbered; ``reindex`` only selects the shape.
    named : Optional[bool]
        If ``True``, node ids are treated as names: nodes are renumbered and the sorted distinct names
        are returned. ``None`` makes a guess from the dtypes of ``row`` and ``col``: anything other than
        two integer arrays is considered named.

    Returns
    -------
    graph: :class:`Bunch`
        * `adjacency` (or `biadjacency` if ``bipartite``): the sparse matrix in CSR format.
        * `names`: sorted distinct node ids, only if the nodes are named or were renumbered.
        * `names_row`, `names_col`: bipartite case only, if ``named`` or ``reindex``
          (`names` is then the same array as `names_row`).
    """
    row = np.asarray(row)
    col = np.asarray(col)
    if named is None:
        # Comparing a dtype with the builtin ``int`` only matches the platform default integer,
        # so e.g. int32 ids on Linux would wrongly be treated as names; accept any integer dtype.
        numeric = np.issubdtype(row.dtype, np.integer) and np.issubdtype(col.dtype, np.integer)
        named = not numeric
    weighted = bool(len(data))
    n_edges = len(row)
    reindexed = False

    graph = Bunch()
    if bipartite:
        # Rows and columns live in separate index spaces and are renumbered independently.
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = names_row.max() + 1
            n_col = names_col.max() + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        graph.biadjacency = sparse.csr_matrix((data, (row, col)), shape=(n_row, n_col))
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # Sources and targets share one index space: renumber them together, then split again.
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = names.max() + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # ``names`` is sorted and unique, so ids are exactly 0..n_nodes-1 iff both ends match.
            should_reindex = not (names[0] == 0 and names[-1] == n_nodes - 1)
            if should_reindex and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        adjacency = sparse.csr_matrix((data, (row, col)), shape=(n_nodes, n_nodes))
        if not directed:
            adjacency = directed2undirected(adjacency, weighted=weighted)
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph
def load_konect(dataset: str, data_home: Optional[str] = None, auto_numpy_bundle: bool = True) -> Bunch:
    """Load a dataset from the `Konect database <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the
        corresponding name is ``'actor-movie'``).
    data_home : str
        The folder to be used for dataset storage
    auto_numpy_bundle : bool
        Denotes if the dataset should be stored in its default format (False) or using Numpy files for faster
        subsequent access to the dataset (True).

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Raises
    ------
    ValueError
        If ``dataset`` is empty, or if the archive cannot be downloaded (HTTP error) or read.

    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)

    Notes
    -----
    An attribute `meta` of the `Bunch` class is used to store information about the dataset if present. In any case,
    `meta` has the attribute `name` which, if not given, is equal to the name of the dataset as passed to this function.
    """
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         + "\nExamples include 'actor-movie' and 'ego-facebook'."
                         + "\n See 'http://konect.uni-koblenz.de' for the full list.")
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        archive = data_home + '/' + dataset + '.tar.bz2'
        try:
            urlretrieve('http://konect.uni-koblenz.de/downloads/tsv/' + dataset + '.tar.bz2', archive)
            with tarfile.open(archive, 'r:bz2') as tar_ref:
                # NOTE(review): the archive comes from the network and member paths are not validated
                # before extraction - consider restricting extraction to ``data_home``.
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError) as error:
            # Remove the directory created above so that a later call retries the download.
            rmdir(data_home + '/' + dataset)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nExamples include 'actor-movie' and 'ego-facebook'."
                             + "\n See 'http://konect.uni-koblenz.de' for the full list.") from error
        finally:
            # The archive does not exist if the download itself failed; an unconditional remove
            # would raise FileNotFoundError and hide the ValueError raised above.
            if exists(archive):
                remove(archive)
    elif exists(data_path + '/' + dataset + '_bundle'):
        # A bundle saved by a previous call is available: load it instead of parsing the raw files.
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()
    files = [file for file in listdir(data_path) if dataset in file]

    # Edge list ('out.*' file).
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = parse_header(data_path + file)
        graph = parse_tsv(data_path + file, directed=directed, bipartite=bipartite, weighted=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    # Metadata ('meta.*' file).
    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = parse_metadata(data_path + file)

    # Node attributes ('ent.<dataset>.*' files), stored under the last component of the file name.
    attributes = [file for file in files if 'ent.' + dataset in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = parse_labels(data_path + file)

    # Guarantee that ``data.meta.name`` exists.
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
        data.meta.name = dataset
    elif not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
def load_wikilinks_dataset(dataset_name: str, data_home: Optional[str] = None, max_depth: int = 1,
                           full_path: bool = True):
    """
    Loads a dataset from the `WikiLinks database
    <https://graphs.telecom-paristech.fr/Home_page.html#wikilinks-section>`_.

    Parameters
    ----------
    dataset_name: str
        The name of the dataset (all lowercase). Currently, 'wikivitals' and 'wikihumans' are available.
    data_home: str
        The folder to be used for dataset storage
    max_depth: int
        Denotes the maximum depth to use for the categories (if relevant)
    full_path: bool
        If ``True``, each label is the category path joined up to ``max_depth`` levels;
        if ``False``, only the deepest of these levels is kept (if relevant)

    Returns
    -------
    data: :class:`Bunch`
        An object with some of the following attributes (depending on the dataset):

        * `adjacency`: the adjacency matrix of the graph in CSR format
        * `biadjacency`: the biadjacency matrix of the graph in CSR format
        * `feature_names`: the array of the names for the features
        * `names`: the titles of the articles
        * `target_names`: one category label per entry of the stored ``target_names`` file, truncated
          as specified with `max_depth` and `full_path`
        * `target`: for each entry of `target_names`, the index of its label among the sorted distinct labels

    Raises
    ------
    ValueError
        If the archive cannot be downloaded (HTTP error) or read.
    """
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/' + dataset_name + '/'
    if not exists(data_path):
        makedirs(data_path, exist_ok=True)
        archive = data_home + '/' + dataset_name + '_npz.tar.gz'
        try:
            urlretrieve("https://graphs.telecom-paristech.fr/npz_datasets/" + dataset_name + '_npz.tar.gz',
                        archive)
            with tarfile.open(archive, 'r:gz') as tar_ref:
                # NOTE(review): the archive comes from the network and member paths are not validated
                # before extraction - consider restricting extraction to ``data_home``.
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError) as error:
            # Without this cleanup the empty directory would make the next call skip the download
            # and silently return an empty Bunch.
            rmdir(data_home + '/' + dataset_name)
            raise ValueError('Invalid dataset ' + dataset_name) from error
        finally:
            # The archive is absent when the download itself failed.
            if exists(archive):
                remove(archive)

    data = Bunch()
    files = listdir(data_path)

    if 'adjacency.npz' in files:
        data.adjacency = sparse.load_npz(data_path + '/adjacency.npz')
    if 'biadjacency.npz' in files:
        data.biadjacency = sparse.load_npz(data_path + '/biadjacency.npz')
    if 'names.npy' in files:
        data.names = np.load(data_path + '/names.npy')
    if 'feature_names.npy' in files:
        data.feature_names = np.load(data_path + '/feature_names.npy')
    if 'target_names.npy' in files:
        tmp_target_names = np.load(data_path + '/target_names.npy')
        tags = []
        for tag in tmp_target_names:
            # Categories are dot-separated paths; slicing already clamps to the available depth.
            parts = tag.strip().split('.')[:max_depth]
            if full_path:
                tags.append(".".join(parts))
            else:
                tags.append(parts[-1])
        data.target_names = np.array(tags)
        _, data.target = np.unique(data.target_names, return_inverse=True)

    return data
def load_edge_list(file: str, directed: bool = False, bipartite: bool = False, weighted: Optional[bool] = None,
                   named: Optional[bool] = None, comment: str = '%#', delimiter: str = None, reindex: bool = True,
                   fast_format: bool = True) -> Bunch:
    """Parser for Tabulation-Separated, Comma-Separated or Space-Separated (or other) Values datasets in the form of
    edge lists.

    Parameters
    ----------
    file : str
        The path to the dataset in TSV format
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : Optional[bool]
        Retrieves the weights in the third field of the file. None makes a guess based on the first lines.
    named : Optional[bool]
        Retrieves the names given to the nodes and renumbers them. Returns an additional array. None makes a guess
        based on the first lines.
    comment : str
        Set of characters denoting lines to ignore.
    delimiter : str
        delimiter used in the file. None makes a guess
    reindex : bool
        If ``True``, the matrix has one row/column per distinct node; numeric ids that are not already
        0, ..., n - 1 are renumbered and the original ids are returned in ``names``.
        If ``False``, the size of the matrix is the largest id plus one.
        For bipartite graphs, rows and columns are always renumbered; ``reindex`` only selects the shape.
    fast_format : bool
        If True, assumes that the file is well-formatted:

        * no comments except for the header
        * only 2 or 3 columns
        * only int or float values

    Returns
    -------
    graph: :class:`Bunch`
        * `adjacency` (or `biadjacency` if ``bipartite``): the sparse matrix in CSR format.
        * `names`: sorted distinct node ids, only if the nodes are named or were renumbered.
        * `names_row`, `names_col`: bipartite case only, if ``named`` or ``reindex``.

    Raises
    ------
    ValueError
        If ``fast_format`` is ``True`` but the file cannot be parsed by ``np.fromfile``.
    """
    reindexed = False
    header_len, guess_delimiter, guess_weighted, guess_named, guess_string_present, guess_type = scan_header(file,
                                                                                                             comment)
    if weighted is None:
        weighted = guess_weighted
    if named is None:
        named = guess_named
    if delimiter is None:
        delimiter = guess_delimiter

    with open(file, 'r', encoding='utf-8') as f:
        # Skip the header lines counted by scan_header.
        for _ in range(header_len):
            f.readline()
        if fast_format and not guess_string_present:
            # np.fromfile only emits a DeprecationWarning when it cannot read the file to its end
            # (announced to become a ValueError), so warnings are promoted to errors while parsing.
            # catch_warnings restores the caller's warning filters afterwards, whether or not
            # parsing succeeds, instead of permanently altering the global filter list.
            # NOTE(review): the fast path uses the guessed delimiter even when ``delimiter`` is
            # given explicitly - confirm this is intended.
            with warnings.catch_warnings():
                warnings.simplefilter("error")
                try:
                    parsed = np.fromfile(f, sep=guess_delimiter, dtype=guess_type)
                except (DeprecationWarning, ValueError) as error:
                    raise ValueError('File not suitable for fast parsing. Set fast_format to False.') from error
            n_entries = len(parsed)
            # The flat array of values is folded into one row per edge.
            if weighted:
                parsed.resize((n_entries//3, 3))
                row, col, data = parsed[:, 0], parsed[:, 1], parsed[:, 2]
            else:
                parsed.resize((n_entries//2, 2))
                row, col = parsed[:, 0], parsed[:, 1]
                data = np.ones(row.shape[0], dtype=bool)
        else:
            row, col, data = [], [], []
            csv_reader = reader(f, delimiter=delimiter)
            for line in csv_reader:
                # csv.reader yields an empty list for a blank line; skip those along with
                # lines whose first field is a comment marker.
                if not line or line[0] in comment:
                    continue
                if named:
                    row.append(line[0])
                    col.append(line[1])
                else:
                    row.append(int(line[0]))
                    col.append(int(line[1]))
                if weighted:
                    data.append(float(line[2]))
    n_edges = len(row)

    graph = Bunch()
    if bipartite:
        # Rows and columns live in separate index spaces and are renumbered independently.
        names_row, row = np.unique(row, return_inverse=True)
        names_col, col = np.unique(col, return_inverse=True)
        if not reindex:
            n_row = names_row.max() + 1
            n_col = names_col.max() + 1
        else:
            n_row = len(names_row)
            n_col = len(names_col)
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        graph.biadjacency = sparse.csr_matrix((data, (row, col)), shape=(n_row, n_col))
        if named or reindex:
            graph.names = names_row
            graph.names_row = names_row
            graph.names_col = names_col
    else:
        # Sources and targets share one index space: renumber them together, then split again.
        nodes = np.concatenate((row, col), axis=None)
        names, new_nodes = np.unique(nodes, return_inverse=True)
        if not reindex:
            n_nodes = names.max() + 1
        else:
            n_nodes = len(names)
        if named:
            row = new_nodes[:n_edges]
            col = new_nodes[n_edges:]
        else:
            # ``names`` is sorted and unique, so ids are exactly 0..n_nodes-1 iff both ends match.
            should_reindex = not (names[0] == 0 and names[-1] == n_nodes - 1)
            if should_reindex and reindex:
                reindexed = True
                row = new_nodes[:n_edges]
                col = new_nodes[n_edges:]
        if not weighted:
            data = np.ones(n_edges, dtype=bool)
        adjacency = sparse.csr_matrix((data, (row, col)), shape=(n_nodes, n_nodes))
        if not directed:
            adjacency = directed2undirected(adjacency, weighted=weighted)
        graph.adjacency = adjacency
        if named or reindexed:
            graph.names = names

    return graph
def karate_club(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Zachary's karate club graph.

    * Undirected graph
    * 34 nodes, 78 edges
    * 2 labels

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object holding the graph together with its metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Boolean adjacency matrix, or graph with metadata (labels, positions, name).

    Example
    -------
    >>> from sknetwork.data import karate_club
    >>> adjacency = karate_club()
    >>> adjacency.shape
    (34, 34)

    References
    ----------
    Zachary's karate club graph
    https://en.wikipedia.org/wiki/Zachary%27s_karate_club
    """
    # Each edge is listed once, as a (source, target) pair with source < target.
    sources = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2,
         3, 3, 3, 4, 4, 5, 5, 5, 6, 8, 8, 8, 9, 13,
         14, 14, 15, 15, 18, 18, 19, 20, 20, 22, 22,
         23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26, 27,
         28, 28, 29, 29, 30, 30, 31, 31, 32])
    targets = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31,
         2, 3, 7, 13, 17, 19, 21, 30,
         3, 7, 8, 9, 13, 27, 28, 32,
         7, 12, 13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33,
         32, 33, 32, 33, 32, 33, 33, 32, 33, 32, 33,
         25, 27, 29, 32, 33, 25, 27, 31, 31, 29, 33, 33,
         31, 33, 32, 33, 32, 33, 32, 33, 33])
    edge_flags = np.ones(sources.size, dtype=bool)
    upper = sparse.csr_matrix((edge_flags, (sources, targets)), shape=(34, 34))
    # Add the transpose so that every edge is stored in both directions.
    adjacency = sparse.csr_matrix(upper + upper.T, dtype=bool)

    if not metadata:
        return adjacency

    labels = np.array(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
         1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    x = np.array(
        [0.04, 0.24, 0.01, 0.13, 0.02, -0.08, 0.04, 0.21, 0.08, -0.11,
         -0.13, -0.28, 0.2, 0.08, 0.23, 0.06, -0.06, 0.32, 0.15, 0.19,
         0.27, 0.39, -0.04, -0.26, -0.51, -0.49, -0.19, -0.28, -0.11, -0.17,
         0.22, -0.21, 0.03, 0])
    y = np.array(
        [-0.33, -0.15, -0.01, -0.28, -0.64, -0.75, -0.76, -0.25, 0.09, 0.23,
         -0.62, -0.4, -0.53, -0.07, 0.55, 0.64, -1., -0.42, 0.6, -0.01,
         0.45, -0.34, 0.61, 0.41, 0.14, 0.28, 0.68, 0.21, 0.12, 0.54,
         0.19, 0.09, 0.38, 0.33])

    graph = Bunch()
    graph.adjacency = adjacency
    graph.labels = labels
    # One (x, y) row per node.
    graph.position = np.vstack((x, y)).T
    graph.name = 'karate_club'
    return graph
def miserables(metadata: bool = False) -> Union[sparse.csr_matrix, Bunch]:
    """Co-occurrence graph of the characters in the novel Les miserables by Victor Hugo.

    * Undirected graph
    * 77 nodes, 508 edges
    * Names of characters

    Parameters
    ----------
    metadata :
        If ``True``, return a `Bunch` object holding the graph together with its metadata.

    Returns
    -------
    adjacency or graph : Union[sparse.csr_matrix, Bunch]
        Weighted adjacency matrix, or graph with metadata (names, positions, name).

    Example
    -------
    >>> from sknetwork.data import miserables
    >>> adjacency = miserables()
    >>> adjacency.shape
    (77, 77)
    """
    # Each pair is listed once (source < target), grouped by source.
    sources = np.array(
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 3, 10,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
         12, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 21, 21, 22,
         23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24,
         25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
         26, 26, 26, 26, 26, 26, 26,
         27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
         28, 28, 29, 29, 29, 29, 29, 30,
         34, 34, 34, 34, 35, 35, 35, 36, 36, 37, 39, 39,
         41, 41, 41, 41, 41, 41, 41, 41, 41, 46, 47,
         48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
         49, 49, 49, 49, 49, 51, 51, 51, 51, 54,
         55, 55, 55, 55, 55, 55, 55, 55, 55,
         57, 57, 57, 57, 57, 57, 57, 57,
         58, 58, 58, 58, 58, 58, 58, 58, 58, 58,
         59, 59, 59, 59, 59, 59, 59,
         60, 60, 60, 60, 60, 60,
         61, 61, 61, 61, 61, 62, 62, 62, 62, 62,
         63, 63, 63, 63, 64, 64, 64, 65, 65, 66,
         68, 68, 68, 68, 69, 69, 69, 70, 70, 71, 73])
    targets = np.array(
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 3, 11, 11, 11,
         12, 13, 14, 15, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
         36, 37, 38, 43, 44, 48, 49, 51, 55, 58, 64, 68, 69, 70, 71, 72,
         23, 17, 18, 19, 20, 21, 22, 23, 26, 55,
         18, 19, 20, 21, 22, 23, 19, 20, 21, 22, 23, 20, 21, 22, 23, 21, 22, 23, 22, 23, 23,
         24, 25, 27, 29, 30, 31, 25, 26, 27, 41, 42, 50, 68, 69, 70,
         26, 27, 39, 40, 41, 42, 48, 55, 68, 69, 70, 71, 75,
         27, 43, 49, 51, 54, 55, 72,
         28, 29, 31, 33, 43, 48, 58, 68, 69, 70, 71, 72,
         44, 45, 34, 35, 36, 37, 38, 31,
         35, 36, 37, 38, 36, 37, 38, 37, 38, 38, 52, 55,
         42, 55, 57, 62, 68, 69, 70, 71, 75, 47, 48,
         55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 71, 73, 74, 75, 76,
         50, 51, 54, 55, 56, 52, 53, 54, 55, 55,
         56, 57, 58, 59, 61, 62, 63, 64, 65,
         58, 59, 61, 62, 63, 64, 65, 67,
         59, 60, 61, 62, 63, 64, 65, 66, 70, 76,
         60, 61, 62, 63, 64, 65, 66,
         61, 62, 63, 64, 65, 66,
         62, 63, 64, 65, 66, 63, 64, 65, 66, 76,
         64, 65, 66, 76, 65, 66, 76, 66, 76, 76,
         69, 70, 71, 75, 70, 71, 75, 71, 75, 75, 74])
    weights = np.array(
        [1, 8, 10, 1, 1, 1, 1, 2, 1, 5, 6, 3, 3, 1, 1, 1, 1, 1, 9, 7,
         12, 31, 17, 8, 2, 3, 1, 2, 3, 3, 2, 2, 2, 3, 1, 1, 2, 2, 19, 4,
         1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 3, 3, 3, 3, 1, 1, 4, 4, 3, 3,
         3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 5, 4, 4, 4, 4, 4, 2, 1, 5,
         1, 1, 2, 13, 4, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 3, 2, 1, 2,
         5, 6, 4, 1, 3, 1, 1, 3, 2, 1, 21, 2, 1, 1, 1, 1, 1, 1, 6, 1,
         2, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 1, 1, 2, 5, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 7, 6, 1,
         2, 7, 5, 5, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 9, 1, 12, 1, 1, 1,
         2, 6, 1, 1, 1, 7, 5, 1, 9, 1, 5, 2, 1, 2, 1, 2, 2, 1, 1, 3,
         15, 4, 6, 17, 4, 10, 5, 3, 1, 1, 2, 5, 13, 5, 9, 5, 1, 2, 3, 2,
         2, 2, 1, 6, 3, 6, 5, 1, 6, 12, 5, 2, 1, 4, 5, 1, 1, 7, 3, 1,
         2, 1, 1, 6, 4, 2, 3, 4, 2, 3, 2, 1, 1, 3])
    upper = sparse.csr_matrix((weights, (sources, targets)), shape=(77, 77))
    # Add the transpose so that every edge is stored in both directions.
    adjacency = upper + upper.T

    if not metadata:
        return adjacency

    names = ['Myriel', 'Napoleon', 'Mlle Baptistine', 'Mme Magloire', 'Countess de Lo', 'Geborand',
             'Champtercier', 'Cravatte', 'Count', 'Old man', 'Labarre', 'Valjean', 'Marguerite',
             'Mme Der', 'Isabeau', 'Gervais', 'Tholomyes', 'Listolier', 'Fameuil', 'Blacheville',
             'Favourite', 'Dahlia', 'Zephine', 'Fantine', 'Mme Thenardier', 'Thenardier', 'Cosette',
             'Javert', 'Fauchelevent', 'Bamatabois', 'Perpetue', 'Simplice', 'Scaufflaire', 'Woman1',
             'Judge', 'Champmathieu', 'Brevet', 'Chenildieu', 'Cochepaille', 'Pontmercy',
             'Boulatruelle', 'Eponine', 'Anzelma', 'Woman2', 'Mother Innocent', 'Gribier',
             'Jondrette', 'Mme Burgon', 'Gavroche', 'Gillenormand', 'Magnon', 'Mlle Gillenormand',
             'Mme Pontmercy', 'Mlle Vaubois', 'Lt Gillenormand', 'Marius', 'Baroness', 'Mabeuf',
             'Enjolras', 'Combeferre', 'Prouvaire', 'Feuilly', 'Courfeyrac', 'Bahorel', 'Bossuet',
             'Joly', 'Grantaire', 'MotherPlutarch', 'Gueulemer', 'Babet', 'Claquesous',
             'Montparnasse', 'Toussaint', 'Child1', 'Child2', 'Brujon', 'Mme Hucheloup']
    x = np.array(
        [0.53, 0.98, 0.41, 0.4, 1., 0.92, 0.84, 0.74, 0.78, 1.,
         0.51, 0.09, -0., 0.29, 0.37, 0.41, -0.35, -0.46, -0.42, -0.46,
         -0.41, -0.37, -0.36, -0.2, -0.06, -0.04, -0.01, -0.02, 0.33, 0.17,
         -0.29, -0.1, 0.58, 0.29, 0.29, 0.26, 0.29, 0.37, 0.35, 0.04,
         -0.01, -0.18, -0.09, 0.2, 0.51, 0.7, -0.95, -0.7, -0.37, -0.08,
         -0.18, -0.05, 0.04, -0.12, -0.06, -0.13, -0.24, -0.48, -0.25, -0.33,
         -0.43, -0.39, -0.33, -0.42, -0.31, -0.38, -0.48, -0.74, -0.08, -0.1,
         -0.02, -0.1, 0.14, -0.76, -0.75, -0.18, -0.58])
    y = np.array(
        [-0.23, -0.42, -0.14, -0.18, -0.31, -0.52, -0.6, -0.65, -0.38, -0.19,
         0.39, 0.03, 0.44, -0.44, 0.51, -0.36, 0.27, 0.37, 0.4, 0.32,
         0.32, 0.36, 0.4, 0.2, 0.07, 0.14, -0.05, 0.06, 0.06, 0.24,
         -0.26, -0.1, 0.24, -0.04, 0.17, 0.23, 0.31, 0.21, 0.27, -0.36,
         0.69, 0.11, 0.38, -0.09, 0.05, 0.12, 0.82, 0.44, 0.06, -0.2,
         -0.4, -0.28, -0.68, -0.79, -0.4, -0.07, -0.51, -0.17, -0.03, -0.09,
         -0.14, -0.04, -0.04, -0.07, -0.06, -0.11, -0.06, -0.35, 0.24, 0.19,
         0.22, 0.29, -0.2, 0.06, 0.14, 0.3, -0.1])

    graph = Bunch()
    graph.adjacency = adjacency
    graph.names = np.array(names)
    # One (x, y) row per node.
    graph.position = np.vstack((x, y)).T
    graph.name = 'miserables'
    return graph