Example #1
def test_bipartite_edge_list(self):
    # Write a three-edge bipartite list; the '%stub' line is a comment.
    self.stub_data_9 = 'stub_9.txt'
    with open(self.stub_data_9, "w") as text_file:
        text_file.write('%stub\n1 3\n4 5\n0 3')
    graph = parse.load_edge_list(self.stub_data_9, bipartite=True)
    biadjacency = graph.biadjacency
    # Row ids {0, 1, 4} and column ids {3, 5} are reindexed contiguously.
    self.assertTrue((biadjacency.indices == [0, 0, 1]).all())
    self.assertTrue((biadjacency.indptr == [0, 1, 2, 3]).all())
    self.assertTrue((biadjacency.data == [1, 1, 1]).all())
    remove(self.stub_data_9)
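
Outside the test harness, the same behavior can be reproduced directly. A minimal standalone sketch (the file name is arbitrary; `parse.load_edge_list` is the function exercised by the test, from `sknetwork.data`):

from os import remove

from sknetwork.data import parse

# Three bipartite edges; the '%stub' line is a comment.
with open('stub.txt', 'w') as text_file:
    text_file.write('%stub\n1 3\n4 5\n0 3')

graph = parse.load_edge_list('stub.txt', bipartite=True)
# Row ids {0, 1, 4} map to {0, 1, 2} and column ids {3, 5} to {0, 1},
# which yields exactly the CSR arrays asserted above.
print(graph.biadjacency.indices)  # [0 0 1]
print(graph.biadjacency.indptr)   # [0 1 2 3]
remove('stub.txt')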
Example #2
def test_unlabeled_unweighted(self):
    # Unweighted edges get weight 1; the graph is undirected by default,
    # so each edge appears once in each endpoint's row.
    self.stub_data_1 = 'stub_1.txt'
    with open(self.stub_data_1, "w") as text_file:
        text_file.write('%stub\n1 3\n4 5\n0 2')
    graph = parse.load_edge_list(self.stub_data_1)
    adjacency = graph.adjacency
    self.assertTrue((adjacency.indices == [2, 3, 0, 1, 5, 4]).all())
    self.assertTrue((adjacency.indptr == [0, 1, 2, 3, 4, 5, 6]).all())
    self.assertTrue((adjacency.data == [1, 1, 1, 1, 1, 1]).all())
    remove(self.stub_data_1)
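
The asserted arrays encode a symmetric matrix: each undirected edge contributes one entry per endpoint. A quick self-contained check, rebuilding the expected CSR matrix with SciPy:

import numpy as np
from scipy.sparse import csr_matrix

indices = np.array([2, 3, 0, 1, 5, 4])
indptr = np.array([0, 1, 2, 3, 4, 5, 6])
data = np.ones(6)
adjacency = csr_matrix((data, indices, indptr), shape=(6, 6))
# Symmetric: edge 1-3 shows up as both (1, 3) and (3, 1).
assert (adjacency != adjacency.T).nnz == 0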
Example #3
def test_auto_reindex(self):
    # Sparse node ids are remapped to the contiguous range 0..5;
    # the original ids are preserved in `names`.
    self.stub_data_4 = 'stub_4.txt'
    with open(self.stub_data_4, "w") as text_file:
        text_file.write('%stub\n14 31\n42 50\n0 12')
    graph = parse.load_edge_list(self.stub_data_4)
    adjacency = graph.adjacency
    names = graph.names
    self.assertTrue((adjacency.indices == [1, 0, 3, 2, 5, 4]).all())
    self.assertTrue((adjacency.indptr == [0, 1, 2, 3, 4, 5, 6]).all())
    self.assertTrue((adjacency.data == [1, 1, 1, 1, 1, 1]).all())
    self.assertTrue((names == [0, 12, 14, 31, 42, 50]).all())
    remove(self.stub_data_4)
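
`names` is the inverse of the reindexing, so the original ids can be recovered from the compact ones. A small illustration using the values asserted above:

import numpy as np

names = np.array([0, 12, 14, 31, 42, 50])
# The reindexed edge (2, 3) corresponds to the first line of the stub file.
print(names[2], names[3])  # 14 31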
Example #4
def test_labeled_weighted(self):
    # Comma-separated lines: source, target, weight. Fields keep their
    # leading whitespace, hence names like ' b' below.
    self.stub_data_2 = 'stub_2.txt'
    with open(self.stub_data_2, "w") as text_file:
        text_file.write('%stub\nf, e, 5\na, d, 6\nc, b, 1')
    graph = parse.load_edge_list(self.stub_data_2)
    adjacency = graph.adjacency
    names = graph.names
    self.assertTrue((adjacency.indices == [4, 3, 5, 1, 0, 2]).all())
    self.assertTrue((adjacency.indptr == [0, 1, 2, 3, 4, 5, 6]).all())
    self.assertTrue((adjacency.data == [1, 6, 5, 6, 1, 5]).all())
    self.assertTrue((names == [' b', ' d', ' e', 'a', 'c', 'f']).all())
    remove(self.stub_data_2)
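
The leading spaces in the expected names come from splitting each line on ',' without stripping whitespace. A sketch of that effect (not the library's actual parsing code):

line = 'f, e, 5'
source, target, weight = line.split(',')
print(repr(target))  # ' e' -- the leading space survives, hence names like ' b'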
Example #5
def load_konect(dataset: str, data_home: Optional[Union[str, Path]] = None, auto_numpy_bundle: bool = True,
                verbose: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.cc/networks/>`_.

    Parameters
    ----------
    dataset : str
        The internal name of the dataset as specified on the Konect website (e.g. for the Zachary Karate club dataset,
        the corresponding name is ``'ucidata-zachary'``).
    data_home : str or :class:`pathlib.Path`, optional
        The folder used for dataset storage.
    auto_numpy_bundle : bool
        Whether to store the dataset as Numpy files for faster subsequent access (True) or keep its
        default format (False).
    verbose : bool
        Enable verbosity.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Notes
    -----
    The returned `Bunch` stores information about the dataset, when available, in its `meta` attribute. In any
    case, `meta` has a `name` attribute which, if not provided by Konect, defaults to the dataset name passed to
    this function.

    References
    ----------
    Kunegis, J. (2013, May).
    `Konect: the Koblenz network collection.
    <https://dl.acm.org/doi/abs/10.1145/2487788.2488173>`_
    In Proceedings of the 22nd International Conference on World Wide Web (pp. 1343-1350).
    """
    logger = Log(verbose)
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         + "\nExamples include 'actor-movie' and 'ego-facebook'."
                         + "\n See 'http://konect.cc/networks/' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        logger.print('Downloading', dataset, 'from Konect...')
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.cc/files/download.tsv.' + dataset + '.tar.bz2',
                        data_home / (dataset + '.tar.bz2'))
            with tarfile.open(data_home / (dataset + '.tar.bz2'), 'r:bz2') as tar_ref:
                logger.print('Unpacking archive...')
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nExamples include 'actor-movie' and 'ego-facebook'."
                             + "\n See 'http://konect.cc/networks/' for the full list.")
        except (URLError, ConnectionResetError):  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Konect.")
        finally:
            if exists(data_home / (dataset + '.tar.bz2')):
                remove(data_home / (dataset + '.tar.bz2'))
    elif exists(data_path / (dataset + '_bundle')):
        logger.print('Loading from local bundle...')
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]
    logger.print('Parsing files...')
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = load_header(data_path / file)
        if bipartite:
            graph = load_edge_list(data_path / file, directed=directed, bipartite=bipartite, weighted_input=weighted)
            data.biadjacency = graph.biadjacency
        else:
            graph = load_edge_list(data_path / file, directed=directed, bipartite=bipartite, weighted_input=weighted)
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = load_metadata(data_path / file)

    attributes = [file for file in files if 'ent.' + dataset in file]
    if attributes:
        for file in attributes:
            attribute_name = file.split('.')[-1]
            data[attribute_name] = load_labels(data_path / file)

    # Guarantee that `meta.name` is set, defaulting to the dataset name.
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
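
A minimal usage sketch for this version, using the dataset named in the docstring (the first call needs network access; the expected shape assumes the Zachary karate club's 34 nodes):

from sknetwork.data import load_konect

graph = load_konect('ucidata-zachary')
print(graph.adjacency.shape)  # (34, 34)
print(graph.meta.name)        # from Konect's meta file, else 'ucidata-zachary'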
Example #6
def load_konect(dataset: str,
                data_home: Optional[Union[str, Path]] = None,
                auto_numpy_bundle: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home : str or :class:`pathlib.Path`, optional
        The folder used for dataset storage.
    auto_numpy_bundle : bool
        Whether to store the dataset as Numpy files for faster subsequent access (True) or keep its
        default format (False).

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)

    Notes
    -----
    The returned `Bunch` stores information about the dataset, when available, in its `meta` attribute. In any
    case, `meta` has a `name` attribute which, if not provided by Konect, defaults to the dataset name passed to
    this function.
    """
    if dataset == '':
        raise ValueError(
            "Please specify the dataset. " +
            "\nExamples include 'actor-movie' and 'ego-facebook'." +
            "\n See 'http://konect.uni-koblenz.de' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve(
                'http://konect.uni-koblenz.de/downloads/tsv/' + dataset +
                '.tar.bz2', data_home / (dataset + '.tar.bz2'))
            with tarfile.open(data_home / (dataset + '.tar.bz2'),
                              'r:bz2') as tar_ref:
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError(
                'Invalid dataset ' + dataset + '.' +
                "\nExamples include 'actor-movie' and 'ego-facebook'." +
                "\n See 'http://konect.uni-koblenz.de' for the full list.")
        finally:
            # Guard the cleanup: the archive does not exist if the download
            # itself failed, and an unguarded remove() would mask the error.
            if exists(data_home / (dataset + '.tar.bz2')):
                remove(data_home / (dataset + '.tar.bz2'))
    elif exists(data_path / (dataset + '_bundle')):
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]

    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = load_header(data_path / file)
        if bipartite:
            graph = load_edge_list(data_path / file,
                                   directed=directed,
                                   bipartite=bipartite,
                                   weighted=weighted)
            data.biadjacency = graph.biadjacency
        else:
            graph = load_edge_list(data_path / file,
                                   directed=directed,
                                   bipartite=bipartite,
                                   weighted=weighted)
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = load_metadata(data_path / file)

    attributes = [file for file in files if 'ent.' + dataset in file]
    if attributes:
        for file in attributes:
            attribute_name = file.split('.')[-1]
            data[attribute_name] = load_labels(data_path / file)

    # Guarantee that `meta.name` is set, defaulting to the dataset name.
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
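
Since `auto_numpy_bundle` defaults to True, a second call with the same dataset name takes the `elif` branch above and loads the saved `'<dataset>_bundle'` instead of re-downloading. A sketch using the docstring's example dataset:

from sknetwork.data import load_konect

graph = load_konect('dolphins')  # first call: download, parse, save the bundle
graph = load_konect('dolphins')  # second call: served from 'dolphins_bundle'
print(graph.adjacency.shape)     # (62, 62), as in the docstring example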