Example #1
def load_konect(dataset: str, data_home: Optional[Union[str, Path]] = None, auto_numpy_bundle: bool = True,
                verbose: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.cc/networks/>`_.

    Parameters
    ----------
    dataset : str
        The internal name of the dataset as specified on the Konect website (e.g. for the Zachary Karate club dataset,
        the corresponding name is ``'ucidata-zachary'``).
    data_home : str or :class:`pathlib.Path`, optional
        The folder to be used for dataset storage.
    auto_numpy_bundle : bool
        Whether to save the dataset as Numpy files for faster subsequent access (True) or to keep it in its
        default format only (False).
    verbose : bool
        Enable verbosity.

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Notes
    -----
    The returned object has a `meta` attribute storing information about the dataset, when provided by Konect.
    In any case, `meta` has a `name` attribute which, if not given by Konect, defaults to the name of the
    dataset as passed to this function.

    References
    ----------
    Kunegis, J. (2013, May).
    `Konect: the Koblenz network collection.
    <https://dl.acm.org/doi/abs/10.1145/2487788.2488173>`_
    In Proceedings of the 22nd International Conference on World Wide Web (pp. 1343-1350).
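
    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)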
    """
    logger = Log(verbose)
    if dataset == '':
        raise ValueError("Please specify the dataset. "
                         + "\nExamples include 'actor-movie' and 'ego-facebook'."
                         + "\nSee 'http://konect.cc/networks/' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        logger.print('Downloading', dataset, 'from Konect...')
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve('http://konect.cc/files/download.tsv.' + dataset + '.tar.bz2',
                        data_home / (dataset + '.tar.bz2'))
            with tarfile.open(data_home / (dataset + '.tar.bz2'), 'r:bz2') as tar_ref:
                logger.print('Unpacking archive...')
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError('Invalid dataset ' + dataset + '.'
                             + "\nExamples include 'actor-movie' and 'ego-facebook'."
                             + "\nSee 'http://konect.cc/networks/' for the full list.")
        except (URLError, ConnectionResetError):  # pragma: no cover
            rmdir(data_path)
            raise RuntimeError("Could not reach Konect.")
        finally:
            if exists(data_home / (dataset + '.tar.bz2')):
                remove(data_home / (dataset + '.tar.bz2'))
    elif exists(data_path / (dataset + '_bundle')):
        logger.print('Loading from local bundle...')
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]
    logger.print('Parsing files...')
    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = load_header(data_path / file)
        graph = load_edge_list(data_path / file, directed=directed, bipartite=bipartite, weighted_input=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = load_metadata(data_path / file)

    # each ent.* file becomes an attribute of the Bunch, named after the file extension
    attributes = [file for file in files if 'ent.' + dataset in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = load_labels(data_path / file)

    # guarantee that meta exists and that meta.name is always defined
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
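
A minimal usage sketch for the variant above, assuming scikit-network is installed. The dataset name 'dolphins' is taken from the doctest in Example #2; the custom `data_home` path is hypothetical:

from pathlib import Path
from sknetwork.data import load_konect

# Download the dolphins graph (or load the cached Numpy bundle on later calls),
# storing the files under a custom folder instead of the default data home.
graph = load_konect('dolphins', data_home=Path('/tmp/konect'), verbose=False)
print(graph.adjacency.shape)  # (62, 62), per the doctest in Example #2
print(graph.meta.name)        # falls back to 'dolphins' if Konect gives no name
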
Example #2
def load_konect(dataset: str,
                data_home: Optional[Union[str, Path]] = None,
                auto_numpy_bundle: bool = True) -> Bunch:
    """Load a dataset from the `Konect database
    <http://konect.uni-koblenz.de>`_.

    Parameters
    ----------
    dataset : str
        The name of the dataset as specified in the download link (e.g. for the Actor movies dataset, the corresponding
        name is ``'actor-movie'``).
    data_home : str or :class:`pathlib.Path`, optional
        The folder to be used for dataset storage.
    auto_numpy_bundle : bool
        Whether to save the dataset as Numpy files for faster subsequent access (True) or to keep it in its
        default format only (False).

    Returns
    -------
    graph : :class:`Bunch`
        An object with the following attributes:

             * `adjacency` or `biadjacency`: the adjacency/biadjacency matrix for the dataset
             * `meta`: a dictionary containing the metadata as specified by Konect
             * each attribute specified by Konect (ent.* file)

    Example
    -------
    >>> from sknetwork.data import load_konect
    >>> graph = load_konect('dolphins')
    >>> graph.adjacency.shape
    (62, 62)

    Notes
    -----
    The returned object has a `meta` attribute storing information about the dataset, when provided by Konect.
    In any case, `meta` has a `name` attribute which, if not given by Konect, defaults to the name of the
    dataset as passed to this function.
    """
    if dataset == '':
        raise ValueError(
            "Please specify the dataset. " +
            "\nExamples include 'actor-movie' and 'ego-facebook'." +
            "\nSee 'http://konect.uni-koblenz.de' for the full list.")
    data_home = get_data_home(data_home)
    data_path = data_home / dataset
    if not data_path.exists():
        makedirs(data_path, exist_ok=True)
        try:
            urlretrieve(
                'http://konect.uni-koblenz.de/downloads/tsv/' + dataset +
                '.tar.bz2', data_home / (dataset + '.tar.bz2'))
            with tarfile.open(data_home / (dataset + '.tar.bz2'),
                              'r:bz2') as tar_ref:
                tar_ref.extractall(data_home)
        except (HTTPError, tarfile.ReadError):
            rmdir(data_path)
            raise ValueError(
                'Invalid dataset ' + dataset + '.' +
                "\nExamples include 'actor-movie' and 'ego-facebook'." +
                "\nSee 'http://konect.uni-koblenz.de' for the full list.")
        finally:
            # guard against the archive missing when the download itself failed
            if exists(data_home / (dataset + '.tar.bz2')):
                remove(data_home / (dataset + '.tar.bz2'))
    elif exists(data_path / (dataset + '_bundle')):
        return load_from_numpy_bundle(dataset + '_bundle', data_path)

    data = Bunch()

    files = [file for file in listdir(data_path) if dataset in file]

    matrix = [file for file in files if 'out.' in file]
    if matrix:
        file = matrix[0]
        directed, bipartite, weighted = load_header(data_path / file)
        graph = load_edge_list(data_path / file,
                               directed=directed,
                               bipartite=bipartite,
                               weighted=weighted)
        if bipartite:
            data.biadjacency = graph.biadjacency
        else:
            data.adjacency = graph.adjacency

    metadata = [file for file in files if 'meta.' in file]
    if metadata:
        file = metadata[0]
        data.meta = load_metadata(data_path / file)

    # each ent.* file becomes an attribute of the Bunch, named after the file extension
    attributes = [file for file in files if 'ent.' + dataset in file]
    for file in attributes:
        attribute_name = file.split('.')[-1]
        data[attribute_name] = load_labels(data_path / file)

    # guarantee that meta exists and that meta.name is always defined
    if not hasattr(data, 'meta'):
        data.meta = Bunch()
    if not hasattr(data.meta, 'name'):
        data.meta.name = dataset

    if auto_numpy_bundle:
        save_to_numpy_bundle(data, dataset + '_bundle', data_path)

    return data
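
Both variants expose the parsed files through the returned `Bunch`. A short sketch of consuming that structure, assuming the 'actor-movie' dataset named in the error messages above; the `hasattr` check hedges over whether the dataset is bipartite:

from sknetwork.data import load_konect

graph = load_konect('actor-movie')
# Bipartite datasets expose `biadjacency`; all others expose `adjacency`.
matrix = graph.biadjacency if hasattr(graph, 'biadjacency') else graph.adjacency
print(matrix.shape)
# Metadata parsed from the meta.* file; `meta.name` is always set.
print(graph.meta.name)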