Example #1
def ds_traverse(rootds, parent=None, json=None,
                recurse_datasets=False, recurse_directories=False,
                long_=False):
    """Hierarchical dataset traverser

    Parameters
    ----------
    rootds: Dataset
      Root dataset to be traversed
    parent: Dataset
      Parent dataset of the current rootds
    recurse_datasets: bool
      Recurse into subdatasets of the root dataset
    recurse_directories: bool
      Recurse into subdirectories of the current dataset
      In both of the above cases, if False, they are not explicitly
      recursed into; their data is instead loaded from their meta-data files

    Returns
    -------
    dict
      extracts and returns a (recursive) dictionary of dataset info at path
    """
    # extract parent info to pass to traverser
    fsparent = fs_extract(parent.path, parent.repo, basepath=rootds.path) \
        if parent else None

    # (recursively) traverse file tree of current dataset
    fs = fs_traverse(
        rootds.path, rootds.repo,
        subdatasets=list(rootds.subdatasets(result_xfm='relpaths')),
        render=False,
        parent=fsparent,
        # XXX note that here I kinda flipped the notions!
        recurse_datasets=recurse_datasets,
        recurse_directories=recurse_directories,
        json=json
    )

    # BUT if recurse_datasets is True while recurse_directories is False,
    #     those subdatasets still need to be handled somehow, since
    #     otherwise we might never even get to them

    fs['nodes'][0]['size'] = fs['size']  # propagate this node's updated size into its entry in the nodes sublist too

    # add dataset specific entries to its dict
    rootds_model = GitModel(rootds.repo)
    fs['tags'] = rootds_model.describe
    fs['branch'] = rootds_model.branch
    index_file = opj(rootds.path, '.git', 'index')
    fs['index-mtime'] = time.strftime(
        u"%Y-%m-%d %H:%M:%S",
        time.localtime(getmtime(index_file))) if exists(index_file) else ''

    # render current dataset
    lgr.info('Dataset: %s' % rootds.path)
    fs_render(fs, json=json, ds_path=rootds.path)
    return fs
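
A minimal usage sketch for the traverser above; the Dataset import path and the json='file' mode are assumptions (both have varied across DataLad versions), so treat this as illustrative rather than canonical:

from datalad.distribution.dataset import Dataset  # import path may differ by DataLad version

ds = Dataset('/tmp/demo-ds')        # hypothetical, already-installed dataset
fs = ds_traverse(ds,
                 json='file',        # assumed mode: write per-directory metadata JSON files
                 recurse_datasets=True,
                 recurse_directories=True)
print(fs['branch'], fs['size'])      # top-level info collected during traversal
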
Example #2
def _traverse_handle_subds(
        subds_rpath, rootds,
        recurse_datasets, recurse_directories, json):
    """A helper to deal with the subdataset node - recurse or just pick up
    may be alrady collected in it web meta
    """
    subds_path = opj(rootds.path, subds_rpath)
    subds = Dataset(subds_path)
    subds_json = metadata_locator(path='.', ds_path=subds_path)

    def handle_not_installed():
        # for now just traverse as fs
        lgr.warning("%s is either not installed or lacks meta-data", subds)
        subfs = fs_extract(subds_path, rootds, basepath=rootds.path)
        # but add a custom type that it is a not installed subds
        subfs['type'] = 'uninitialized'
        # we need to kick it out from 'children'
        # TODO:  this is inefficient and cruel -- "ignored" should be made
        # smarter so it ignores submodules of the repo
        #if fs['nodes']:
        #    fs['nodes'] = [c for c in fs['nodes'] if c['path'] != subds_rpath]
        return subfs

    if not subds.is_installed():
        subfs = handle_not_installed()
    elif recurse_datasets:
        subfs = ds_traverse(subds,
                            json=json,
                            recurse_datasets=recurse_datasets,
                            recurse_directories=recurse_directories,
                            parent=rootds)
        subfs.pop('nodes', None)
        #size_list.append(subfs['size'])
    # else just pick up the data from the metadata file of each subdataset
    else:
        subfs = None
        lgr.info(subds.path)
        if exists(subds_json):
            with open(subds_json) as data_file:
                subfs = js.load(data_file)
                subfs.pop('nodes', None)  # remove children
                subfs['path'] = subds_rpath  # reassign the path
                #size_list.append(subfs['size'])
        else:
            # the same drill as if not installed
            lgr.warning("%s is installed but no meta-data yet", subds)
            subfs = handle_not_installed()
    # add URL field

    return subfs
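
The non-recursive branch above follows a small reusable pattern: prefer a previously written JSON metadata file and fall back to fresh extraction only when it is absent. A stand-alone sketch of that pattern using only the standard library (the function and argument names are illustrative, not DataLad API):

import json
from os.path import exists

def load_node_meta(meta_file, rpath, fallback_extract):
    """Return node metadata from meta_file if present, else via fallback_extract()."""
    if exists(meta_file):
        with open(meta_file) as f:
            meta = json.load(f)
        meta.pop('nodes', None)   # drop children, keep only the node itself
        meta['path'] = rpath      # re-anchor the path relative to the parent
        return meta
    return fallback_extract()     # same drill as for a not-installed subdataset
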
Example #3
def ds_traverse(rootds,
                parent=None,
                json=None,
                recurse_datasets=False,
                recurse_directories=False,
                long_=False):
    """Hierarchical dataset traverser

    Parameters
    ----------
    rootds: Dataset
      Root dataset to be traversed
    parent: Dataset
      Parent dataset of the current rootds
    recurse_datasets: bool
      Recurse into subdatasets of the root dataset
    recurse_directories: bool
      Recurse into subdirectories of the current dataset
      In both of the above cases, if False, they are not explicitly
      recursed into; their data is instead loaded from their meta-data files

    Returns
    -------
    dict
      extracts and returns a (recursive) dictionary of dataset info at path
    """
    # extract parent info to pass to traverser
    fsparent = fs_extract(parent.path, parent.repo, basepath=rootds.path) \
        if parent else None

    # (recursively) traverse file tree of current dataset
    fs = fs_traverse(
        rootds.path,
        rootds.repo,
        subdatasets=list(rootds.subdatasets(result_xfm='relpaths')),
        render=False,
        parent=fsparent,
        # XXX note that here I kinda flipped the notions!
        recurse_datasets=recurse_datasets,
        recurse_directories=recurse_directories,
        json=json)

    # BUT if recurse_datasets is True while recurse_directories is False,
    #     those subdatasets still need to be handled somehow, since
    #     otherwise we might never even get to them

    fs['nodes'][0]['size'] = fs['size']  # propagate this node's updated size into its entry in the nodes sublist too

    # add dataset specific entries to its dict
    rootds_model = GitModel(rootds.repo)
    fs['tags'] = rootds_model.describe
    fs['branch'] = rootds_model.branch
    index_file = opj(rootds.path, '.git', 'index')
    fs['index-mtime'] = time.strftime(
        u"%Y-%m-%d %H:%M:%S", time.localtime(
            getmtime(index_file))) if exists(index_file) else ''

    # render current dataset
    lgr.info('Dataset: %s' % rootds.path)
    fs_render(fs, json=json, ds_path=rootds.path)
    return fs
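
The fs['index-mtime'] entry computed above is plain standard-library work; pulled out on its own it would look roughly like this (a self-contained sketch, not DataLad code):

import time
from os.path import exists, getmtime, join as opj

def index_mtime(ds_path):
    """Return the mtime of .git/index formatted as above, or '' if absent."""
    index_file = opj(ds_path, '.git', 'index')
    if not exists(index_file):
        return ''
    return time.strftime(u"%Y-%m-%d %H:%M:%S",
                         time.localtime(getmtime(index_file)))
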
Example #4
def fs_traverse(path,
                repo,
                parent=None,
                subdatasets=None,
                render=True,
                recurse_datasets=False,
                recurse_directories=False,
                json=None,
                basepath=None):
    """Traverse path through its nodes and returns a dictionary of relevant
    attributes attached to each node

    Parameters
    ----------
    path: str
      Path to the directory to be traversed
    repo: AnnexRepo or GitRepo
      Repo object the directory belongs to
    parent: dict
      Extracted info about parent directory
    recurse_directories: bool
      Recurse into subdirectories (note that subdatasets are not traversed)
    render: bool
       Whether to render from within the function. Set to False if the results
       are to be manipulated before the final render

    Returns
    -------
    dict
      extracts and returns a (recursive) dictionary of directory info at path;
      does not traverse into annex, git or hidden directories
    """
    subdatasets = subdatasets or []
    fs = fs_extract(path, repo, basepath=basepath or path)
    dataset = Dataset(repo.path)
    submodules = {sm.path: sm for sm in repo.get_submodules()}
    # TODO:  some submodules might not even have a local empty directory
    # (git doesn't care about those), so our relying on listdir here and
    # in _traverse_handle_subds might not work out.
    # E.g. create-sibling --ui true ... --existing=reconfigure
    #  causes removal of those empty ones on the remote end
    if isdir(path):  # if node is a directory
        children = [fs.copy()]  # store its info in its children dict too  (Yarik is not sure why, but I guess for .?)
        # ATM seems some pieces still rely on having this duplication, so left as is
        # TODO: strip away
        for node in listdir(path):
            nodepath = opj(path, node)

            # The node might contain subdatasets, so we should analyze them and
            # prepare entries to pass down (in theory we could maybe just pass full paths and strip)
            node_subdatasets = []
            is_subdataset = False
            if isdir(nodepath):
                node_sep = with_pathsep(node)
                for subds in subdatasets:
                    if subds == node:
                        # it is the subdataset
                        is_subdataset = True
                    else:
                        # use path_is_subdir
                        if subds.startswith(node_sep):
                            node_subdatasets += [subds[len(node_sep):]]

            # TODO:  it might be a subdir which is a non-initialized submodule!
            # if not ignored, append child node info to current nodes dictionary
            if is_subdataset:
                # repo.path is real, so we are doomed (for now at least)
                # to resolve nodepath as well to get relpath for it
                node_relpath = relpath(realpath(nodepath), repo.path)
                subds = _traverse_handle_subds(
                    node_relpath,
                    dataset,
                    recurse_datasets=recurse_datasets,
                    recurse_directories=recurse_directories,
                    json=json)
                # Enhance it with external url if available
                submod_url = submodules[node_relpath].url
                if submod_url and is_datalad_compat_ri(submod_url):
                    subds['url'] = submod_url
                children.append(subds)
            elif not ignored(nodepath):
                # if recursing, create an info dictionary for each child node too
                if recurse_directories:
                    subdir = fs_traverse(
                        nodepath,
                        repo,
                        subdatasets=node_subdatasets,
                        parent=None,  # children[0],
                        recurse_datasets=recurse_datasets,
                        recurse_directories=recurse_directories,
                        json=json,
                        basepath=basepath or path)
                    subdir.pop('nodes', None)
                else:
                    # read child metadata from its metadata file if it exists
                    subdir_json = metadata_locator(path=node,
                                                   ds_path=basepath or path)
                    if exists(subdir_json):
                        with open(subdir_json) as data_file:
                            subdir = js.load(data_file)
                            subdir.pop('nodes', None)
                    # else extract whatever information you can about the child
                    else:
                        # Yarik: this one is way too lean...
                        subdir = fs_extract(nodepath,
                                            repo,
                                            basepath=basepath or path)
                # append child metadata to list
                children.extend([subdir])

        # sum sizes of all 1st level children
        children_size = {}
        for node in children[1:]:
            for size_type, child_size in node['size'].items():
                children_size[size_type] = children_size.get(
                    size_type, 0) + machinesize(child_size)

        # update current node sizes to the humanized aggregate children size
        fs['size'] = children[0]['size'] = \
            {size_type: humanize.naturalsize(child_size)
             for size_type, child_size in children_size.items()}

        children[0]['name'] = '.'  # replace current node name with '.' to emulate unix syntax
        if parent:
            parent['name'] = '..'  # replace parent node name with '..' to emulate unix syntax
            children.insert(1, parent)  # insert parent info after current node info in children dict

        fs['nodes'] = children  # add children info to main fs dictionary
        if render:  # render directory node at location(path)
            fs_render(fs, json=json, ds_path=basepath or path)
            lgr.info('Directory: %s' % path)

    return fs
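
The size roll-up near the end of fs_traverse sums each size type over the first-level children (machinesize() apparently converts the humanized child sizes back into numbers) and then humanizes the totals again. A simplified stand-alone sketch of the same idea, operating directly on byte counts; humanize.naturalsize() is the function from the humanize package used above:

import humanize

def aggregate_sizes(children):
    """Sum each size type over child nodes (sizes in bytes) and humanize the totals."""
    totals = {}
    for node in children:
        for size_type, nbytes in node['size'].items():
            totals[size_type] = totals.get(size_type, 0) + nbytes
    return {size_type: humanize.naturalsize(total)
            for size_type, total in totals.items()}

# aggregate_sizes([{'size': {'total': 2048}}, {'size': {'total': 1024}}])
# -> {'total': '3.1 kB'}
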
Example #5
def fs_traverse(path, repo, parent=None,
                subdatasets=None,
                render=True,
                recurse_datasets=False,
                recurse_directories=False,
                json=None, basepath=None):
    """Traverse path through its nodes and returns a dictionary of relevant
    attributes attached to each node

    Parameters
    ----------
    path: str
      Path to the directory to be traversed
    repo: AnnexRepo or GitRepo
      Repo object the directory belongs to
    parent: dict
      Extracted info about parent directory
    recurse_directories: bool
      Recurse into subdirectories (note that subdatasets are not traversed)
    render: bool
       Whether to render from within the function. Set to False if the results
       are to be manipulated before the final render

    Returns
    -------
    dict
      extracts and returns a (recursive) dictionary of directory info at path;
      does not traverse into annex, git or hidden directories
    """
    subdatasets = subdatasets or []
    fs = fs_extract(path, repo, basepath=basepath or path)
    dataset = Dataset(repo.path)
    submodules = {sm.path: sm
                  for sm in repo.get_submodules()}
    # TODO:  some submodules might not even have a local empty directory
    # (git doesn't care about those), so our relying on listdir here and
    # in _traverse_handle_subds might not work out.
    # E.g. create-sibling --ui true ... --existing=reconfigure
    #  causes removal of those empty ones on the remote end
    if isdir(path):                     # if node is a directory
        children = [fs.copy()]          # store its info in its children dict too  (Yarik is not sure why, but I guess for .?)
        # ATM seems some pieces still rely on having this duplication, so left as is
        # TODO: strip away
        for node in listdir(path):
            nodepath = opj(path, node)

            # The node might contain subdatasets, so we should analyze them and
            # prepare entries to pass down (in theory we could maybe just pass full paths and strip)
            node_subdatasets = []
            is_subdataset = False
            if isdir(nodepath):
                node_sep = with_pathsep(node)
                for subds in subdatasets:
                    if subds == node:
                        # it is the subdataset
                        is_subdataset = True
                    else:
                        # use path_is_subdir
                        if subds.startswith(node_sep):
                            node_subdatasets += [subds[len(node_sep):]]

            # TODO:  it might be a subdir which is a non-initialized submodule!
            # if not ignored, append child node info to current nodes dictionary
            if is_subdataset:
                # repo.path is real, so we are doomed (for now at least)
                # to resolve nodepath as well to get relpath for it
                node_relpath = relpath(realpath(nodepath), repo.path)
                subds = _traverse_handle_subds(
                    node_relpath,
                    dataset,
                    recurse_datasets=recurse_datasets,
                    recurse_directories=recurse_directories,
                    json=json
                )
                # Enhance it with external url if available
                submod_url = submodules[node_relpath].url
                if submod_url and is_datalad_compat_ri(submod_url):
                    subds['url'] = submod_url
                children.append(subds)
            elif not ignored(nodepath):
                # if recursing, create an info dictionary for each child node too
                if recurse_directories:
                    subdir = fs_traverse(nodepath,
                                         repo,
                                         subdatasets=node_subdatasets,
                                         parent=None,  # children[0],
                                         recurse_datasets=recurse_datasets,
                                         recurse_directories=recurse_directories,
                                         json=json,
                                         basepath=basepath or path)
                    subdir.pop('nodes', None)
                else:
                    # read child metadata from its metadata file if it exists
                    subdir_json = metadata_locator(path=node, ds_path=basepath or path)
                    if exists(subdir_json):
                        with open(subdir_json) as data_file:
                            subdir = js.load(data_file)
                            subdir.pop('nodes', None)
                    # else extract whatever information you can about the child
                    else:
                        # Yarik: this one is way too lean...
                        subdir = fs_extract(nodepath,
                                            repo,
                                            basepath=basepath or path)
                # append child metadata to list
                children.extend([subdir])

        # sum sizes of all 1st level children
        children_size = {}
        for node in children[1:]:
            for size_type, child_size in node['size'].items():
                children_size[size_type] = children_size.get(size_type, 0) + machinesize(child_size)

        # update current node sizes to the humanized aggregate children size
        fs['size'] = children[0]['size'] = \
            {size_type: humanize.naturalsize(child_size)
             for size_type, child_size in children_size.items()}

        children[0]['name'] = '.'       # replace current node name with '.' to emulate unix syntax
        if parent:
            parent['name'] = '..'       # replace parent node name with '..' to emulate unix syntax
            children.insert(1, parent)  # insert parent info after current node info in children dict

        fs['nodes'] = children          # add children info to main fs dictionary
        if render:                      # render directory node at location(path)
            fs_render(fs, json=json, ds_path=basepath or path)
            lgr.info('Directory: %s' % path)

    return fs
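
For every directory entry the traverser has to decide whether the entry itself is a subdataset and which deeper subdataset paths should be handed down into the recursion. A stand-alone sketch of that bookkeeping (split_subdatasets is an illustrative name; appending a path separator is roughly what with_pathsep() does here):

from os import sep

def split_subdatasets(node, subdatasets):
    """Return (is_subdataset, nested) for a directory entry `node`."""
    node_sep = node + sep
    is_subdataset = node in subdatasets
    nested = [s[len(node_sep):] for s in subdatasets if s.startswith(node_sep)]
    return is_subdataset, nested

# split_subdatasets('sub1', ['sub1', 'sub1/nested', 'other'])
# -> (True, ['nested'])  (on POSIX, where sep == '/')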