def ds_traverse(rootds, parent=None, json=None,
                recurse_datasets=False, recurse_directories=False,
                long_=False):
    """Hierarchical dataset traverser

    Parameters
    ----------
    rootds: Dataset
        Root dataset to be traversed
    parent: Dataset
        Parent dataset of the current rootds
    recurse_datasets: bool
        Recurse into subdatasets of the root dataset
    recurse_directories: bool
        Recurse into subdirectories of the current dataset
        In both of the above cases, if False, they will not be explicitly
        recursed but data would be loaded from their meta-data files

    Returns
    -------
    dict
        (recursive) dictionary of info about the dataset(s) under path
    """
    # extract parent info to pass to traverser
    fsparent = fs_extract(parent.path, parent.repo, basepath=rootds.path) \
        if parent else None

    # (recursively) traverse file tree of current dataset
    fs = fs_traverse(
        rootds.path, rootds.repo,
        subdatasets=list(rootds.subdatasets(result_xfm='relpaths')),
        render=False,
        parent=fsparent,
        # XXX note that here I kinda flipped the notions!
        recurse_datasets=recurse_datasets,
        recurse_directories=recurse_directories,
        json=json
    )
    # BUT if we are recurse_datasets but not recurse_directories
    # we need to handle those subdatasets then somehow since
    # otherwise we might not even get to them?!
    fs['nodes'][0]['size'] = fs['size']  # update self's size in the nodes sublist too!

    # add dataset specific entries to its dict
    rootds_model = GitModel(rootds.repo)
    fs['tags'] = rootds_model.describe
    fs['branch'] = rootds_model.branch
    index_file = opj(rootds.path, '.git', 'index')
    fs['index-mtime'] = time.strftime(
        u"%Y-%m-%d %H:%M:%S",
        time.localtime(getmtime(index_file))) if exists(index_file) else ''

    # render current dataset
    lgr.info('Dataset: %s', rootds.path)
    fs_render(fs, json=json, ds_path=rootds.path)
    return fs
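# A minimal usage sketch for ds_traverse (illustrative only: '/path/to/ds' is a
# hypothetical path, and the json/render modes are whatever fs_render accepts):
#
#     ds = Dataset('/path/to/ds')          # hypothetical, locally installed dataset
#     fs = ds_traverse(ds,
#                      recurse_datasets=True,
#                      recurse_directories=False)
#     print(fs['branch'], fs['tags'], fs['size'])   # dataset-level entries set above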
def _traverse_handle_subds(
        subds_rpath, rootds,
        recurse_datasets, recurse_directories, json):
    """A helper to deal with a subdataset node -- recurse into it or just pick
    up whatever web meta-data may already have been collected for it
    """
    subds_path = opj(rootds.path, subds_rpath)
    subds = Dataset(subds_path)
    subds_json = metadata_locator(path='.', ds_path=subds_path)

    def handle_not_installed():
        # for now just traverse as fs
        lgr.warning("%s is either not installed or lacks meta-data", subds)
        subfs = fs_extract(subds_path, rootds, basepath=rootds.path)
        # but add a custom type to mark it as a not-installed subds
        subfs['type'] = 'uninitialized'
        # we need to kick it out from 'children'
        # TODO: this is inefficient and cruel -- "ignored" should be made
        # smarter to ignore submodules for the repo
        # if fs['nodes']:
        #     fs['nodes'] = [c for c in fs['nodes'] if c['path'] != subds_rpath]
        return subfs

    if not subds.is_installed():
        subfs = handle_not_installed()
    elif recurse_datasets:
        subfs = ds_traverse(subds,
                            json=json,
                            recurse_datasets=recurse_datasets,
                            recurse_directories=recurse_directories,
                            parent=rootds)
        subfs.pop('nodes', None)
        # size_list.append(subfs['size'])
    # else just pick the data from the metadata_file of each subdataset
    else:
        subfs = None
        lgr.info(subds.path)
        if exists(subds_json):
            with open(subds_json) as data_file:
                subfs = js.load(data_file)
                subfs.pop('nodes', None)  # remove children
                subfs['path'] = subds_rpath  # reassign the path
                # size_list.append(subfs['size'])
        else:
            # the same drill as if not installed
            lgr.warning("%s is installed but has no meta-data yet", subds)
            subfs = handle_not_installed()

    # add URL field
    return subfs
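# Illustrative shape of the dict this helper returns for a single subdataset
# node (the exact keys come from fs_extract or the cached metadata json, so
# this is an assumption about typical content, not a spec; 'url' is attached
# later by fs_traverse):
#
#     {'name': 'sub-01',           # hypothetical subdataset name
#      'path': 'sub-01',           # relative path, as reassigned above
#      'type': 'uninitialized',    # only for not-installed subdatasets
#      'size': {...}}              # per-type sizes, aggregated by fs_traverse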
def fs_traverse(path, repo, parent=None, subdatasets=None,
                render=True,
                recurse_datasets=False, recurse_directories=False,
                json=None, basepath=None):
    """Traverse a path through its nodes and return a dictionary of relevant
    attributes attached to each node

    Parameters
    ----------
    path: str
        Path to the directory to be traversed
    repo: AnnexRepo or GitRepo
        Repo object the directory belongs to
    parent: dict
        Extracted info about the parent directory
    recurse_directories: bool
        Recurse into subdirectories (note that subdatasets are not traversed)
    render: bool
        Whether to render from within the function. Set to False if results
        are to be manipulated before the final render

    Returns
    -------
    dict
        (recursive) dictionary of info about the directory at path;
        does not traverse into annex, git or hidden directories
    """
    subdatasets = subdatasets or []
    fs = fs_extract(path, repo, basepath=basepath or path)
    dataset = Dataset(repo.path)
    submodules = {sm.path: sm for sm in repo.get_submodules()}
    # TODO: some submodules might not even have a local empty directory
    # (git doesn't care about those), so our relying on listdir here and
    # for _traverse_handle_subds might not work out.
    # E.g. create-sibling --ui true ... --existing=reconfigure
    # causes removal of those empty ones on the remote end
    if isdir(path):  # if node is a directory
        children = [fs.copy()]  # store its info in its children dict too
        # (Yarik is not sure why, but I guess for .?)
        # ATM it seems some pieces still rely on having this duplication, so left as is
        # TODO: strip away
        for node in listdir(path):
            nodepath = opj(path, node)

            # Might contain subdatasets, so we should analyze and prepare entries
            # to pass down... in theory we could just pass full paths maybe? strip
            node_subdatasets = []
            is_subdataset = False
            if isdir(nodepath):
                node_sep = with_pathsep(node)
                for subds in subdatasets:
                    if subds == node:
                        # it is the subdataset
                        is_subdataset = True
                    else:
                        # use path_is_subdir
                        if subds.startswith(node_sep):
                            node_subdatasets += [subds[len(node_sep):]]

            # TODO: it might be a subdir which is a non-initialized submodule!
            # if not ignored, append child node info to current nodes dictionary
            if is_subdataset:
                # repo.path is real, so we are doomed (for now at least)
                # to resolve nodepath as well to get relpath for it
                node_relpath = relpath(realpath(nodepath), repo.path)
                subds = _traverse_handle_subds(
                    node_relpath,
                    dataset,
                    recurse_datasets=recurse_datasets,
                    recurse_directories=recurse_directories,
                    json=json
                )
                # Enhance it with external url if available
                submod_url = submodules[node_relpath].url
                if submod_url and is_datalad_compat_ri(submod_url):
                    subds['url'] = submod_url
                children.append(subds)
            elif not ignored(nodepath):
                # if recursive, create info dictionary (within) each child node too
                if recurse_directories:
                    subdir = fs_traverse(nodepath,
                                         repo,
                                         subdatasets=node_subdatasets,
                                         parent=None,  # children[0],
                                         recurse_datasets=recurse_datasets,
                                         recurse_directories=recurse_directories,
                                         json=json,
                                         basepath=basepath or path)
                    subdir.pop('nodes', None)
                else:
                    # read child metadata from its metadata file if it exists
                    subdir_json = metadata_locator(path=node, ds_path=basepath or path)
                    if exists(subdir_json):
                        with open(subdir_json) as data_file:
                            subdir = js.load(data_file)
                            subdir.pop('nodes', None)
                    # else extract whatever information you can about the child
                    else:
                        # Yarik: this one is way too lean...
                        subdir = fs_extract(nodepath,
                                            repo,
                                            basepath=basepath or path)

                # append child metadata to list
                children.extend([subdir])

        # sum sizes of all 1st level children
        children_size = {}
        for node in children[1:]:
            for size_type, child_size in node['size'].items():
                children_size[size_type] = \
                    children_size.get(size_type, 0) + machinesize(child_size)

        # update current node sizes to the humanized aggregate children size
        fs['size'] = children[0]['size'] = \
            {size_type: humanize.naturalsize(child_size)
             for size_type, child_size in children_size.items()}

        children[0]['name'] = '.'  # replace current node name with '.' to emulate unix syntax
        if parent:
            parent['name'] = '..'  # replace parent node name with '..' to emulate unix syntax
            children.insert(1, parent)  # insert parent info after current node info in children dict

        fs['nodes'] = children  # add children info to main fs dictionary
        if render:  # render directory node at location(path)
            fs_render(fs, json=json, ds_path=basepath or path)
            lgr.info('Directory: %s', path)

    return fs
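# A minimal usage sketch for fs_traverse (illustrative only: '/path/to/repo' is
# a hypothetical path; AnnexRepo is assumed to be available from
# datalad.support.annexrepo, as used elsewhere in datalad):
#
#     from datalad.support.annexrepo import AnnexRepo
#     repo = AnnexRepo('/path/to/repo', create=False)   # hypothetical existing repo
#     fs = fs_traverse(repo.path, repo,
#                      render=False,                     # post-process before rendering
#                      recurse_directories=True)
#     for node in fs['nodes']:    # first node is the directory itself ('.')
#         print(node['name'], node['size'])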