def test_get_content_info(path):
    repo = GitRepo(path)
    assert_equal(repo.get_content_info(), {})
    # an invalid reference causes an exception
    assert_raises(ValueError, repo.get_content_info, ref='HEAD')

    ds = get_convoluted_situation(path)
    repopath = ds.repo.pathobj

    assert_equal(ds.repo.pathobj, repopath)
    assert_equal(ds.pathobj, ut.Path(path))

    # verify general rules on fused info records that are incrementally
    # assembled: for git content info, amended with annex info on 'HEAD'
    # (to get the last committed stage and with it possibly vanished
    # content), and lastly annex info wrt the present worktree, to
    # also get info on added/staged content
    # this fuses the info reported from
    # - git ls-files
    # - git annex findref HEAD
    # - git annex find --include '*'
    for f, r in ds.repo.annexstatus().items():
        if f.match('*_untracked'):
            assert r.get('gitshasum', None) is None
        if f.match('*_deleted'):
            # a deleted file is gone entirely, not even a dangling symlink
            assert not f.exists() and not f.is_symlink()
        if f.match('subds_*'):
            assert r['type'] == (
                'dataset' if r.get('gitshasum', None) else 'directory')
        if f.match('file_*'):
            # which one exactly depends on many things
            assert_in(r['type'], ('file', 'symlink'))
        if f.match('file_ingit*'):
            assert r['type'] == 'file'
        elif '.datalad' not in f.parts and not f.match('.git*') and \
                r.get('gitshasum', None) and not f.match('subds*'):
            # this should be known to annex, one way or another,
            # regardless of whether things are deleted or staged
            # or anything in between
            assert_in('key', r, f)
            assert_in('keyname', r, f)
            assert_in('backend', r, f)
            assert_in('bytesize', r, f)
            # no duplication with path
            assert_not_in('file', r, f)

    # query full untracked report
    res = ds.repo.get_content_info()
    assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)
    # query for compact untracked report
    res = ds.repo.get_content_info(untracked='normal')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_in(repopath.joinpath('dir_untracked'), res)
    # query no untracked report
    res = ds.repo.get_content_info(untracked='no')
    assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
    assert_not_in(repopath.joinpath('dir_untracked'), res)

    # git status integrity
    status = ds.repo.status()
    for t in ('subds', 'file'):
        for s in ('untracked', 'added', 'deleted', 'clean',
                  'ingit_clean', 'dropped_clean', 'modified',
                  'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                if (t == 'subds' and 'ingit' in s) or 'dropped' in s:
                    # invalid combination
                    continue
                if t == 'subds' and s == 'deleted':
                    # same as subds_unavailable -> clean
                    continue
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                assert p.match('*_{}'.format(status[p]['state'])), p
                if t == 'subds':
                    assert_in(status[p]['type'], ('dataset', 'directory'), p)
                else:
                    assert_in(status[p]['type'], ('file', 'symlink'), p)

    # git annex status integrity
    annexstatus = ds.repo.annexstatus()
    for t in ('file',):
        for s in ('untracked', 'added', 'deleted', 'clean',
                  'ingit_clean', 'dropped_clean', 'modified',
                  'ingit_modified'):
            for l in ('', ut.PurePosixPath('subdir', '')):
                p = repopath.joinpath(l, '{}_{}'.format(t, s))
                if s in ('untracked', 'ingit_clean', 'ingit_modified'):
                    # annex knows nothing about these things
                    assert_not_in('key', annexstatus[p])
                    continue
                assert_in('key', annexstatus[p])
                # dear future,
                # if the next one fails, git-annex might have changed the
                # nature of the paths that are being reported by
                # `annex find --json`
                # when this was written `hashir*` was a native path, but
                # `file` was a POSIX path
                assert_equal(annexstatus[p]['has_content'], 'dropped' not in s)

    # check the different subds evaluation modes
    someds = Dataset(ds.pathobj / 'subds_modified' / 'someds')
    dirtyds_path = someds.pathobj / 'dirtyds'
    assert_not_in(
        'state',
        someds.repo.status(eval_submodule_state='no')[dirtyds_path])
    assert_equal(
        'clean',
        someds.repo.status(
            eval_submodule_state='commit')[dirtyds_path]['state'])
    assert_equal(
        'modified',
        someds.repo.status(
            eval_submodule_state='full')[dirtyds_path]['state'])
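# The comment block in the test above describes `annexstatus()` as a
# three-way fusion of `git ls-files`, `git annex findref HEAD`, and
# `git annex find`. The sketch below is NOT DataLad's implementation --
# it is a minimal illustration of that assembly order, with later sources
# amending earlier records. The merge policy and the plain subprocess
# plumbing are assumptions for illustration; `findref` and `find --json`
# are real git-annex commands, but only the 'file' and 'key' JSON fields
# are consumed here, a small subset of what they actually emit.
import json
import subprocess
from pathlib import Path


def fused_status_sketch(repo):
    repo = Path(repo)

    def lines(*cmd):
        return subprocess.run(
            cmd, cwd=str(repo), capture_output=True, text=True, check=True
        ).stdout.splitlines()

    info = {}
    # 1. git content info: all tracked paths
    for rel in lines('git', 'ls-files'):
        info[repo / rel] = {'type': 'file'}
    # 2. amend with annex info on HEAD: the last committed state, which
    #    catches content that has since vanished from the worktree
    for line in lines('git', 'annex', 'findref', 'HEAD', '--json'):
        rec = json.loads(line)
        info.setdefault(repo / rec['file'], {})['key'] = rec.get('key')
    # 3. amend with annex info on the present worktree, which also
    #    covers added/staged content
    for line in lines('git', 'annex', 'find', '--include', '*', '--json'):
        rec = json.loads(line)
        info.setdefault(repo / rec['file'], {})['key'] = rec.get('key')
    return info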
def _yield_metadata_records(aggdspath, agg_record, query_paths, reporton,
                            parentds):
    dsmeta = None
    if reporton in ('datasets', 'all', 'jsonld') \
            and 'dataset_info' in agg_record:
        # we do not need path matching here, we already know
        # that something in this dataset is relevant
        objfile = text_type(agg_record['dataset_info'])
        # TODO if it doesn't exist but is requested say impossible?
        dsmeta = json_load(objfile)
        info = dict(
            path=text_type(aggdspath),
            status='ok',
            type='dataset',
            metadata=dsmeta,
            # some things that should be there, but maybe not
            # -- make optional to be more robust
            dsid=agg_record.get('id', None),
            refcommit=agg_record.get('refcommit', None),
            datalad_version=agg_record.get('datalad_version', None),
        )
        if parentds:
            info['parentds'] = parentds
        yield info
    if reporton in ('files', 'all', 'jsonld') and 'content_info' in agg_record:
        objfile = text_type(agg_record['content_info'])
        # TODO if it doesn't exist but is requested say impossible?
        for file_record in json_streamload(objfile):
            if 'path' not in file_record:  # pragma: no cover
                yield dict(
                    status='error',
                    message=(
                        "content metadata contains record "
                        "without a 'path' specification: %s",
                        agg_record),
                    type='dataset',
                    path=aggdspath,
                )
                continue
            # absolute path for this file record
            # metadata records always use POSIX conventions
            fpath = aggdspath / ut.PurePosixPath(file_record['path'])
            if not any(p == fpath or p in fpath.parents
                       for p in query_paths):
                # ignore any file record that doesn't match any query
                # path (direct hit or git-annex-like recursion within a
                # dataset)
                continue
            if dsmeta is not None and \
                    '@context' in dsmeta and \
                    '@context' not in file_record:
                file_record['@context'] = dsmeta['@context']
            info = dict(
                path=text_type(fpath),
                parentds=text_type(aggdspath),
                status='ok',
                type='file',
                metadata={k: v for k, v in iteritems(file_record)
                          if k not in ('path',)},
                # really old extracts did not have 'id'
                dsid=agg_record.get('id', None),
                refcommit=agg_record['refcommit'],
                datalad_version=agg_record['datalad_version'],
            )
            yield info
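# A hypothetical invocation of the generator above, to make its expected
# inputs concrete. The aggregate-record keys shown ('dataset_info',
# 'content_info', 'refcommit', ...) mirror only what _yield_metadata_records
# itself reads; the object-file locations and the query path are made up
# for illustration.
def _demo_yield_metadata_records():
    aggdspath = ut.Path('/tmp/someds')
    agg_record = {
        'id': 'dead-beef',
        'refcommit': '0123abc',
        'datalad_version': '0.12.0',
        # JSON object files as produced by metadata aggregation
        # (hypothetical locations)
        'dataset_info': aggdspath / '.datalad' / 'objects' / 'ds.json',
        'content_info': aggdspath / '.datalad' / 'objects' / 'files.json',
    }
    for res in _yield_metadata_records(
            aggdspath,
            agg_record,
            # report on anything at or below this path
            query_paths=[aggdspath / 'data'],
            reporton='all',
            parentds=None):
        print(res['type'], res['path'], res['status'])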