def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path

    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. "
                "Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
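# The in-code comment above invites a binary search. Below is a minimal
# sketch of that idea, NOT DataLad code: `get_max_path_length_bisect` and
# its `fits` helper are hypothetical names, plain `os` calls stand in for
# DataLad's getpwd()/unlink() helpers, and it assumes creatability is
# monotone in filename length, so it probes O(log maxl) names instead of
# up to maxl of them.
def get_max_path_length_bisect(top_path=None, maxl=1000):
    import os
    import random
    if not top_path:
        top_path = os.getcwd()
    prefix = os.path.join(top_path, "dl%d" % random.randint(1, 100000))

    def fits(length):
        # probe whether a filename of exactly `length` characters is creatable
        filename = prefix + '_' * (length - len(prefix))
        try:
            with open(filename, 'w'):
                pass
        except Exception:
            return False
        os.unlink(filename)
        return True

    lo, hi = len(prefix), maxl
    if not fits(lo):
        return None
    # invariant: fits(lo) is known to hold; shrink [lo, hi] down to the
    # largest length that still fits
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if fits(mid):
            lo = mid
        else:
            hi = mid - 1
    return lo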
def test_get_missing(path):
    repo = GitRepo(path, create=True)
    os.makedirs(op.join(path, 'deep'))
    with open(op.join(path, 'test1'), 'w') as f:
        f.write('some')
    with open(op.join(path, 'deep', 'test2'), 'w') as f:
        f.write('some more')
    # no files tracked yet, so nothing changed
    eq_(repo.get_changed_files(), [])
    repo.add('.')
    # still no differences between worktree and staged
    eq_(repo.get_changed_files(), [])
    eq_(set(repo.get_changed_files(staged=True)),
        {'test1', op.join('deep', 'test2')})
    eq_(set(repo.get_changed_files(staged=True, diff_filter='AD')),
        {'test1', op.join('deep', 'test2')})
    eq_(repo.get_changed_files(staged=True, diff_filter='D'), [])
    repo.commit()
    eq_(repo.get_changed_files(), [])
    eq_(repo.get_changed_files(staged=True), [])
    ok_clean_git(path, annex=False)
    unlink(op.join(path, 'test1'))
    eq_(repo.get_missing_files(), ['test1'])
    rmtree(op.join(path, 'deep'))
    eq_(sorted(repo.get_missing_files()), [op.join('deep', 'test2'), 'test1'])
    # nothing is actually known to be deleted
    eq_(repo.get_deleted_files(), [])
    # do proper removal
    repo.remove(op.join(path, 'test1'))
    # no longer missing
    eq_(repo.get_missing_files(), [op.join('deep', 'test2')])
    # but deleted
    eq_(repo.get_deleted_files(), ['test1'])
def test_rotree(d):
    d2 = opj(d, 'd1', 'd2')  # deep nested directory
    f = opj(d2, 'f1')
    os.makedirs(d2)
    with open(f, 'w') as f_:
        f_.write("LOAD")
    with swallow_logs():
        ar = AnnexRepo(d2)
    rotree(d)
    # we shouldn't be able to delete anything UNLESS in "crippled" situation:
    # root, or filesystem is FAT etc
    # Theoretically annex should declare FS as crippled when run as root, but
    # see http://git-annex.branchable.com/bugs/decides_that_FS_is_crippled_
    # under_cowbuilder___40__symlinks_supported_etc__41__/#comment-60c3cbe2710d6865fb9b7d6e247cd7aa
    # so explicit 'or'
    if not (ar.is_crippled_fs() or (os.getuid() == 0)):
        assert_raises(OSError, os.unlink, f)   # OK to use os.unlink
        assert_raises(OSError, unlink, f)      # and even with waiting and trying!
        assert_raises(OSError, shutil.rmtree, d)
        # but file should still be accessible
        with open(f) as f_:
            eq_(f_.read(), "LOAD")
    # make it RW
    rotree(d, False)
    unlink(f)
    shutil.rmtree(d)
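# For context: rotree() (defined in datalad.utils, not shown in this
# snippet) toggles a whole tree read-only, and rotree(d, False) makes it
# writable again. A minimal sketch of that idea, NOT the DataLad
# implementation; `rotree_sketch` and `_chmod_entry` are hypothetical names.
import os
import stat

def _chmod_entry(full, read_only):
    mode = os.stat(full).st_mode
    write_bits = stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
    os.chmod(full, mode & ~write_bits if read_only else mode | stat.S_IWUSR)

def rotree_sketch(path, read_only=True):
    # bottom-up walk; read/execute bits stay on, so traversal keeps working
    for root, dirs, files in os.walk(path, topdown=False):
        for name in dirs + files:
            full = os.path.join(root, name)
            if os.path.islink(full):
                continue  # do not follow symlinks (annexed files may dangle)
            _chmod_entry(full, read_only)
    _chmod_entry(path, read_only)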
def link_file_load(src, dst, dry_run=False):
    """Just a little helper to hardlink a file's load
    """
    dst_dir = op.dirname(dst)
    if not op.exists(dst_dir):
        os.makedirs(dst_dir)
    if op.lexists(dst):
        lgr.log(9, "Destination file %(dst)s exists. Removing it first",
                locals())
        # TODO: how would it interact with git/git-annex
        unlink(dst)
    lgr.log(9, "Hardlinking %(src)s under %(dst)s", locals())
    src_realpath = op.realpath(src)
    try:
        os.link(src_realpath, dst)
    except (OSError, AttributeError) as e:
        # we need to catch OSError too, because Python's own logic
        # of not providing link() where it is known to be unsupported
        # (e.g. Windows) will not cover scenarios where a particular
        # filesystem simply does not implement it on an otherwise
        # sane platform (e.g. exfat on Linux)
        lgr.warning("Linking of %s failed (%s), copying file" % (src, e))
        shutil.copyfile(src_realpath, dst)
        shutil.copystat(src_realpath, dst)
    else:
        lgr.log(2, "Hardlinking finished")
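# A hypothetical usage sketch of the helper above (paths made up; assumes
# link_file_load and its module imports are in scope): hardlink a payload
# into a nested destination, with the copy fallback exercised transparently
# on filesystems that lack hard links.
import os
import tempfile

tmp = tempfile.mkdtemp()
src = os.path.join(tmp, 'src.dat')
with open(src, 'w') as f:
    f.write('payload')
dst = os.path.join(tmp, 'sub', 'dst.dat')
link_file_load(src, dst)  # creates tmp/sub/ and links (or copies) into it
# on filesystems with hardlink support both names now share one inode;
# either way the destination content is present
assert os.path.exists(dst)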
def test_get_git_dir(path):
    # minimal, only missing coverage
    assert_raises(RuntimeError, GitRepo.get_git_dir, path)

    srcpath = opj(path, 'src')
    targetpath = opj(path, 'target')
    targetgitpath = opj(targetpath, '.git')
    os.makedirs(srcpath)
    os.makedirs(targetpath)
    if not on_windows:  # with PY3 would also work with Windows 6+
        os.symlink(srcpath, targetgitpath)
        eq_(srcpath, GitRepo.get_git_dir(targetpath))
        # cleanup for following test
        unlink(targetgitpath)

    with open(targetgitpath, 'w') as f:
        f.write('gitdir: {}'.format(srcpath))
    eq_(srcpath, GitRepo.get_git_dir(targetpath))
def add_urls(rows, ifexists=None, options=None):
    """Call `git annex addurl` using information in `rows`.
    """
    for row in rows:
        filename_abs = row["filename_abs"]
        ds, filename = row["ds"], row["ds_filename"]
        lgr.debug("Adding metadata to %s in %s", filename, ds.path)

        if os.path.exists(filename_abs) or os.path.islink(filename_abs):
            if ifexists == "skip":
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      type="file",
                                      path=filename_abs,
                                      status="notneeded")
                continue
            elif ifexists == "overwrite":
                lgr.debug("Removing %s", filename_abs)
                unlink(filename_abs)
            else:
                lgr.debug("File %s already exists", filename_abs)

        try:
            out_json = ds.repo.add_url_to_file(filename, row["url"],
                                               batch=True, options=options)
        except AnnexBatchCommandError as exc:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  type="file",
                                  path=filename_abs,
                                  message=exc_str(exc),
                                  status="error")
            continue

        # In the case of an error, the json object has file=None.
        if out_json["file"] is None:
            out_json["file"] = filename_abs
        yield annexjson2result(out_json, ds, action="addurls",
                               type="file", logger=lgr)
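# A sketch of the input `add_urls` iterates over: each row is a plain dict
# carrying at least the four keys read in the loop above. The values below
# are made-up illustrations, and `ds` is assumed to be an existing Dataset
# backed by an annex repo.
row = {
    "ds": ds,                                   # target Dataset instance
    "ds_filename": "data/file1.dat",            # path relative to the dataset
    "filename_abs": "/tmp/ds/data/file1.dat",   # same file, absolute path
    "url": "http://example.com/file1.dat",      # URL passed to add_url_to_file
}
# results are yielded lazily, so drain the generator
for res in add_urls([row], ifexists="skip"):
    print(res["status"])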
def __call__(dataset, filename=None, missing_content='error', no_annex=False,
             # TODO: support working with projects and articles within them
             # project_id=None,
             article_id=None):
    import os
    import logging
    lgr = logging.getLogger('datalad.plugin.export_to_figshare')

    from datalad.ui import ui
    from datalad.api import add_archive_content
    from datalad.api import export_archive
    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo

    dataset = require_dataset(dataset, check_installed=True,
                              purpose='export to figshare')

    if not isinstance(dataset.repo, AnnexRepo):
        raise ValueError(
            "%s is not an annex repo, so annexification cannot be done"
            % dataset
        )

    if dataset.repo.is_dirty():
        raise RuntimeError(
            "Paranoid authors of DataLad refuse to proceed in a dirty repository"
        )
    if filename is None:
        filename = dataset.path
    lgr.info(
        "Exporting current tree as an archive under %s since figshare "
        "does not support directories",
        filename
    )
    archive_out = next(
        export_archive(
            dataset,
            filename=filename,
            archivetype='zip',
            missing_content=missing_content,
            return_type="generator"
        )
    )
    assert archive_out['status'] == 'ok'
    fname = archive_out['path']

    lgr.info("Uploading %s to figshare", fname)
    figshare = FigshareRESTLaison()

    if not article_id:
        # TODO: ask if it should be an article within a project
        if ui.is_interactive:
            # or should we just upload to a new article?
            if ui.yesno(
                "Would you like to create a new article to upload to? "
                "If not - we will list existing articles",
                title="Article"
            ):
                article = figshare.create_article(
                    title=os.path.basename(dataset.path)
                )
                lgr.info(
                    "Created a new (private) article %(id)s at %(url_private_html)s. "
                    "Please visit it, enter additional meta-data and make public",
                    article
                )
                article_id = article['id']
            else:
                article_id = int(ui.question(
                    "Which of the articles should we upload to?",
                    choices=list(map(str, figshare.get_article_ids()))
                ))
        if not article_id:
            raise ValueError("We need an article to upload to.")

    file_info = figshare.upload_file(
        fname,
        files_url='account/articles/%s/files' % article_id
    )

    if no_annex:
        lgr.info("Removing generated tarball")
        unlink(fname)
    else:
        # I will leave all the complaining etc to the dataset add if path
        # is outside etc
        lgr.info("'Registering' %s within annex", fname)
        repo = dataset.repo
        repo.add(fname, git=False)
        key = repo.get_file_key(fname)
        lgr.info("Adding URL %(download_url)s for it", file_info)
        repo._annex_custom_command(
            [],
            ["git", "annex", "registerurl",
             '-c', 'annex.alwayscommit=false',
             key, file_info['download_url']]
        )
        lgr.info("Registering links back for the content of the archive")
        add_archive_content(
            fname,
            annex=dataset.repo,
            delete_after=True,  # just remove extracted into a temp dir
            allow_dirty=True,   # since we have a tarball
            commit=False        # we do not want to commit anything we have done here
        )
        lgr.info("Removing generated and now registered in annex archive")
        repo.drop(key, key=True, options=['--force'])
        repo.remove(fname, force=True)  # remove the tarball

        # if annex in {'delete'}:
        #     dataset.repo.remove(fname)
        # else:
        #     # kinda makes little sense I guess.
        #     # Made more sense if export_archive could export an arbitrary treeish
        #     # so we could create a branch where to dump and export to figshare
        #     # (kinda closer to my idea)
        #     dataset.save(fname, message="Added the entire dataset into a zip file")

    # TODO: add to downloader knowledge about figshare token so it could download-url
    # those zipballs before they go public
    yield dict(
        status='ok',
        # TODO: add article url (which needs to be queried if only ID is known)
        message="Published archive {}".format(
            file_info['download_url']),
        file_info=file_info,
        path=dataset,
        action='export_to_figshare',
        logger=lgr
    )
def test_recursive_save(path):
    ds = Dataset(path).create()
    # nothing to save
    assert_status('notneeded', ds.save())
    subds = ds.create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.create('subsub')
    assert_equal(
        ds.subdatasets(recursive=True, fulfilled=True, result_xfm='paths'),
        [subds.path, subsubds.path])
    newfile_name = opj(subsubds.path, 'test')
    with open(newfile_name, 'w') as f:
        f.write('some')
    # saves the status change of the subdataset due to the subsubdataset addition
    assert_result_values_equal(
        ds.save(result_filter=is_ok_dataset),
        'path', [ds.path])

    # make the new file known to its dataset
    ds.add(newfile_name, save=False)

    # but remains dirty because of the uncommitted file down below
    assert ds.repo.dirty
    # auto-add will save nothing deep down without recursive
    assert_status('notneeded', ds.save())
    assert ds.repo.dirty
    # with recursive pick up the change in subsubds
    assert_result_values_equal(
        ds.save(recursive=True, result_filter=is_ok_dataset),
        'path', [subsubds.path, subds.path, ds.path])
    # at this point the entire tree is clean
    ok_clean_git(ds.path)
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    # now we save recursively, nothing should happen
    res = ds.save(recursive=True)
    # we do not get any report from a subdataset, because we detect at the
    # very top that the entire tree is clean
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save', path=ds.path)
    # now we introduce new files all the way down
    create_tree(subsubds.path, {"mike1": 'mike1'})
    # because we cannot say from the top if there is anything to do down below,
    # we have to traverse and we will get reports for all datasets, but there is
    # nothing actually saved
    res = ds.save(recursive=True)
    assert_result_count(res, 3)
    assert_status('notneeded', res)
    subsubds_indexed = subsubds.repo.get_indexed_files()
    assert_not_in('mike1', subsubds_indexed)
    assert_equal(states, [d.repo.get_hexsha() for d in (ds, subds, subsubds)])
    unlink(opj(subsubds.path, 'mike1'))
    ok_clean_git(ds.path)

    # modify content in subsub and try saving
    testfname = newfile_name
    subsubds.unlock(testfname)
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    # the following should all do nothing
    # no auto_add
    assert_status('notneeded', ds.save())
    # no recursive
    assert_status('notneeded', ds.save())
    # an explicit target saves only the corresponding dataset
    assert_result_values_equal(
        save(path=[testfname]),
        'path', [subsubds.path])
    # plain recursive without any files given will save the beast
    assert_result_values_equal(
        ds.save(recursive=True, result_filter=is_ok_dataset),
        'path', [subds.path, ds.path])
    # there is nothing else to save
    assert_status('notneeded', ds.save(recursive=True))
    ok_clean_git(ds.path)
    # one more time and check that all datasets in the hierarchy are not
    # contaminated with untracked files
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_status('notneeded', ds.save(recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_equal(old, new)
    assert ds.repo.dirty
    unlink(opj(ds.path, testfname))
    ok_clean_git(ds.path)

    # now let's check saving "upwards"
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty
    assert not subsubds.repo.dirty
    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())

    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'], head_modified=['testadded'])

    old_states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    subsubds.save(message="savingtestmessage", super_datasets=True)
    # this save actually didn't save anything in subsub (or anywhere),
    # because there were only untracked bits pending
    for old, new in zip(old_states,
                        [d.repo.get_hexsha() for d in (ds, subds, subsubds)]):
        assert_equal(old, new)

    # but now we are saving this untracked bit specifically
    subsubds.save(message="savingtestmessage", path=['testnew2'],
                  super_datasets=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'], head_modified=['testadded'])

    # check commits to have correct messages
    # there are no more dedicated superdataset-save commits anymore, because
    # superdatasets get saved as part of the processed hierarchy and can contain
    # other parts in the commit (if so instructed)
    assert_equal(next(subsubds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')
    assert_equal(next(subds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')
    assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')

    # and if we try to save while being within that subsubds path
    subsubds.unlock('testnew2')
    create_tree(subsubds.path, {"testnew2": 'smth2'})

    # trying to replicate https://github.com/datalad/datalad/issues/1540
    subsubds.save(message="saving new changes", all_updated=True)  # no super
    with chpwd(subds.path):
        # no explicit dataset is provided but a path is provided
        save(path=['subsub'], message='saving sub', super_datasets=True)
    # super should get it saved too
    assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(),
                 'saving sub')
def test_recursive_save(path):
    ds = Dataset(path).create()
    # nothing to save
    assert_status('notneeded', ds._save())
    subds = ds.create('sub')
    # subdataset presence already saved
    ok_clean_git(ds.path)
    subsubds = subds.create('subsub')
    assert_equal(
        ds.subdatasets(recursive=True, fulfilled=True, result_xfm='paths'),
        [subds.path, subsubds.path])
    newfile_name = opj(subsubds.path, 'test')
    with open(newfile_name, 'w') as f:
        f.write('some')
    # saves the status change of the subdataset due to the subsubdataset addition
    assert_result_values_equal(
        ds._save(result_filter=is_ok_dataset),
        'path', [ds.path])

    # make the new file known to its dataset
    ds.add(newfile_name, save=False)

    # but remains dirty because of the uncommitted file down below
    assert ds.repo.dirty
    # auto-add will save nothing deep down without recursive
    assert_status('notneeded', ds._save())
    assert ds.repo.dirty
    # with recursive pick up the change in subsubds
    assert_result_values_equal(
        ds._save(recursive=True, result_filter=is_ok_dataset),
        'path', [subsubds.path, subds.path, ds.path])
    # at this point the entire tree is clean
    ok_clean_git(ds.path)
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    # now we save recursively, nothing should happen
    res = ds._save(recursive=True)
    # we do not get any report from a subdataset, because we detect at the
    # very top that the entire tree is clean
    assert_result_count(res, 1)
    assert_result_count(res, 1, status='notneeded', action='save', path=ds.path)
    # now we introduce new files all the way down
    create_tree(subsubds.path, {"mike1": 'mike1'})
    # because we cannot say from the top if there is anything to do down below,
    # we have to traverse and we will get reports for all datasets, but there is
    # nothing actually saved
    res = ds._save(recursive=True)
    assert_result_count(res, 3)
    assert_status('notneeded', res)
    subsubds_indexed = subsubds.repo.get_indexed_files()
    assert_not_in('mike1', subsubds_indexed)
    assert_equal(states, [d.repo.get_hexsha() for d in (ds, subds, subsubds)])
    unlink(opj(subsubds.path, 'mike1'))
    ok_clean_git(ds.path)

    # modify content in subsub and try saving
    testfname = newfile_name
    subsubds.unlock(testfname)
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    # the following should all do nothing
    # no auto_add
    assert_status('notneeded', ds._save())
    # no recursive
    assert_status('notneeded', ds._save())
    # an explicit target saves only the corresponding dataset
    assert_result_values_equal(
        save(path=[testfname]),
        'path', [subsubds.path])
    # plain recursive without any files given will save the beast
    assert_result_values_equal(
        ds._save(recursive=True, result_filter=is_ok_dataset),
        'path', [subds.path, ds.path])
    # there is nothing else to save
    assert_status('notneeded', ds._save(recursive=True))
    ok_clean_git(ds.path)
    # one more time and check that all datasets in the hierarchy are not
    # contaminated with untracked files
    states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    testfname = opj('sub', 'subsub', 'saveme2')
    with open(opj(ds.path, testfname), 'w') as f:
        f.write('I am in here!')
    assert_status('notneeded', ds._save(recursive=True))
    newstates = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    for old, new in zip(states, newstates):
        assert_equal(old, new)
    assert ds.repo.dirty
    unlink(opj(ds.path, testfname))
    ok_clean_git(ds.path)

    # now let's check saving "upwards"
    create_tree(subds.path, {"testnew": 'smth', "testadded": "added"})
    subds.repo.add("testadded")
    indexed_files = subds.repo.get_indexed_files()
    assert subds.repo.dirty
    assert ds.repo.dirty
    assert not subsubds.repo.dirty
    create_tree(subsubds.path, {"testnew2": 'smth'})
    assert subsubds.repo.dirty
    # and indexed files didn't change
    assert_equal(indexed_files, subds.repo.get_indexed_files())

    ok_clean_git(subds.repo, untracked=['testnew'],
                 index_modified=['subsub'], head_modified=['testadded'])

    old_states = [d.repo.get_hexsha() for d in (ds, subds, subsubds)]
    subsubds._save(message="savingtestmessage", super_datasets=True)
    # this save actually didn't save anything in subsub (or anywhere),
    # because there were only untracked bits pending
    for old, new in zip(old_states,
                        [d.repo.get_hexsha() for d in (ds, subds, subsubds)]):
        assert_equal(old, new)

    # but now we are saving this untracked bit specifically
    subsubds._save(message="savingtestmessage", path=['testnew2'],
                   super_datasets=True)
    ok_clean_git(subsubds.repo)
    # but its super should have got only the subsub saved
    # not the file we created
    ok_clean_git(subds.repo, untracked=['testnew'], head_modified=['testadded'])

    # check commits to have correct messages
    # there are no more dedicated superdataset-save commits anymore, because
    # superdatasets get saved as part of the processed hierarchy and can contain
    # other parts in the commit (if so instructed)
    assert_equal(next(subsubds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')
    assert_equal(next(subds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')
    assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(),
                 'savingtestmessage')

    # and if we try to save while being within that subsubds path
    subsubds.unlock('testnew2')
    create_tree(subsubds.path, {"testnew2": 'smth2'})

    # trying to replicate https://github.com/datalad/datalad/issues/1540
    subsubds._save(message="saving new changes", all_updated=True)  # no super
    with chpwd(subds.path):
        # no explicit dataset is provided but a path is provided
        save(path=['subsub'], message='saving sub', super_datasets=True)
    # super should get it saved too
    assert_equal(next(ds.repo.get_branch_commits('master')).message.rstrip(),
                 'saving sub')