def update_dataset(zenodo_dataset, conp_dataset, token):
    """Rebuild a local dataset from the given Zenodo record and publish it.

    It is not known which files changed upstream, so every non-hidden
    entry except ``README.md`` is removed and all files listed in
    ``zenodo_dataset["files"]`` are downloaded again.

    Args:
        zenodo_dataset: Mapping describing the Zenodo record; its
            ``"files"`` entry is iterated and each item is handed to
            ``download_file``.
        conp_dataset: Mapping whose ``"directory"`` entry is the dataset
            directory to refresh.
        token: Not used by this function.
    """
    root = conp_dataset["directory"]

    # Hidden entries (names starting with ".") and the README are kept;
    # everything else, DATS.json included, is removed.
    removable = [
        entry
        for entry in os.listdir(root)
        if not (entry.startswith(".") or entry == "README.md")
    ]
    for entry in removable:
        api.remove(os.path.join(root, entry), check=False)

    dataset = api.Dataset(root)
    for bucket in zenodo_dataset["files"]:
        download_file(bucket, dataset, root)

    # Only generate a DATS.json when the download did not provide one.
    dats_path = os.path.join(root, "DATS.json")
    if not os.path.isfile(dats_path):
        create_new_dats(root, dats_path, zenodo_dataset)

    # Add/update the crawler's tracker file.
    tracker_path = os.path.join(root, ".conp-zenodo-crawler.json")
    create_zenodo_tracker(tracker_path, zenodo_dataset)

    # Record everything and push to the "github" sibling.
    dataset.save()
    dataset.publish(to="github")
def check_comparison_dir(self):
    """Check that the comparison directory holds every file the test needs.

    Raises:
        ValueError: If ``self.data.comparison_dir`` does not exist, or if
            any path in ``self.file_list`` (taken relative to
            ``self.data.outdir``) has no counterpart below it.

    When any entry below the comparison directory is a symlink whose
    target does not exist, a download of the comparison directory is
    requested through ``dm.try_data_download``.
    """
    comparison_root = self.data.comparison_dir
    dataset = datalad.Dataset(str(self.data.tests_data_dir))

    if not comparison_root.exists():
        raise ValueError(
            "The following path does not exist but is required to "
            f"perform a test:{comparison_root}.\n You may wish to run the "
            "test with the --create_sample_output flag or generate "
            "output for future test sessions with "
            "--save_sample_output. "
        )

    present = list(comparison_root.glob("**/*"))
    present_rel = [entry.relative_to(comparison_root) for entry in present]
    required_rel = [
        path.relative_to(self.data.outdir) for path in self.file_list
    ]

    missing = [
        str(comparison_root / rel)
        for rel in required_rel
        if rel not in present_rel
    ]
    if missing:
        joined = " ".join(missing)
        raise ValueError(
            "The following files are missing and are required to "
            f"fully complete the test: {joined} "
        )

    # A symlink that does not resolve means its content still has to be
    # fetched.
    if any(entry.is_symlink() and not entry.exists() for entry in present):
        dm.try_data_download(
            [comparison_root], dataset.path, self.data.logger
        )
def run_tests(tests_dir, **args_dict):
    """Assemble the test shell command, run it, and exit with its status.

    Args:
        tests_dir: Tests directory; combined with ``/`` to locate
            ``afni_ci_test_data`` and interpolated into the gcovr output
            path.
        **args_dict: Options; the keys read here are ``ignore_dirty_data``,
            ``use_all_cores``, ``build_dir`` and ``coverage``. The whole
            mapping is also forwarded to the helper functions.

    This function does not return: it ends with ``sys.exit`` using the
    return code of the executed command.
    """
    check_git_config()

    test_data = datalad.Dataset(str(tests_dir / "afni_ci_test_data"))
    if test_data.repo:
        check_test_data_repo(
            test_data, ignore_dirty_data=args_dict.get("ignore_dirty_data")
        )

    cmd_args = get_test_cmd_args(**args_dict)
    cmd_args = configure_parallelism(cmd_args, args_dict.get("use_all_cores"))
    cmd_args = configure_for_coverage(tests_dir, cmd_args, **args_dict)

    if args_dict.get("build_dir"):
        # With a build directory, pytest is driven through ninja and the
        # arguments travel in the ARGS variable.
        cmake_cmd = generate_cmake_command_as_required(tests_dir, args_dict)
        joined_args = " ".join(cmd_args)
        cmd = cmake_cmd + f";ARGS='{joined_args}' ninja pytest"
    else:
        joined_args = " ".join(cmd_args)
        cmd = f"{sys.executable} -m pytest {joined_args}"

    if args_dict.get("coverage"):
        # append gcovr to assemble coverage report for C code
        build_dir = args_dict["build_dir"]
        cmd += (
            f"; gcovr -s --xml -o {tests_dir}/gcovr_output.xml"
            f" -r {build_dir}/src"
        )
        # append command for compiling and uploading codecov report
        #
        # apparently there is a security issue with codecov, we must
        # investigate; however, this currently is NOT being run in the
        # CircleCI tests --- it probably should not be used, either, but we
        # should hold a static version of the script that is reliable
        # (which would require occasional checks for updates)
        cmd += "; bash -c 'bash <(curl -s https://codecov.io/bash)'"

    print(f"Executing: {cmd}")
    completed = subprocess.run(cmd, shell=True, env=os.environ.copy())
    sys.exit(completed.returncode)
def run_tests(tests_dir, **args_dict):
    """Build the shell command for the test run, execute it, then exit.

    Args:
        tests_dir: Tests directory; ``tests_dir / "afni_ci_test_data"`` is
            opened as a datalad dataset and the gcovr report is written
            below this directory.
        **args_dict: Options. ``ignore_dirty_data``, ``use_all_cores``,
            ``build_dir`` and ``coverage`` are read here; the mapping is
            also passed on to the helper functions.

    The process exits via ``sys.exit`` with the command's return code.
    """
    check_git_config()

    data_repo = datalad.Dataset(str(tests_dir / "afni_ci_test_data"))
    if data_repo.repo:
        check_test_data_repo(
            data_repo, ignore_dirty_data=args_dict.get("ignore_dirty_data")
        )

    pytest_args = get_test_cmd_args(**args_dict)
    pytest_args = configure_parallelism(
        pytest_args, args_dict.get("use_all_cores")
    )
    pytest_args = configure_for_coverage(tests_dir, pytest_args, **args_dict)

    if not args_dict.get("build_dir"):
        # No build directory: call pytest directly with this interpreter.
        arg_string = " ".join(pytest_args)
        command = f"{sys.executable} -m pytest {arg_string}"
    else:
        command = generate_cmake_command_as_required(tests_dir, args_dict)
        arg_string = " ".join(pytest_args)
        command += f";ARGS='{arg_string}' ninja pytest"

    if args_dict.get("coverage"):
        # append gcovr to assemble coverage report for C code
        source_root = f"{args_dict['build_dir']}/src"
        command += (
            f"; gcovr -s --xml -o {tests_dir}/gcovr_output.xml"
            f" -r {source_root}"
        )
        # append command for compiling and uploading codecov report
        command += "; bash -c 'bash <(curl -s https://codecov.io/bash)'"

    print(f"Executing: {command}")
    outcome = subprocess.run(command, shell=True, env=os.environ.copy())
    sys.exit(outcome.returncode)
def unlock():
    """Inject the project's Zenodo token into the dataset's restricted URLs.

    The token is read from the ``<PROJECT>_ZENODO_TOKEN`` environment
    variable, where the project name is derived from the name of the
    repository's working directory. The list of restricted files comes from
    ``.conp-zenodo-crawler.json`` in the current directory.

    Raises:
        Exception: If the token is not set, the active branch is not
            ``master``, the tracker file is missing, the dataset is not
            marked as restricted, or there is nothing to unlock.
    """
    repo = Repo()
    project = project_name2env(repo.working_dir.split("/")[-1])
    token = os.getenv(project + "_ZENODO_TOKEN", None)
    if not token:
        raise Exception(
            f"{project}_ZENODO_TOKEN not found. Cannot inject the Zenodo token into the git-annex urls."
        )

    annex = repo.git.annex

    if repo.active_branch.name != "master":
        raise Exception("Dataset repository not set to branch 'master'")

    tracker_name = ".conp-zenodo-crawler.json"
    if not os.path.isfile(tracker_name):
        raise Exception("'.conp-zenodo-crawler.json file not found")
    with open(tracker_name, "r") as tracker:
        metadata = json.load(tracker)

    # Ensure correct data
    if not metadata["restricted"]:
        raise Exception("Dataset not restricted, no need to unlock")
    private = metadata["private_files"]
    if len(private["archive_links"]) == 0 and len(private["files"]) == 0:
        raise Exception("No restricted files to unlock")

    # Set token in archive link URLs. These are rewritten in the files of
    # the "git-annex" branch, so that branch is checked out for the edit
    # and "master" is restored afterwards.
    if len(private["archive_links"]) > 0:
        repo.git.checkout("git-annex")
        modified = False
        for link in private["archive_links"]:
            for folder, _subdirs, names in os.walk("."):
                for name in names:
                    candidate = os.path.join(folder, name)
                    if ".git" in candidate:
                        continue
                    with open(candidate, "r") as reader:
                        content = reader.read()
                    # Files that already mention "access_token" anywhere
                    # are left untouched.
                    if link not in content or "access_token" in content:
                        continue
                    modified = True
                    content = content.replace(
                        link, link + "?access_token=" + token
                    )
                    with open(candidate, "w") as writer:
                        writer.write(content)
        if modified:
            repo.git.add(".")
            repo.git.commit("-m", "Unlock dataset")
        repo.git.checkout("master")

    # Set token in non-archive link URLs
    if len(private["files"]) > 0:
        dataset = api.Dataset(".")
        for entry in private["files"]:
            annex("rmurl", entry["name"], entry["link"])
            annex(
                "addurl",
                entry["link"] + "?access_token=" + token,
                "--file",
                entry["name"],
                "--relaxed",
            )
        dataset.save()

    print("Done")
def create_new_dataset(dataset, token):
    """Create a dataset under ``projects/`` and publish it to GitHub.

    Args:
        dataset: Mapping with a ``"title"`` (used for the directory and the
            repository name) and ``"files"`` (URLs handed to
            ``download_url``).
        token: Value passed as both ``github_login`` and ``github_passwd``
            when creating the GitHub sibling.
    """
    title = dataset["title"]
    new_dataset = api.Dataset(os.path.join("projects", title))
    new_dataset.create()

    # The repository name is cut to at most 100 characters.
    repo_name = ("conp-dataset-" + title)[0:100]
    new_dataset.create_sibling_github(
        repo_name, github_login=token, github_passwd=token
    )

    for file_url in dataset["files"]:
        new_dataset.download_url(file_url, archive=True)

    new_dataset.publish(to="github")
def base_dataset(tmpdir_factory):
    """Create a small datalad dataset in a temporary directory.

    The dataset receives the files ``foo`` and ``bar`` plus ``d/in``, is
    saved with ``ds.add(".")`` and the resulting state is tagged ``root``.

    Args:
        tmpdir_factory: Object whose ``mktemp`` provides the directory.

    Returns:
        The created dataset object.
    """
    skipif.no_datalad()

    # Imported only after the skip check above.
    import datalad.api as dl

    location = str(tmpdir_factory.mktemp("dataset"))
    dataset = dl.Dataset(location).create(force=True)

    layout = {"foo": "foo", "bar": "bar", "d": {"in": "content\n"}}
    create_tree(dataset.path, layout)

    dataset.add(".")
    dataset.repo.tag("root")
    return dataset
def container_dataset(tmpdir_factory):
    """Create a datalad dataset with the ``dc`` container registered.

    The calling test is skipped when datalad, network access or the
    ``datalad_container`` extension is unavailable.

    Args:
        tmpdir_factory: Object whose ``mktemp`` provides the directory.

    Returns:
        The created dataset object.
    """
    skipif.no_datalad()
    skipif.no_network()
    have_extension = "datalad_container" in external_versions
    if not have_extension:
        pytest.skip("datalad-container not installed")

    # Imported only after the skip checks above.
    import datalad.api as dl

    location = str(tmpdir_factory.mktemp("container_dataset"))
    dataset = dl.Dataset(location).create(force=True)
    dataset.containers_add(
        "dc", url="shub://datalad/datalad-container:testhelper"
    )
    return dataset
def get_tests_data_dir(config_obj):
    """Get the path to the test data directory.

    If the test data directory does not exist or is not populated, install
    with datalad.

    Args:
        config_obj: Passed to ``get_test_data_path`` to locate the
            directory.

    Returns:
        The test data directory as returned by ``get_test_data_path``.

    Raises:
        ValueError: If the dataset is installed without the
            ``remote.afni_ci_test_data.url`` configuration entry, or if
            ``.git/logs/HEAD`` is missing or not writable by the user.
    """
    logger = logging.getLogger("Test data setup")
    tests_data_dir = get_test_data_path(config_obj)

    # remote should be configured or something is badly amiss...
    dl_dset = datalad.Dataset(str(tests_data_dir))
    if (
        dl_dset.is_installed()
        and "remote.afni_ci_test_data.url" not in dl_dset.config.keys()
    ):
        # Open up permissions before bailing out.
        # NOTE(review): presumably so the user can delete the directory by
        # hand afterwards - confirm.
        for f in dl_dset.pathobj.glob("**/*"):
            try:
                f.chmod(0o700)
            except FileNotFoundError:
                # missing symlink, nothing to worry about
                pass
        logger.warning(
            "Not sure about test data, perhaps you should try removing..."
        )
        raise ValueError(
            "Not sure about test data, perhaps you should try removing..."
        )

    # datalad is required and the datalad repository is used for data.
    if not (tests_data_dir / ".datalad").exists():
        # Acquire before entering the try block so that the release in
        # ``finally`` only runs for a lock that was actually taken.
        dl_lock.acquire()
        try:
            # Re-check under the lock: another holder may have installed
            # the data while this call was waiting.
            if not (tests_data_dir / ".datalad").exists():
                logger.warning("Installing test data")
                datalad.install(
                    str(tests_data_dir),
                    "https://gin.g-node.org/leej3/afni_ci_test_data",
                    recursive=True,
                    on_failure="stop",
                )
        finally:
            dl_lock.release()

    # Needs to be user writeable:
    some_files = [".git/logs/HEAD"]
    for f in some_files:
        data_file = tests_data_dir / f
        if not data_file.exists():
            # ``f`` is a plain string, so the parent has to be taken from
            # the joined path object.
            raise ValueError(
                f"{f} does not exist "
                f"(parent existences: {data_file.parent.exists()})"
            )
        if not os.access(data_file, os.W_OK):
            raise ValueError(f"{f} is not user writeable ({os.getuid()})")
    return tests_data_dir
def download_datalad_repo(self, git_ref="master", ignore_dirty_data=False):
    """Make sure the datalad repository configured for this object exists.

    The repository location and URL are read from ``self.params["data"]``
    and handed to ``get_tests_data_dir``.

    Args:
        git_ref: Not used in this body.
        ignore_dirty_data: Not used in this body.

    Raises:
        ValueError: If ``self.params["data"]`` has no truthy ``"url"``.
    """
    data_params = self.params["data"]
    url = data_params.get("url")
    if not url:
        raise ValueError(
            "A value for url must be provided if the data "
            "type is datalad_repo "
        )

    # Get directory name for repository
    dataset = datalad.Dataset(str(data_params["location"]))
    get_tests_data_dir(dataset, dset_url=data_params["url"])
def create_new_dataset(dataset, token, force, username):
    """Create, fill and publish a new dataset under ``projects/``.

    Args:
        dataset: Mapping with ``"title"`` and ``"files"``; it is also
            passed to the DATS, README, tracker and description helpers.
        token: Value used as GitHub login and password, and stripped from
            the sibling URL before it is written to ``.gitmodules``.
        force: Forwarded to ``verify_repository``.
        username: GitHub account that owns the repository.

    Returns:
        The path of the new dataset, or ``""`` when ``verify_repository``
        returns a falsy value.
    """
    title = dataset["title"]
    repo_title = ("conp-dataset-" + title)[0:100]
    full_repository = f"{username}/{repo_title}"

    # Check for existing github repo with same name
    if not verify_repository(username, full_repository, token, dataset, force):
        return ""

    dataset_dir = os.path.join("projects", title)
    new_dataset = api.Dataset(dataset_dir)
    new_dataset.create()
    for plain_file in ("DATS.json", "README.md", ".conp-zenodo-crawler.json"):
        new_dataset.no_annex(plain_file)
    new_dataset.config.add("datalad.log.timestamp", "true")
    new_dataset.save()

    sibling = new_dataset.create_sibling_github(
        repo_title, name="github", github_login=token, github_passwd=token
    )

    for bucket in dataset["files"]:
        download_file(bucket, new_dataset, dataset_dir)

    # Create DATS.json if it doesn't exist
    dats_path = os.path.join(dataset_dir, "DATS.json")
    if not os.path.isfile(dats_path):
        create_new_dats(dataset_dir, dats_path, dataset)

    # Create README.md if doesn't exist
    if not os.path.isfile(os.path.join(dataset_dir, "README.md")):
        create_readme(dataset, dataset_dir)

    # Add .conp-zenodo-crawler.json tracker file
    tracker_path = os.path.join(dataset_dir, ".conp-zenodo-crawler.json")
    create_zenodo_tracker(tracker_path, dataset)

    # Save all changes and push to github
    new_dataset.save()
    new_dataset.publish(to="github")

    # Add description to Github repo
    add_description(token, repo_title, username, dataset)

    # The sibling URL has the "<token>@" part removed before it is recorded.
    clean_url = sibling[0][1].replace(token + "@", "")
    update_gitmodules(dataset_dir, clean_url)

    return new_dataset.path
def try_data_download(file_fetch_list, test_data_dir, logger):
    """Make one attempt to fetch files with ``datalad get``.

    The fetch runs in a separate process while ``dl_lock`` is held, and
    the process is terminated if it has not finished after 60 seconds.

    Args:
        file_fetch_list: Iterable of paths to fetch; each is converted
            with ``str``.
        test_data_dir: Root of the datalad dataset that holds the files.
        logger: Object whose ``warning`` method reports download errors.

    Returns:
        ``True`` if the fetch process exited with code 0, ``False`` if it
        timed out, exited with another code, or one of the handled
        exceptions was raised.
    """
    try:
        # NOTE(review): ``poll_intervall`` is the keyword spelling this
        # call has always used; confirm it matches what the lock object
        # behind ``dl_lock`` accepts.
        dl_lock.acquire(poll_intervall=1)
        dl_dset = datalad.Dataset(str(test_data_dir))

        # Fetching the data
        process_for_fetching_data = Process(
            target=dl_dset.get,
            kwargs={"path": [str(p) for p in file_fetch_list]},
        )

        # attempts should be timed-out to deal with unpredictable stalls.
        process_for_fetching_data.start()
        process_for_fetching_data.join(timeout=60)
        if process_for_fetching_data.is_alive():
            # Still running after the timeout: stop it and report failure.
            process_for_fetching_data.terminate()
            return False
        return process_for_fetching_data.exitcode == 0
    except (
        IncompleteResultsError,
        ValueError,
        CommandError,
        TimeoutError,
        Timeout,
    ) as err:
        logger.warning(
            f"Datalad download failure ({type(err)}) for {test_data_dir}. Will try again"
        )
        return False
    finally:
        try:
            # make sure datalad repo wasn't updated to git annex version 8.
            # Not sure why this is happening
            git_config_file = Path(test_data_dir) / ".git" / "config"
            current_config = git_config_file.read_text()
            pinned_config = current_config.replace(
                "version = 8", "version = 7"
            )
            # Only touch the file when there is something to change.
            if pinned_config != current_config:
                git_config_file.write_text(pinned_config)
        finally:
            # Release even when the config file could not be read or
            # written; otherwise the lock would stay held.
            dl_lock.release()
        # Random pause of 1-10 seconds after every attempt.
        sleep(random.randint(1, 10))
def process_path_obj(path_obj, test_data_dir):
    """
    Convert paths to the pathlib Path type and get the data for
    test_data_dir, a datalad repository.

    Args:
        path_obj (str/pathlib.Path or iterable): Paths as
            strings/pathlib.Path or non-str iterables with elements of
            these types can be passed as arguments for conversion to Path
            objects

        test_data_dir (pathlib.Path): An existing datalad repository
            containing the test data.

    Returns:
        Path or iterable of Paths: path_obj appropriately converted to
        pathlib Paths objects with files in test_data_dir data fetched as
        required.

    Raises:
        TypeError: If path_obj is neither a str/Path nor an iterable of
            values that can be converted to Path.
    """
    dl_dset = datalad.Dataset(str(test_data_dir))

    if isinstance(path_obj, str):
        path_obj = Path(path_obj)

    if isinstance(path_obj, Path):
        check_file_exists(path_obj, test_data_dir)
        file_fetch_list = generate_fetch_list(path_obj, test_data_dir)
        dl_dset.get(path=file_fetch_list)
        return test_data_dir / path_obj

    # Materialize the iterable once: a generator would otherwise be
    # exhausted by the fetch loop and produce an empty result below.
    try:
        input_files = [Path(p) for p in path_obj]
    except TypeError as err:
        raise TypeError(
            "data_paths must contain values that are of type str or a "
            "non-str iterable type. i.e. list, tuple... "
        ) from err

    file_fetch_list = []
    for input_file in input_files:
        check_file_exists(input_file, test_data_dir)
        file_fetch_list.extend(generate_fetch_list(input_file, test_data_dir))
    dl_dset.get(path=file_fetch_list)

    return [test_data_dir / p for p in input_files]
def try_data_download(file_fetch_list, test_data_dir, logger):
    """Make one attempt to fetch files with ``datalad get``.

    The fetch runs in a separate process while ``dl_lock`` is held, and
    the process is terminated if it has not finished after 60 seconds.

    Args:
        file_fetch_list: Iterable of paths to fetch; each is converted
            with ``str``.
        test_data_dir: Root of the datalad dataset that holds the files.
        logger: Object whose ``warning`` method reports download errors.

    Returns:
        ``True`` if the fetch process exited with code 0, ``False`` if it
        timed out, exited with another code, or one of the handled
        exceptions was raised.
    """
    try:
        # NOTE(review): ``poll_intervall`` is the keyword spelling this
        # call has always used; confirm it matches what the lock object
        # behind ``dl_lock`` accepts.
        dl_lock.acquire(poll_intervall=1)
        dl_dset = datalad.Dataset(str(test_data_dir))

        # Fetching the data
        process_for_fetching_data = Process(
            target=dl_dset.get,
            kwargs={"path": [str(p) for p in file_fetch_list]},
        )

        # attempts should be timed-out to deal with unpredictable stalls.
        process_for_fetching_data.start()
        process_for_fetching_data.join(timeout=60)
        if process_for_fetching_data.is_alive():
            # Still running after the timeout: stop it and report failure.
            process_for_fetching_data.terminate()
            return False
        return process_for_fetching_data.exitcode == 0
    except (
        IncompleteResultsError,
        ValueError,
        CommandError,
        TimeoutError,
        Timeout,
    ) as err:
        # ``warning`` replaces the deprecated ``warn`` alias.
        logger.warning(
            f"Datalad download failure ({type(err)}) for {test_data_dir}. Will try again"
        )
        return False
    finally:
        dl_lock.release()
        # Random pause of 1-10 seconds after every attempt.
        sleep(random.randint(1, 10))
def try_data_download(file_fetch_list, test_data_dir):
    """Fetch files with ``datalad get``, retrying once on failure.

    Each attempt runs in a separate process that is terminated if it has
    not finished after 30 seconds. The module-level ``lock`` is held for
    the duration of the attempts and is always released before this
    function returns, raises, or ends the test session.

    Args:
        file_fetch_list: Iterable of paths to fetch; each is converted
            with ``str``.
        test_data_dir: Root of the datalad dataset that holds the files.

    Raises:
        ValueError: If a fetch process exits with a non-zero code.
            NOTE(review): this is not retried, unlike a timeout - confirm
            that is intended.

    If every attempt times out (or raises ``CommandError``), the test
    session is ended with ``pytest.exit``.
    """
    max_attempts = 2
    dl_dset = datalad.Dataset(str(test_data_dir))

    lock.acquire()
    try:
        for _attempt in range(max_attempts):
            try:
                # Fetching the data
                process_for_fetching_data = Process(
                    target=dl_dset.get,
                    kwargs={"path": [str(p) for p in file_fetch_list]},
                )

                # attempts should be timed-out to deal with unpredictable
                # stalls.
                process_for_fetching_data.start()
                process_for_fetching_data.join(timeout=30)
                if process_for_fetching_data.is_alive():
                    # terminate the process.
                    process_for_fetching_data.terminate()
                    raise IncompleteResultsError(
                        f"Data fetching timed out for {file_fetch_list}"
                    )
                if process_for_fetching_data.exitcode != 0:
                    raise ValueError(
                        f"Data fetching failed for {file_fetch_list}"
                    )
                return
            except (IncompleteResultsError, CommandError):
                # make sure datalad repo wasn't updated to git annex
                # version 8. Not sure why this is happening
                git_config_file = Path(test_data_dir) / ".git" / "config"
                git_config_file.write_text(
                    git_config_file.read_text().replace(
                        "version = 8", "version = 7"
                    )
                )
    finally:
        # Without this the lock stayed held whenever the download failed.
        lock.release()

    # datalad download attempts failed
    pytest.exit(
        f"Datalad download failed {max_attempts} times, "
        "you may not be connected to the internet"
    )
def test_orc_datalad_run_container(tmpdir, container_dataset, shell, orc_class):
    """Run a containerized command through an orchestrator and check output.

    The job writes the listing of ``/`` inside the ``subds/dc`` container
    to ``out``; the test asserts that ``out`` has content in the dataset
    and that it mentions ``singularity``.
    """
    import datalad.api as dl

    # Avoid the dataset fixture because the subdataset will make its
    # simplistic cleanup fail.
    ds = dl.Dataset(op.join(str(tmpdir), "ds")).create()
    ds.install(path="subds", source=container_dataset)
    if orc_class == orcs.DataladLocalRunOrchestrator:
        # We need to have the image locally in order to copy it to the
        # non-dataset remote.
        ds.get(op.join("subds", ".datalad", "environments"))

    with chpwd(ds.path):
        orc = orc_class(
            shell,
            submission_type="local",
            job_spec={
                "root_directory": op.join(str(tmpdir), "nm-run"),
                "outputs": ["out"],
                "container": "subds/dc",
                "command_str": 'sh -c "ls / >out"',
            },
        )
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()

        assert ds.repo.file_has_content("out")
        # Read through a context manager so the handle is closed again.
        with open("out") as out_file:
            listing = out_file.read()
        assert "singularity" in listing
file_paths = [] for data_file_response in specific_data: assert data_file_response[ 'status'] == 'ok', "Requires an 'ok' status, received %s" % ( data_file_response['status']) if data_file_response['type'] == 'file': file_paths.append(data_file_response['path']) return file_paths if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("dataset_dir", help="Directory to add datalad repository into") parser.add_argument("dataset_name", help="Name of datalad repository") parser.add_argument("--get_data", nargs='?', default=None, help="Flag indicating whether to get the data") args = parser.parse_args() print(args) ds_name = os.path.basename(args.dataset_name) ds_path = os.path.join(args.dataset_dir, ds_name) if not os.path.exists(ds_path): ds, _ignore = install_dataset(args.dataset_name, args.dataset_dir) else: ds = api.Dataset(ds_path) subjects, datatypes = get_type_neuro_data(ds_path) if args.get_data != None: file_paths = get_dataset_data(ds, args.get_data)
import sys

import datalad.api as dl

# The dataset to set up is given as the first command line argument.
ds = dl.Dataset(sys.argv[1])
repo = ds.repo

# The actual repository should be separately accessible (via webserver)
# and is therefore placed into a dedicated subdataset, which we can
# publish individually. We can also further subdivide it for large repos.
# Do not register it immediately in the superdataset, but consolidate in
# a single commit at the end.
dl.create(ds.pathobj / 'repo')

# Destination for the reprepro config.
conf_dir = ds.pathobj / 'conf'
conf_dir.mkdir()

# We want the config to be in git.
repo.call_annex(['config', '--set', 'annex.largefiles', 'exclude=conf/*'])

# Establish basic config for repository and reprepro behavior.
options_file = conf_dir / 'options'
options_file.write_text("""\
# we want repropro to ask for a key passphrase and not just die
ask-passphrase
# tell reprepro where the repository root is (root of the subdataset)
outdir +b/repo
""")

# The DB files written and read by reprepro need special handling: we
# need to keep them unlocked (for reprepro to function normally without
# datalad), but we also do not want them in git, and we also cannot fully
# ignore them. Make sure that anything in db/ is tracked but always
# unlocked.
repo.call_annex(['config', '--set', 'annex.addunlocked', 'include=db/*'])
## Datalad has a Python API! One particularly nice aspect of datalad is that it has a Python API, which means that anything you can do with datalad on the command line can also be done in Python. See the details of the datalad [Python API](http://docs.datalad.org/en/latest/modref.html). For example, suppose you would like to clone a data repository, such as the Localizer dataset. You can run `dl.clone(source=url, path=location)`. Make sure you set `localizer_path` to the location where you would like the Localizer repository installed. import os import glob import datalad.api as dl import pandas as pd localizer_path = '/Users/lukechang/Dropbox/Dartbrains/data/Localizer' dl.clone(source='https://gin.g-node.org/ljchang/Localizer', path=localizer_path) We can now create a dataset instance using `dl.Dataset(path_to_data)`. ds = dl.Dataset(localizer_path) How much of the dataset have we downloaded? We can check the status of the annex using `ds.status(annex='all')`. results = ds.status(annex='all') Looks like it's empty, which makes sense since we only cloned the dataset. Now we need to get some data. Let's start with something small to play with first. Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep. file_list = glob.glob(os.path.join(localizer_path, '*', 'fmriprep', '*', 'func', '*tsv')) file_list.sort()
def _strip_file_scheme(location):
    """Return ``location`` as a string without a leading ``file:`` prefix.

    ``str.lstrip("file:")`` removes any leading run of the characters
    ``f``, ``i``, ``l``, ``e`` and ``:`` rather than the literal prefix, so
    it also eats the start of names such as ``file:life.nii``. Only the
    exact prefix is removed here.
    """
    location = str(location)
    if location.startswith("file:"):
        return location[len("file:"):]
    return location


def _copy_local_with_fallbacks(location, label, dest_dir, args, dest_name=None):
    """Try to obtain a file locally after its URL download failed.

    Order of attempts: a plain copy from the local path; if that fails and
    downloads are allowed, ``datalad get`` on the containing directory; if
    that fails as well, ``GetImageFromAWS``.

    Args:
        location: The prov:Location value that could not be downloaded.
        label: Name used for the file in the first error message.
        dest_dir: Directory that receives the file.
        args: Parsed options; ``no_downloads`` is read here and the object
            is passed on to ``GetImageFromAWS``.
        dest_name: File name inside ``dest_dir``. Defaults to the base
            name of the local path.
    """
    print(
        "ERROR! Can't download file: %s from url: %s, trying to copy locally...."
        % (label, location))
    if "file" not in location:
        return

    location = _strip_file_scheme(location)
    if dest_name is None:
        dest_name = basename(location)
    destination = join(dest_dir, dest_name)

    print("Trying to copy file from %s" % (location))
    try:
        copyfile(location, destination)
    except Exception:
        print("ERROR! Failed to find file %s on filesystem..." % location)
        if args.no_downloads:
            return
        try:
            print("Running datalad get command on dataset: %s" % location)
            dl.Dataset(os.path.dirname(location)).get(recursive=True, jobs=1)
        except Exception:
            print("ERROR! Datalad returned error: %s for dataset %s."
                  % (sys.exc_info()[0], location))
            GetImageFromAWS(location=location, output_file=destination,
                            args=args)


def ProcessFiles(graph, scan_type, output_directory, project_location, args):
    '''
    This function will essentially cycle through the acquisition objects in
    the NIDM file loaded into graph and depending on the scan_type will try
    and copy the image to the output_directory

    :param graph: graph of the NIDM file; queried with ``subjects``,
        ``objects`` and ``query``
    :param scan_type: URI of the scan type; selects the BIDS sub-directory
        (``dwi``, ``anat`` or ``func``)
    :param output_directory: root of the BIDS output tree
    :param project_location: iterable of rows whose first element is a
        project location that is prepended to each file name
    :param args: parsed options; ``rdf_file`` and ``no_downloads`` are read
        here and the object is passed on to ``GetImageFromAWS``
    '''
    if scan_type == Constants.NIDM_MRI_DIFFUSION_TENSOR.uri:
        bids_ext = 'dwi'
    elif scan_type == Constants.NIDM_MRI_ANATOMIC_SCAN.uri:
        bids_ext = 'anat'
    elif scan_type == Constants.NIDM_MRI_FUNCTION_SCAN.uri:
        bids_ext = 'func'
    # NOTE(review): any other scan_type leaves bids_ext unbound and fails
    # on first use below - confirm callers only pass these three.

    # query NIDM document for acquisition entity "subjects" with predicate
    # nidm:hasImageUsageType and object scan_type
    for acq in graph.subjects(
            predicate=URIRef(Constants.NIDM_IMAGE_USAGE_TYPE.uri),
            object=URIRef(scan_type)):
        # first see if file exists locally. Get nidm:Project prov:Location
        # and append the nfo:Filename of the image from the acq acquisition
        # entity. If that file doesn't exist try the prov:Location in the
        # acq entity and see if we can download it from the cloud

        # get acquisition uuid from entity uuid (the last one wins)
        for item in graph.objects(subject=acq,
                                  predicate=Constants.PROV['wasGeneratedBy']):
            activity = item

        # get participant ID with sio:Subject role in the qualified
        # association of the acquisition
        part_id = GetParticipantIDFromAcquisition(
            nidm_file_list=[args.rdf_file], acquisition=activity)
        subject_id = (part_id['ID'].values)[0]

        # make BIDS sub directory
        if 'sub' in subject_id:
            sub_dir = join(output_directory, subject_id)
        else:
            sub_dir = join(output_directory, "sub-" + subject_id)
        sub_filename_base = "sub-" + subject_id

        # make BIDS scan type directory (bids_ext). sub_dir already starts
        # with output_directory, so it is not prefixed a second time; doing
        # so doubled the leading directory for relative output paths.
        scan_dir = join(sub_dir, bids_ext)
        os.makedirs(scan_dir, exist_ok=True)

        for filename in graph.objects(
                subject=acq, predicate=URIRef(Constants.NIDM_FILENAME.uri)):
            # check if file exists
            for location in project_location:
                local_path = _strip_file_scheme(location[0] + filename)
                # if MRI exists in this location then copy and rename file
                # to be BIDS compliant
                if isfile(local_path):
                    copyfile(local_path,
                             join(scan_dir,
                                  sub_filename_base + splitext(filename)[1]))

            image_path = join(scan_dir, basename(filename))

            # if the file wasn't accessible locally, try with the
            # prov:Location in the acq
            for location in graph.objects(
                    subject=acq,
                    predicate=URIRef(Constants.PROV['Location'])):
                # try to download the file and rename
                ret = GetImageFromURL(location)
                if ret == -1:
                    _copy_local_with_fallbacks(
                        location, filename, scan_dir, args,
                        dest_name=basename(filename))
                else:
                    # copy temporary file to BIDS directory
                    copyfile(ret, image_path)

                # if we were able to copy the image file then add the json
                # sidecar file with additional metadata available in the
                # NIDM file
                if isfile(image_path):
                    # strip a double extension when the name contains "gz"
                    if "gz" in basename(filename):
                        image_filename = splitext(
                            splitext(basename(filename))[0])[0]
                    else:
                        image_filename = splitext(basename(filename))[0]
                    AddMetadataToImageSidecar(graph_entity=acq, graph=graph,
                                              output_directory=scan_dir,
                                              image_filename=image_filename)

        # if this is a DWI scan then we should copy over the b-value and
        # b-vector files
        if bids_ext == 'dwi':
            for gradient_type in ('b-value', 'b-vector'):
                # search for entity uuid with rdf:type nidm:<gradient_type>
                # that was generated by activity
                query = """
                    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    PREFIX prov: <http://www.w3.org/ns/prov#>
                    PREFIX nidm: <http://purl.org/nidash/nidm#>

                    SELECT DISTINCT ?entity
                    WHERE {
                        ?entity rdf:type <http://purl.org/nidash/nidm#%s> ;
                            prov:wasGeneratedBy <%s> .
                    }""" % (gradient_type, activity)
                for row in graph.query(query):
                    gradient_entity = str(row[0])
                    for location in graph.objects(
                            subject=URIRef(gradient_entity),
                            predicate=URIRef(Constants.PROV['Location'])):
                        # try to download the file
                        ret = GetImageFromURL(location)
                        # NOTE(review): a successful download is not
                        # copied anywhere here - confirm whether that is
                        # intended.
                        if ret == -1:
                            _copy_local_with_fallbacks(
                                location, basename(str(location)),
                                scan_dir, args)
### Datalad has a Python API! One particularly nice aspect of datalad is that it has a Python API, which means that anything you can do with datalad on the command line can also be done in Python. See the details of the datalad [Python API](http://docs.datalad.org/en/latest/modref.html). For example, suppose you would like to clone a data repository, such as the Sherlock dataset. You can run `dl.clone(source=url, path=location)`. Make sure you set `sherlock_path` to the location where you would like the Sherlock repository installed. import os import glob import datalad.api as dl import pandas as pd sherlock_path = '/Users/lukechang/Downloads/Sherlock' dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=sherlock_path) We can now create a dataset instance using `dl.Dataset(path_to_data)`. ds = dl.Dataset(sherlock_path) How much of the dataset have we downloaded? We can check the status of the annex using `ds.status(annex='all')`. results = ds.status(annex='all') Looks like it's empty, which makes sense since we only cloned the dataset. Now we need to get some data. Let's start with something small to play with first. Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep. file_list = glob.glob(os.path.join(sherlock_path, 'fmriprep', '*', 'func', '*tsv')) file_list.sort()
%matplotlib inline ### Data This tutorial will be using the **Sherlock** dataset and will require downloading the cropped and denoised **hdf5** files, the annotations file `Sherlock_Segments_1000_NN_2017.xlsx`, and the preprocessed video text file `video_text.npy`. You will want to change `datadir` to wherever you have installed the Sherlock datalad repository (e.g. `~/data`). We will initialize a datalad dataset instance and get the files we need for this tutorial. If you've already downloaded everything, this cell should execute quickly. See the [Download Data Tutorial](http://naturalistic-data.org/features/notebooks/Download_Data.html) for more information about how to install and use datalad. datadir = '/Volumes/Engram/Data/Sherlock' # If dataset hasn't been installed, clone from GIN repository if not os.path.exists(datadir): dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=datadir) # Initialize dataset ds = dl.Dataset(datadir) # Get Cropped & Denoised HDF5 Files result = ds.get(lsdir(os.path.join(datadir, 'fmriprep', '*', 'func', '*crop*hdf5'))) # Get Annotation File result = ds.get(os.path.join(datadir, 'onsets', 'Sherlock_Segments_1000_NN_2017.xlsx')) # Get Preprocessed Video Text result = ds.get(os.path.join(datadir, 'stimuli', 'video_text.npy')) ## ROI responses while viewing Sherlock Following the [functional alignment tutorial](http://naturalistic-data.org/features/notebooks/Functional_Alignment.html), we'll select out voxels in early visual cortex from the *Sherlock* dataset. We'll also examine primary auditory cortex and motor cortex responses. Then we'll apply the HyperTools pipeline to the dataset and visualize the responses within each ROI as a 3D image. Note you could also work with the Average ROI csv files as we did with the Dynamic Correlation tutorial. Here, we will load the full dataset and manually extract ROIs. mask = Brain_Data('https://neurovault.org/media/images/8423/k50_2mm.nii.gz')
data_dir_paranoia = '/Volumes/Engram/Data/Paranoia/' data_dir_sherlock = '/Volumes/Engram/Data/Sherlock/' paranoia_audio = os.path.join(data_dir_paranoia, 'stimuli', 'stimuli_story1_audio.wav') sherlock_video = os.path.join(data_dir_sherlock, 'stimuli','stimuli_Sherlock.m4v') # If datasets haven't been installed, clone from GIN repository if not os.path.exists(data_dir_paranoia): dl.clone(source='https://gin.g-node.org/ljchang/Paranoia', path=data_dir_paranoia) if not os.path.exists(data_dir_sherlock): dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=data_dir_sherlock) # Initialize dataset ds_paranoia = dl.Dataset(data_dir_paranoia) ds_sherlock = dl.Dataset(data_dir_sherlock) # Get Paranoia story result = ds_paranoia.get(paranoia_audio) # Get Sherlock video result = ds_sherlock.get(sherlock_video) ## Getting Started The best way to see what *pliers* can offer is to jump right into an example. ### Example 1: Audio RMS
plt.rc('xtick', labelsize=smallsize); plt.rc('ytick', labelsize=smallsize); plt.rc('legend', fontsize=mediumsize) plt.rc('figure', titlesize=largesize); plt.rc('axes', labelsize=mediumsize); plt.rc('axes', titlesize=mediumsize) ### Data This tutorial will be using the **Sherlock** dataset and will require downloading the cropped & denoised **.nii.gz** files. The tutorial will be mostly working with spatial patterns within the angular gyrus, so if you would like to get started with the tutorial right away without waiting for all of the nifti files to load, you can download the masked data as a `.npy` file from [figshare: Sherlock data for OHBM](https://figshare.com/articles/Sherlock_data_for_OHBM/12436955). You will want to change `data_dir` to wherever you have installed the Sherlock datalad repository. We will initialize a datalad dataset instance and get the files we need for this tutorial. If you've already downloaded everything, you can skip this cell. See the [Download Data Tutorial](http://naturalistic-data.org/features/notebooks/Download_Data.html) for more information about how to install and use datalad. data_dir = '/Volumes/Emily_MyPassport2TB/Sherlock/' # If dataset hasn't been installed, clone from GIN repository if not os.path.exists(data_dir): dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=data_dir) # Initialize dataset ds = dl.Dataset(data_dir) # Get Denoised nifti Files result = ds.get(glob.glob(os.path.join(data_dir, 'fmriprep', '*', 'func', f'*denoise*nii.gz'))) ### 0. Load Angular Gyrus data From the angular gyrus (area PG from (Eickhoff et al., 2005)), we'll load movie data from all subjects, and recall data from one subject. Subjects were watching the first hour of [A Study in Pink](https://en.wikipedia.org/wiki/A_Study_in_Pink) (here we are loading only the first half of this data), and then freely recalled the narrative. Please refer to [Chen et al. (2017)](https://doi.org/10.1038/nn.4450) to learn more about this dataset. 
We can load this data from the nii files by applying an angular gyrus mask, which we then cache into a numpy file to speed up loading in the future. If you'd like to skip this nii-loading step (which can be slow), you can download the npy files from [figshare: Sherlock data for OHBM](https://figshare.com/articles/Sherlock_data_for_OHBM/12436955). mask = Brain_Data('https://neurovault.org/media/images/8423/AG_mask.nii.gz').to_nifti() if (not os.path.exists(data_dir + 'Sherlock_AG_movie.npy') or not os.path.exists(data_dir + 'Sherlock_AG_recall.npy')): movie = []
import os
import glob
import datalad.api as dl
import pandas as pd

# Local directory that the Localizer dataset is cloned into.
localizer_path = '/Users/lukechang/Dropbox/Dartbrains/data/Localizer'

# Clone the dataset from its GIN repository into `localizer_path`.
dl.clone(source='https://gin.g-node.org/ljchang/Localizer', path=localizer_path)


# We can now create a dataset instance using `dl.Dataset(path_to_data)`.

# In[6]:


ds = dl.Dataset(localizer_path)


# How much of the dataset have we downloaded? We can check the status of the annex using `ds.status(annex='all')`.

# In[12]:


results = ds.status(annex='all')


# Looks like it's empty, which makes sense since we only cloned the dataset.
#
# Now we need to get some data. Let's start with something small to play with first.
#
# Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep.
def prepare_remote(self):
    """Prepare dataset sibling on remote.

    The local dataset must be on a branch (not a detached HEAD).  The root
    directory is created on the session if it does not exist, and then the
    work depends on the kind of session/resource:

    * SSH session: if the working directory does not exist yet, a sibling
      named after the resource is created at it; otherwise the existing
      sibling is reused.  The dataset is then published to that sibling and,
      if there are inputs, ``datalad get`` is run for them in the working
      directory, falling back to publishing the input paths when that
      command fails.
    * ``"shell"`` resource: the dataset is installed into the working
      directory if needed, ``HEAD`` is pushed to the ``<job_refname>-base``
      ref, the target is checked out, and inputs are fetched.

    Finally, the meta directory is created on the session if it is missing.

    Raises
    ------
    OrchestratorError
        If HEAD is detached, if a remote named after the resource already
        exists although the working directory does not, if publishing fails
        with ``IncompleteResultsError``, or if the resource type is not
        supported.
    """
    if not self.ds.repo.get_active_branch():
        # publish() fails when HEAD is detached.
        raise OrchestratorError(
            "You must be on a branch to use the {} orchestrator".format(
                self.name))
    if not self.session.exists(self.root_directory):
        self.session.mkdir(self.root_directory, parents=True)

    resource = self.resource
    session = self.session

    # Materialize the inputs once; they are used for the truthiness check
    # and passed on to get/publish below.
    inputs = list(self.get_inputs())
    if isinstance(session, SSHSession):
        if resource.key_filename:
            dl_version = external_versions["datalad"]
            if dl_version < "0.11.3":
                # Connecting will probably fail because `key_filename` is
                # set, but we have no way to tell DataLad about it.
                lgr.warning(
                    "DataLad version %s detected. "
                    "0.11.3 or greater is required to use an "
                    "identity file not specified in ~/.ssh/config",
                    dl_version)
            # Make the identity file available to 'datalad sshrun' even if
            # it is not configured in .ssh/config. This is particularly
            # important for AWS keys.
            os.environ["DATALAD_SSH_IDENTITYFILE"] = resource.key_filename
            from datalad import cfg
            # Reload so the environment variable set above is picked up.
            cfg.reload(force=True)

        sshurl = _format_ssh_url(
            resource.user,
            # AWS resource does not have host attribute.
            getattr(resource, "host", None) or session.connection.host,
            getattr(resource, "port", None),
            self.working_directory)

        # TODO: Add one level deeper with reckless clone per job to deal
        # with concurrent jobs?
        if not session.exists(self.working_directory):
            remotes = self.ds.repo.get_remotes()
            if resource.name in remotes:
                raise OrchestratorError(
                    "Remote '{}' unexpectedly exists. "
                    "Either delete remote or rename resource.".format(
                        resource.name))

            self.ds.create_sibling(sshurl, name=resource.name,
                                   recursive=True)
            since = None  # Avoid since="" for non-existing repo.
        else:
            remote_branch = "{}/{}".format(
                resource.name, self.ds.repo.get_active_branch())
            if self.ds.repo.commit_exists(remote_branch):
                since = ""
            else:
                # If the remote branch doesn't exist yet, publish will fail
                # with since="".
                since = None

        from datalad.support.exceptions import IncompleteResultsError
        try:
            self.ds.publish(to=resource.name, since=since, recursive=True)
        except IncompleteResultsError:
            raise OrchestratorError(
                "'datalad publish' failed. Try running "
                "'datalad update -s {} --merge --recursive' first".format(
                    resource.name))

        self._fix_up_dataset()

        if inputs:
            lgr.info("Making inputs available")
            try:
                # TODO: Whether we try this `get` should be configurable.
                self._execute_in_wdir("datalad get {}".format(
                    # FIXME: This should use something like
                    # execute_command_batch.
                    " ".join(map(shlex_quote, inputs))))
            except OrchestratorError:
                # Should use --since for existing repo, but it doesn't seem
                # to sync wrt content.
                self.ds.publish(to=resource.name, path=inputs,
                                recursive=True)
    elif resource.type == "shell":
        import datalad.api as dl
        if not session.exists(self.working_directory):
            dl.install(self.working_directory, source=self.ds.path)

        # Quote the path with shlex_quote (as done for the inputs in the SSH
        # branch) rather than wrapping it in literal single quotes, so that
        # a working directory containing a quote character cannot break the
        # command.
        self.session.execute_command("git push {} HEAD:{}-base".format(
            shlex_quote(self.working_directory), self.job_refname))
        self._checkout_target()

        if inputs:
            installed_ds = dl.Dataset(self.working_directory)
            installed_ds.get(inputs)
    else:
        # TODO: Handle more types?
        raise OrchestratorError("Unsupported resource type {}".format(
            resource.type))

    if not session.exists(self.meta_directory):
        session.mkdir(self.meta_directory, parents=True)
#!/usr/bin/env python3
# Register the files listed by the HCP "7T_Movies" resource with the DataLad
# dataset that contains this script.  Nothing is saved (save=False).

import os.path as op
import sys
import xml.dom.minidom

import datalad.api as dl
from datalad.support.network import download_url

# The dataset root is the parent of the directory holding this script.
script_dir = op.dirname(op.realpath(__file__))
ds = dl.Dataset(op.dirname(script_dir))

# Initialize the 'datalad' remote if the repository does not have it yet.
known_remotes = ds.repo.get_remotes()
if 'datalad' not in known_remotes:
    from datalad.customremotes.base import init_datalad_remote
    init_datalad_remote(ds.repo, 'datalad', autoenable=True)

# doc = xml.dom.minidom.parse('/tmp/outi-7T.xml')
topurl = 'https://db.humanconnectome.org/data/archive/projects/HCP_Resources/resources/7T_Movies/'
listing = download_url(topurl)
doc = xml.dom.minidom.parseString(listing)

# Build one record per <cat:entry> element, keeping only these attributes.
entry_attributes = ('ID', 'URI', 'digest', 'name')
files = []
for entry in doc.getElementsByTagName("cat:entry"):
    files.append(
        {attr: entry.getAttribute(attr) for attr in entry_attributes})
# from pprint import pprint
# pprint(files)

# '{URI}' is a placeholder filled in by addurls from each record; it is used
# both in the download URL and as the file name.
url_format = topurl + 'files/{URI}'
added = list(
    ds.addurls(files, url_format, '{URI}', fast=False, save=False))
print(f"Processed {len(added)} entries")