def load_yaml_config(config_filename, aws_input_creds):
    """Load a YAML configuration file from local disk or an S3 bucket.

    :param config_filename: Path to the YAML file; an "s3://bucket/key"
        path is downloaded into /scratch before parsing.
    :param aws_input_creds: Optional filepath to the AWS credentials used
        to access the S3 bucket.
    :return: The parsed configuration data.
    :raises IOError: If the credentials file or the config file is missing.
    """
    if config_filename.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        bucket_name = config_filename.split('/')[2]
        s3_prefix = '/'.join(config_filename.split('/')[:3])
        prefix = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds:
            if not os.path.isfile(aws_input_creds):
                raise IOError("Could not find aws_input_creds (%s)" %
                              (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)
        local_config = '/scratch/' + os.path.basename(config_filename)
        bucket.download_file(prefix, local_config)
        config_filename = local_config

    config_filename = os.path.realpath(config_filename)

    # Bug fix: a missing file previously fell off the end and silently
    # returned None; raise instead, matching the sibling variants of this
    # function elsewhere in the file.
    if not os.path.isfile(config_filename):
        raise IOError("Could not find config file (%s)" % (config_filename))

    with open(config_filename, 'r') as infd:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input and requires a Loader under PyYAML >= 6;
        # consider yaml.safe_load.
        config_data = yaml.load(infd)

    return config_data
def download_outputs(path_prefix, creds_path, bucket_name, qap_type,
                     download_to):
    """Download the QAP CSV result files of one type from an S3 bucket.

    :param path_prefix: S3 key prefix to search under.
    :param creds_path: Filepath to the AWS credentials file.
    :param bucket_name: Name of the S3 bucket.
    :param qap_type: One of "anat_spatial", "func_spatial" or
        "func_temporal".
    :param download_to: Local directory to download the matching files to.
    :raises ValueError: If qap_type is not one of the recognized values.
    """
    # Import packages
    from indi_aws import fetch_creds
    from indi_aws.aws_utils import s3_download

    # Map the short QAP type name onto the substring used in the S3 keys.
    # Bug fix: an unrecognized qap_type previously left `search_for`
    # undefined and crashed later with a NameError.
    type_map = {"anat_spatial": "anatomical_spatial",
                "func_spatial": "functional_spatial",
                "func_temporal": "functional_temporal"}
    try:
        search_for = type_map[qap_type]
    except KeyError:
        raise ValueError("Unrecognized qap_type: %s" % qap_type)

    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Collect every CSV key under the prefix that matches the QAP type
    src_list = []
    for k in bucket.list(prefix=path_prefix):
        k_name = str(k.name)
        if (search_for in k_name) and (".csv" in k_name):
            src_list.append(k_name)

    s3_download(bucket, src_list, download_to)
def load_yaml_config(config_filename, aws_input_creds):
    """Read a YAML configuration file, fetching it from S3 when needed.

    "s3://bucket/key" paths are downloaded into /tmp before parsing;
    local paths are read directly. Raises after logging if the config
    file cannot be opened.
    """
    if config_filename.lower().startswith("s3://"):
        # Split "s3://<bucket>/<key>" into its bucket name and object key
        parts = config_filename.split('/')
        bucket_name = parts[2]
        s3_prefix = '/'.join(parts[:3])
        prefix = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds and not os.path.isfile(aws_input_creds):
            raise IOError("Could not find aws_input_creds (%s)" %
                          (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)
        downloaded_config = '/tmp/' + os.path.basename(config_filename)
        bucket.download_file(prefix, downloaded_config)
        config_filename = downloaded_config

    config_filename = os.path.realpath(config_filename)

    try:
        with open(config_filename, 'r') as f:
            return yaml.load(f)
    except IOError:
        print("Error! Could not find config file {0}".format(config_filename))
        raise
def load_yaml_config(config_filename, aws_input_creds):
    """Parse a YAML config, first pulling it from S3 into /scratch when an
    s3:// path is given.

    Returns the parsed data, or None when the resolved file does not
    exist (original behavior preserved).
    """
    if config_filename.lower().startswith("s3://"):
        # s3 paths look like s3://<bucket>/<key>
        tokens = config_filename.split('/')
        bucket_name = tokens[2]
        s3_prefix = '/'.join(tokens[:3])
        key = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds:
            if not os.path.isfile(aws_input_creds):
                raise IOError("Could not find aws_input_creds (%s)" %
                              (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)
        local_copy = '/scratch/' + os.path.basename(config_filename)
        bucket.download_file(key, local_copy)
        config_filename = local_copy

    config_filename = os.path.realpath(config_filename)

    if os.path.isfile(config_filename):
        with open(config_filename, 'r') as handle:
            return yaml.load(handle)
def pull_NIFTI_file_list_from_s3(s3_directory, s3_creds):
    """Gather the s3:// paths of all NIfTI files under an S3 directory.

    :param s3_directory: "s3://bucket/prefix" directory to search.
    :param s3_creds: Filepath to the AWS credentials file.
    :return: List of "s3://bucket/key" paths whose key contains ".nii".
    :raises Exception: If indi_aws is missing or no files are found.
    """
    import os

    try:
        from indi_aws import fetch_creds
    except ImportError:
        # Bug fix: narrowed from a bare `except`, which also swallowed
        # unrelated failures (e.g. KeyboardInterrupt) as "package missing".
        err = "\n\n[!] You need the INDI AWS package installed in order to " \
              "pull from an S3 bucket. Try 'pip install indi_aws'\n\n"
        raise Exception(err)

    s3_list = []
    s3_path = s3_directory.replace("s3://", "")
    bucket_name = s3_path.split("/")[0]
    bucket_prefix = s3_path.split(bucket_name + "/")[1]

    bucket = fetch_creds.return_bucket(s3_creds, bucket_name)

    # Build S3-subjects to download
    # maintain the "s3://<bucket_name>" prefix!!
    print("Gathering file paths from {0}\n".format(s3_directory))
    for bk in bucket.objects.filter(Prefix=bucket_prefix):
        if ".nii" in str(bk.key):
            s3_list.append(os.path.join("s3://", bucket_name, str(bk.key)))

    if len(s3_list) == 0:
        err = "\n\n[!] No filepaths were found given the S3 path provided!" \
              "\n\n"
        raise Exception(err)

    return s3_list
def list_files(path, s3_creds_path=None):
    """List the files directly under a path, locally or on S3.

    :param path: A local directory or an "s3://bucket/prefix" path.
    :param s3_creds_path: Optional AWS credentials filepath for S3 access.
    :return: A list of file paths ("s3://..." paths for S3 input).
    """
    if path.startswith('s3://'):
        pieces = path[5:].split('/')
        bucket_name, path = pieces[0], '/'.join(pieces[1:])
        bucket = fetch_creds.return_bucket(s3_creds_path, bucket_name)
        # Bug fixes: interpolate the bucket *name* (not the Bucket object)
        # into the returned path, and read the boto3 ObjectSummary's .key
        # attribute (objects.filter() does not yield dicts) -- matching
        # how every other function in this file reads keys.
        return [
            's3://%s/%s' % (bucket_name, obj.key)
            for obj in bucket.objects.filter(Prefix=path)
        ]
    else:
        return list(glob.glob(path + '/*'))
def gather_nifti_file_paths(dataset_folder, creds_path=None):
    """Collect the paths of every NIfTI (.nii / .nii.gz) file in a dataset.

    :param dataset_folder: A local directory, or an S3 location of the
        form s3://s3.amazonaws.com/<bucket>/<prefix>.
    :param creds_path: Optional AWS credentials filepath for S3 access.
    :return: A list of file paths (S3 results keep the full s3 prefix).
    :raises Exception: If no NIfTI files are found.
    """
    import os

    s3_prefix = "s3://s3.amazonaws.com"
    file_path_list = []

    # paths that include s3:// are assumed to live in AWS Simple Storage Service
    if "s3://" in dataset_folder:
        try:
            from indi_aws import fetch_creds
        except Exception as e:
            # NOTE(review): e.message and the bare print statements are
            # Python 2 only
            print "Error ({0:s}): Could not import indi_aws package".format(
                e.message)
            raise (e)

        try:
            # After stripping the host prefix the string is
            # "/<bucket>/<prefix>", so split('/') yields ['', bucket, ...]
            s3_path_vals = (dataset_folder.replace(s3_prefix, "")).split('/')
            bucket_name = s3_path_vals[1]
            data_path = "/".join(s3_path_vals[2:])
        except Exception as e:
            print "Error ({0:s}): There is a problem with s3 path {1:s}".format(
                e.message, dataset_folder)
            raise (e)

        print "Extracting NIfTI paths from s3 bucket {0:s}::{1:s})".format(
            bucket_name, data_path)

        bucket = fetch_creds.return_bucket(creds_path, bucket_name)

        # Build S3-subjects to download
        for bk in bucket.objects.filter(Prefix=data_path):
            if str(bk.key).endswith(".nii") or str(bk.key).endswith(".nii.gz"):
                file_path_list.append(
                    os.path.join(s3_prefix, bucket_name, str(bk.key)))
    else:
        print "Extracting NIfTI paths from local filesystem"
        for root, folders, files in os.walk(os.path.abspath(dataset_folder)):
            for filename in files:
                if filename.endswith('.nii') or filename.endswith('.nii.gz'):
                    file_path_list.append(os.path.join(root, filename))

    if not file_path_list:
        raise Exception("Did not find any nifti files in %s"
                        % (dataset_folder))

    return (file_path_list)
def download_cpac_resources_from_s3(local_base): ''' Function to download the CPAC testing resources directory from S3 Parameters ---------- local_base : string the local directory to save the 'cpac_resources' contents ''' # Import packages import os from indi_aws import aws_utils, fetch_creds # Init variables bucket_name = default_bucket_name() resource_folder = 'cpac_resources' s3_prefix = os.path.join('data/test_resources', resource_folder) # Get bucket object bucket = fetch_creds.return_bucket(None, bucket_name) # Gather files from bucket for obj in bucket.objects.filter(Prefix=s3_prefix): bkey = obj.key # If the object is just a folder, move on to next object if bkey.endswith('/'): continue # Form local path from key local_path = os.path.join(local_base, bkey.split(resource_folder)[-1].lstrip('/')) # Make download directories local_dir = os.path.dirname(local_path) if not os.path.exists(local_dir): os.makedirs(local_dir) # Download file if it doesn't exist if not os.path.exists(local_path): bucket.download_file(bkey, local_path, Callback=aws_utils.ProgressPercentage(obj)) # Print done print 'CPAC resources folder in %s is complete!' % local_base
def download_single_s3_path(s3_path, cfg_dict):
    """Download a single file from an AWS s3 bucket.

    :type s3_path: str
    :param s3_path: An "s3://" pre-pended path to a file stored on an
                    Amazon AWS s3 bucket.
    :type cfg_dict: dictionary
    :param cfg_dict: A dictionary containing the pipeline setup parameters.
    :rtype: str
    :return: The local filepath of the downloaded s3 file.
    """
    # Import packages
    import os
    from indi_aws import fetch_creds, aws_utils
    from qap_utils import raise_smart_exception

    # Init variables
    working_dir = cfg_dict["working_directory"]
    try:
        creds_path = cfg_dict["creds_path"]
    except KeyError:
        creds_path = None

    if "s3://" in s3_path:
        s3_prefix = s3_path.replace("s3://", "")
    else:
        err = "[!] S3 filepaths must be pre-pended with the 's3://' prefix."
        raise_smart_exception(locals(), err)

    bucket_name = s3_prefix.split("/")[0]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    data_dir = s3_path.split(bucket_name + "/")[1]
    local_dl = os.path.join(working_dir, data_dir)

    if os.path.isfile(local_dl):
        # Fix: converted Python 2 print statements to print() calls so the
        # block is valid under Python 3 (other blocks in this file already
        # require Python 3); single-argument print() behaves identically
        # under Python 2.
        print("\nS3 bucket file already downloaded! Skipping download.")
        print("S3 file: %s" % s3_path)
        print("Local file already exists: %s\n" % local_dl)
    else:
        aws_utils.s3_download(bucket, ([data_dir], [local_dl]))

    return local_dl
def download_single_s3_path(s3_path, cfg_dict):
    """Download a single file from an AWS s3 bucket.

    NOTE(review): byte-for-byte duplicate of the other
    download_single_s3_path definition in this file -- consider removing
    one.

    :type s3_path: str
    :param s3_path: An "s3://" pre-pended path to a file stored on an
        Amazon AWS s3 bucket.
    :type cfg_dict: dictionary
    :param cfg_dict: A dictionary containing the pipeline setup parameters.
    :rtype: str
    :return: The local filepath of the downloaded s3 file.
    """
    # Import packages
    import os
    from indi_aws import fetch_creds, aws_utils
    from qap_utils import raise_smart_exception

    # Init variables
    working_dir = cfg_dict["working_directory"]
    try:
        creds_path = cfg_dict["creds_path"]
    except KeyError:
        # Credentials are optional; fall back to anonymous/default access
        creds_path = None

    if "s3://" in s3_path:
        s3_prefix = s3_path.replace("s3://","")
    else:
        err = "[!] S3 filepaths must be pre-pended with the 's3://' prefix."
        raise_smart_exception(locals(),err)

    bucket_name = s3_prefix.split("/")[0]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    data_dir = s3_path.split(bucket_name + "/")[1]
    local_dl = os.path.join(working_dir, data_dir)

    if os.path.isfile(local_dl):
        # Python 2 print statements
        print "\nS3 bucket file already downloaded! Skipping download."
        print "S3 file: %s" % s3_path
        print "Local file already exists: %s\n" % local_dl
    else:
        aws_utils.s3_download(bucket, ([data_dir], [local_dl]))

    return local_dl
def load_yaml_config(config_filename, aws_input_creds):
    """Load a YAML config from a data URI, an S3 path, or a local file.

    :param config_filename: A "data:...,<base64>" URI, an "s3://bucket/key"
        path (downloaded into /tmp first), or a local filepath.
    :param aws_input_creds: Optional AWS credentials filepath for S3 access.
    :return: The parsed configuration data.
    :raises IOError: If the credentials or config file cannot be found.
    """
    if config_filename.lower().startswith('data:'):
        try:
            header, encoded = config_filename.split(",", 1)
            config_content = b64decode(encoded)
            config_data = yaml.load(config_content)
            return config_data
        except Exception:
            # Bug fix: narrowed from a bare `except` so SystemExit /
            # KeyboardInterrupt propagate untouched; also fixed the
            # garbled error message ("Could not find load config").
            print("Error! Could not load config from data URI")
            raise

    if config_filename.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        bucket_name = config_filename.split('/')[2]
        s3_prefix = '/'.join(config_filename.split('/')[:3])
        prefix = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds:
            if not os.path.isfile(aws_input_creds):
                raise IOError("Could not find aws_input_creds (%s)" %
                              (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)
        downloaded_config = '/tmp/' + os.path.basename(config_filename)
        bucket.download_file(prefix, downloaded_config)
        config_filename = downloaded_config

    config_filename = os.path.realpath(config_filename)

    try:
        with open(config_filename, 'r') as f:
            config_data = yaml.load(f)
            return config_data
    except IOError:
        print("Error! Could not find config file {0}".format(config_filename))
        raise
def download_from_s3(s3_path, local_path, s3_creds):
    """Download one file from S3 into a local directory, if not present.

    :param s3_path: "s3://bucket/key" path of the file to fetch.
    :param local_path: Local directory to place the downloaded file in.
    :param s3_creds: Filepath to the AWS credentials file.
    :return: The local filepath of the (possibly pre-existing) file.
    :raises Exception: If the indi_aws package is not installed.
    """
    import os

    try:
        from indi_aws import fetch_creds, aws_utils
    except ImportError:
        # Bug fix: narrowed from a bare `except`; only a missing package
        # should produce the installation hint.
        err = "\n\n[!] You need the INDI AWS package installed in order to " \
              "pull from an S3 bucket. Try 'pip install indi_aws'\n\n"
        raise Exception(err)

    s3_path = s3_path.replace("s3://", "")
    bucket_name = s3_path.split("/")[0]
    bucket_prefix = s3_path.split(bucket_name + "/")[1]

    filename = s3_path.split("/")[-1]
    local_file = os.path.join(local_path, filename)

    # Skip the download entirely when the file is already on disk
    if not os.path.exists(local_file):
        bucket = fetch_creds.return_bucket(s3_creds, bucket_name)
        aws_utils.s3_download(bucket, ([bucket_prefix], [local_file]))

    return local_file
def pull_s3_sublist(data_folder, creds_path=None):
    """Create a list of filepaths stored on the Amazon S3 bucket.

    :type data_folder: str
    :param data_folder: The full S3 (s3://) path to the directory holding
                        the data.
    :type creds_path: str
    :param creds_path: The filepath to your Amazon AWS keys.
    :rtype: list
    :return: A list of Amazon S3 filepaths from the bucket and bucket
             directory you provided, relative to that directory.
    """
    import os
    from indi_aws import fetch_creds

    if creds_path:
        creds_path = os.path.abspath(creds_path)

    s3_path = data_folder.split("s3://")[1]
    bucket_name = s3_path.split("/")[0]
    bucket_prefix = s3_path.split(bucket_name + "/")[1]

    s3_list = []
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # ensure slash at end of bucket_prefix, so that if the final
    # directory name is a substring in other directory names, these
    # other directories will not be pulled into the file list
    # (bug fix: the old `"/" not in bucket_prefix[-1]` check raised an
    # IndexError when the prefix was empty)
    if not bucket_prefix.endswith("/"):
        bucket_prefix += "/"

    # Build S3-subjects to download
    for bk in bucket.objects.filter(Prefix=bucket_prefix):
        s3_list.append(str(bk.key).replace(bucket_prefix, ""))

    return s3_list
def upl_qap_output(cfg_file): """Upload a pipeline output file to an AWS S3 bucket. :type cfg_file: str :param cfg_file: Filepath to the pipeline configuration file containing S3 bucket and AWS credentials information. """ # Import packages from indi_aws import aws_utils, fetch_creds import os import yaml # Load config file with open(cfg_file["pipeline_config_yaml"], 'r') as f: cfg_dict = yaml.load(f) # Init variables bucket_name = cfg_dict["bucket_name"] bucket_out_prefix = cfg_dict["bucket_prefix"] creds_path = cfg_dict["creds_path"] bucket = fetch_creds.return_bucket(creds_path, bucket_name) output_dir = cfg_dict['output_directory'] # And upload data upl_files = [] for root, dirs, files in os.walk(output_dir): if files: upl_files.extend([os.path.join(root, fil) for fil in files]) # Using INDI AWS utils s3_upl_files = [ufile.replace(output_dir, bucket_out_prefix) \ for ufile in upl_files] aws_utils.s3_upload(bucket, (upl_files, s3_upl_files))
def upl_qap_output(cfg_file):
    """Upload a pipeline output file to an AWS S3 bucket.

    :type cfg_file: str
    :param cfg_file: Filepath to the pipeline configuration file containing
                     S3 bucket and AWS credentials information.
    """
    # Import packages
    from indi_aws import aws_utils, fetch_creds
    import os
    import yaml

    # Read the pipeline configuration
    with open(cfg_file["pipeline_config_yaml"], 'r') as cfg_handle:
        settings = yaml.load(cfg_handle)

    # Pull out the S3 and credential settings
    target_bucket = settings["bucket_name"]
    destination_prefix = settings["bucket_prefix"]
    credentials = settings["creds_path"]
    bucket = fetch_creds.return_bucket(credentials, target_bucket)
    results_dir = settings['output_directory']

    # Walk the output tree and collect every file to upload
    local_files = []
    for folder, _subfolders, filenames in os.walk(results_dir):
        local_files.extend(os.path.join(folder, name) for name in filenames)

    # Map each local path onto its S3 destination and upload the batch
    remote_files = [path.replace(results_dir, destination_prefix)
                    for path in local_files]
    aws_utils.s3_upload(bucket, (local_files, remote_files))
def write_yaml_config(config_filename, body, aws_output_creds):
    """Write a YAML document to a local path, or to S3 plus a /scratch copy.

    For an "s3://bucket/key" target the body is uploaded to the bucket and
    also written to /scratch/<basename>; the returned path is always the
    local file that was written.

    :param config_filename: Destination path ("s3://..." or local).
    :param body: The YAML text to write.
    :param aws_output_creds: Optional AWS credentials filepath for S3.
    :return: The local filepath that was written.
    """
    if config_filename.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        pieces = config_filename.split('/')
        bucket_name = pieces[2]
        s3_prefix = '/'.join(pieces[:3])
        s3_key = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_output_creds and not os.path.isfile(aws_output_creds):
            raise IOError("Could not find aws_output_creds (%s)" %
                          (aws_output_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_output_creds, bucket_name)
        bucket.put_object(Body=body, Key=s3_key)

        # Keep a local copy alongside the other pipeline scratch files
        config_filename = '/scratch/' + os.path.basename(config_filename)

    with open(config_filename, 'w') as out_handle:
        out_handle.writelines(body)

    return config_filename
def write_yaml_config(config_filename, body, aws_output_creds):
    """Write YAML text to a local file, or to S3 plus a /scratch copy.

    NOTE(review): near-duplicate of the other write_yaml_config
    definition in this file -- consider removing one.

    :param config_filename: Destination path ("s3://..." or local).
    :param body: The YAML text to write.
    :param aws_output_creds: Optional AWS credentials filepath for S3.
    :return: The local filepath that was written.
    """
    if config_filename.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        bucket_name = config_filename.split('/')[2]
        s3_prefix = '/'.join(config_filename.split('/')[:3])
        s3_key = config_filename.replace(s3_prefix, '').lstrip('/')

        if aws_output_creds:
            if not os.path.isfile(aws_output_creds):
                raise IOError("Could not find aws_output_creds (%s)" %
                              (aws_output_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_output_creds, bucket_name)
        bucket.put_object(Body=body, Key=s3_key)

        # For S3 targets a local copy is still written under /scratch
        config_filename = '/scratch/'+os.path.basename(config_filename)

    with open(config_filename, 'w') as ofd:
        ofd.writelines(body)

    return(config_filename)
import os

if __name__ == "__main__":
    # Script configuration: source bucket, credentials, and the ADHD200
    # BIDS site directories to mirror from S3 into out_prefix locally.
    s3_bucket = "fcp-indi"
    s3_creds = "/Users/cameron.craddock/AWS/ccraddock-fcp-indi-keys2.csv"
    s3_prefix = "data/Projects/ADHD200/RawDataBIDS"
    s3_sitedirs = ["Brown","KKI","NeuroIMAGE","NYU","OHSU","Peking_1",
                   "Peking_2","Peking_3","Pittsburgh","WashU"]
    out_prefix = "data/ADHD200/RawDataBIDS"
    max_subjs = 4  # NOTE(review): never used in the visible code

    if s3_creds:
        if not os.path.isfile(s3_creds):
            # NOTE(review): message typo -- "filed" should read "find"
            raise IOError("Could not filed aws_input_creds (%s)" % (s3_creds))

    from indi_aws import fetch_creds
    bucket = fetch_creds.return_bucket(s3_creds,s3_bucket)

    for site in s3_sitedirs:
        subjects=[]  # NOTE(review): never populated in the visible code
        prefix=os.path.join(s3_prefix,site)
        # Python 2 print statements throughout this script
        print "gathering files from S3 bucket (%s) for %s" % (bucket, prefix)
        for s3_obj in bucket.objects.filter(Prefix=prefix):
            # Keep only anatomical (T1w) and functional (bold) objects
            if 'T1w' in str(s3_obj.key) or 'bold' in str(s3_obj.key):
                fname = os.path.basename(str(s3_obj.key))
                # Only site-level files (no "sub-" in the basename)
                if "sub-" not in fname:
                    # Mirror the S3 directory layout locally
                    if not os.path.exists(os.path.dirname(s3_obj.key).replace(s3_prefix,out_prefix)):
                        print "making the directory"
                        os.makedirs(os.path.dirname(s3_obj.key).replace(s3_prefix,out_prefix))
                    # NOTE(review): only announces the download; no actual
                    # download call follows in the visible code
                    print "downloading %s to %s"%(str(s3_obj.key),str(s3_obj.key).replace(s3_prefix,out_prefix))
def main():
    '''
    This function runs the main routine

    Builds, per preprocessing strategy and derivative, the S3 key of the
    example subject's output, then maps every subject's source key onto a
    flattened destination key. Returns (out_dict, err_dict, mapping_dict).
    '''

    # Import packages
    from indi_aws import fetch_creds
    import os
    import yaml

    # Init variables
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    # NOTE(review): every other return_bucket call in this file passes
    # (creds_path, bucket_name); the argument order here looks swapped --
    # confirm against fetch_creds.return_bucket's signature.
    bucket = fetch_creds.return_bucket('fcp-indi', creds_path)
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    sub_fp = '/home/ubuntu/abide/preprocessing/yamls/subs_list.yml'
    sub_list = yaml.load(open(sub_fp, 'r'))
    example_subid = '0050002_session_1'

    # Populate list of files to link to
    #src_list = []
    #src_list = gather_files_tosort(src_list, bucket, bucket_prefix)

    # Derivatives dictionary {name: (no_files_per_strategy, filt_str)}
    strat_dict = {'nofilt_noglobal': ['pipeline_abide_rerun', 'global0'],
                  'nofilt_global': ['pipeline_abide_rerun', 'global1'],
                  'filt_noglobal': ['pipeline_abide_rerun__freq-filter',
                                    'global0'],
                  'filt_global': ['pipeline_abide_rerun__freq-filter',
                                  'global1']}
    # {derivative: (expected_file_count, output_folder, name_filter)}
    derivs_dict = {'alff': (1, 'alff_to_standard_smooth', 'nii.gz'),
                   'degree_binarize': (1, 'centrality_outputs_smoothed',
                                       'degree_centrality_binarize'),
                   'degree_weighted': (1, 'centrality_outputs_smoothed',
                                       'degree_centrality_weighted'),
                   'dual_regression': (1,
                                       'dr_tempreg_maps_zstat_stack_to_standard_smooth',
                                       'nii.gz'),
                   'eigenvector_binarize': (1, 'centrality_outputs_smoothed',
                                            'eigenvector_centrality_binarize'),
                   'eigenvector_weighted': (1, 'centrality_outputs_smoothed',
                                            'eigenvector_centrality_weighted'),
                   'falff': (1, 'falff_to_standard_smooth', 'nii.gz'),
                   'func_mask': (1, 'functional_brain_mask_to_standard',
                                 'nii.gz'),
                   'func_mean': (1, 'mean_functional_in_mni', 'nii.gz'),
                   'func_preproc': (1, 'functional_mni', '.nii.gz'),
                   'lfcd': (1, 'centrality_outputs_smoothed',
                            'lfcd_binarize'),
                   'reho': (1, 'reho_to_standard_smooth', 'nii.gz'),
                   'rois_aal': (4, 'roi_timeseries', 'aal'),
                   'rois_cc200': (4, 'roi_timeseries', 'CC200'),
                   'rois_cc400': (4, 'roi_timeseries', 'CC400'),
                   'rois_dosenbach160': (4, 'roi_timeseries', 'rois_3mm'),
                   'rois_ez': (4, 'roi_timeseries', 'ez'),
                   'rois_ho': (4, 'roi_timeseries', 'ho_'),
                   'rois_tt': (4, 'roi_timeseries', 'tt'),
                   'vmhc': (1, 'vmhc_fisher_zstd_zstat_map', 'nii.gz')}

    # Create error and output dictionaries
    out_dict = {k: {kk: [] for kk in derivs_dict.keys()}
                for k in strat_dict.keys()}
    err_dict = {k: {kk: [] for kk in derivs_dict.keys()}
                for k in strat_dict.keys()}

    # Iterate through strategies
    for strat, filts in strat_dict.items():
        print('building %s...' % strat)
        filt = filts[0]
        g_sig = filts[1]
        strat_prefix = os.path.join(bucket_prefix, filt, example_subid)
        # Iterate through derivatives
        for deriv, v in derivs_dict.items():
            num_files = v[0]
            deriv_folder = v[1]
            name_filter = v[2]
            deriv_prefix = os.path.join(strat_prefix, deriv_folder)
            keys_list = []
            # NOTE(review): bucket.list(prefix=...) is the boto2 API; the
            # rest of this file uses boto3's bucket.objects.filter(Prefix=...)
            for key in bucket.list(prefix=deriv_prefix):
                k_name = str(key.name)
                # If global signal regression was used or didnt need to be
                if (g_sig in k_name or 'global' not in k_name) and \
                        name_filter in k_name:
                    keys_list.append(k_name)
            # Grab only wanted results from keys
            if len(keys_list) == num_files:
                out_dict[strat][deriv] = [k for k in keys_list
                                          if '.nii.gz' in k or '.1D' in k][0]
            else:
                err_dict[strat][deriv] = keys_list
                print('error in number of files!')

    # Go through dictionary and build paths
    mapping_dict = {}
    s = 1
    # For each subject
    for sub in sub_list:
        subid = sub.split('_')[-1] + '_session_1'
        print('populating %s...%d' % (subid, s))
        # For each strategy
        for strat, deriv_dict in out_dict.items():
            strat_prefix = os.path.join(bucket_prefix, strat)
            # For each derivative, generate src and dst filepaths
            d = 0
            for deriv, filepath in deriv_dict.items():
                deriv_prefix = os.path.join(strat_prefix, deriv,
                                            sub + '_' + deriv)
                # Check extensions
                if filepath.endswith('.nii.gz'):
                    dst_path = deriv_prefix + '.nii.gz'
                elif filepath.endswith('.1D'):
                    dst_path = deriv_prefix + '.1D'
                else:
                    raise Exception('Bad extension type')
                # Get sub id from filepath
                src_path = filepath.replace(example_subid, subid)
                mapping_dict[src_path] = dst_path
                d += 1
            if d != 20:
                print(d)
                # NOTE(review): raw_input is Python 2 only (input() in 3)
                raw_input('not enough dervivs')
        s += 1

    # Return
    return out_dict, err_dict, mapping_dict
# for l in infd.readlines(): # file_paths.append(l.rstrip()) bucket_name = args.bids_dir.split('/')[2] s3_prefix = '/'.join(args.bids_dir.split('/')[:3]) prefix = args.bids_dir.replace(s3_prefix, '').lstrip('/') creds_path = "" if args.aws_input_creds: if not os.path.isfile(args.aws_input_creds): raise IOError("Could not filed aws_input_creds (%s)" % (args.aws_input_creds)) creds_path = args.aws_input_creds from indi_aws import fetch_creds bucket = fetch_creds.return_bucket(creds_path, bucket_name) print "Gathering data from S3 bucket, this may take a while" obj_count = 0 if args.participant_label: for pt in args.participant_label: pt = pt.lstrip("sub-") t_prefix = "%/sub-%s" % (prefix, pt) for s3_obj in bucket.objects.filter(Prefix=t_prefix): obj_count+=1 if obj_count % 1000 == 0: print "%dk"%(obj_count//1000) file_paths.append(os.path.join(s3_prefix, str(s3_obj.key))) else:
def check_for_s3(file_path, creds_path, dl_dir=None, img_type='anat'):
    """Ensure a file is available locally, downloading from S3 if needed.

    :param file_path: Local filepath or "s3://bucket/key" path.
    :param creds_path: AWS credentials filepath used for the S3 download.
    :param dl_dir: Directory to download into (defaults to the CWD).
    :param img_type: 'anat' (3D expected) or 'func' (4D expected); used to
        validate NIfTI dimensionality.
    :return: The local filepath of the (possibly downloaded) file.
    """
    # Import packages
    import os
    import nibabel as nib
    import botocore.exceptions
    from indi_aws import fetch_creds

    # Init variables
    s3_str = 's3://'
    if dl_dir is None:
        dl_dir = os.getcwd()

    # Explicitly lower-case the "s3"
    if file_path.lower().startswith(s3_str):
        file_path_sp = file_path.split('/')
        file_path_sp[0] = file_path_sp[0].lower()
        file_path = '/'.join(file_path_sp)

    # Check for s3 string in filepaths
    if file_path.startswith(s3_str):
        # Get bucket name and bucket object
        bucket_name = file_path.replace(s3_str, '').split('/')[0]
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)
        # Extract relative key path from bucket and local path
        s3_prefix = os.path.join(s3_str, bucket_name)
        s3_key = file_path.replace(s3_prefix, '').lstrip('/')
        local_path = os.path.join(dl_dir, os.path.basename(s3_key))

        # Get local directory and create folders if they dont exist
        local_dir = os.path.dirname(local_path)
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # Download file
        try:
            bucket.download_file(Key=s3_key, Filename=local_path)
        except botocore.exceptions.ClientError as exc:
            error_code = int(exc.response['Error']['Code'])
            if error_code == 403:
                err_msg = 'Access to bucket: "%s" is denied; using credentials '\
                          'in subject list: "%s"; cannot access the file "%s"'\
                          % (bucket_name, creds_path, file_path)
                raise Exception(err_msg)
            elif error_code == 404:
                err_msg = 'Bucket: "%s" does not exist; check spelling and try '\
                          'again' % bucket_name
                raise Exception(err_msg)
            else:
                # NOTE(review): err_msg is built here but never raised, so
                # other ClientError codes fall through silently -- confirm
                # whether a raise is missing.
                err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                          % (bucket_name, exc)
        except Exception as exc:
            err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                      % (bucket_name, exc)
            raise Exception(err_msg)

    # Otherwise just return what was passed in
    else:
        local_path = file_path

    # Check image dimensionality
    if '.nii' in local_path:
        try:
            img_nii = nib.load(local_path)
        except Exception as e:
            # TODO: come up with a better option for handling rogue S3 files
            # TODO: that Nibabel chokes on
            print(str(e))
            return local_path
        if img_type == 'anat':
            if len(img_nii.shape) != 3:
                raise IOError('File: %s must be an anatomical image with 3 '\
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))
        elif img_type == 'func':
            if len(img_nii.shape) != 4:
                raise IOError('File: %s must be a functional image with 4 '\
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))

    # Return the local path
    return local_path
def return_s3_filepaths(base_dir, creds_path=None):
    '''
    Return every object key stored under an S3 prefix as a full s3:// path.

    Parameters
    ----------
    base_dir : string
        the "s3://bucket/..." base directory to search; any '{site}' or
        '{participant}' template markers and everything after them are
        stripped before searching
    creds_path : string (optional); default=None
        filepath to a credentials file containing the AWS credentials
        to access the S3 bucket objects

    Returns
    -------
    matched_s3_paths : list
        a list of strings of the filepaths from the S3 bucket
    '''

    # Import packages
    import logging
    import os
    from indi_aws import fetch_creds

    # # Check for errors
    # if not bids_base_dir:
    #     if not ('{site}' in path_template and '{participant}' in path_template):
    #         err_msg = 'Please provide \'{site}\' and \'{particpant}\' in '\
    #                   'filepath template where site and participant-level '\
    #                   'directories are present'
    #         raise Exception(err_msg)

    # Trim "Custom" (non-BIDS) template markers down to the static prefix
    if '{site}' in base_dir:
        base_dir = base_dir.split('{site}')[0]
    elif '{participant}' in base_dir:
        base_dir = base_dir.split('{participant}')[0]

    # Split the s3 path into the bucket name and the search prefix
    path_pieces = base_dir.split('/')
    bucket_name = path_pieces[2]
    s3_prefix = '/'.join(path_pieces[:3])
    prefix = base_dir.replace(s3_prefix, '').lstrip('/')

    # Get logger
    logger = logging.getLogger('sublist_builder')

    # Attempt to get bucket
    try:
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    except Exception as exc:
        err_msg = 'There was an error in retrieving S3 bucket: %s.\n' \
                  'Error: %s' % (bucket_name, exc)
        logger.error(err_msg)
        raise Exception(err_msg)

    # Gather every key under the prefix, re-attaching "s3://bucket/"
    logger.info('Gathering files from S3 to parse...')
    s3_filepaths = [os.path.join(s3_prefix, str(s3_obj.key))
                    for s3_obj in bucket.objects.filter(Prefix=prefix)]

    return s3_filepaths
def check_for_s3(file_path, creds_path, dl_dir=None, img_type="anat"):
    """Resolve a possibly-S3 filepath to a local file, downloading it if
    needed, and validate the NIfTI image dimensionality.

    :param file_path: Local filepath or "s3://bucket/key" path.
    :param creds_path: AWS credentials filepath used for the S3 download.
    :param dl_dir: Directory to download into (defaults to the CWD).
    :param img_type: "anat" (3D expected) or "func" (4D expected).
    :return: The local filepath of the (possibly downloaded) file.
    """
    # Import packages
    import os
    import nibabel as nib
    import botocore.exceptions
    from indi_aws import fetch_creds

    # Init variables
    s3_str = "s3://"
    if dl_dir is None:
        dl_dir = os.getcwd()

    # Explicitly lower-case the "s3"
    if file_path.lower().startswith(s3_str):
        file_path_sp = file_path.split("/")
        file_path_sp[0] = file_path_sp[0].lower()
        file_path = "/".join(file_path_sp)

    # Check for s3 string in filepaths
    if file_path.startswith(s3_str):
        # Get bucket name and bucket object
        bucket_name = file_path.replace(s3_str, "").split("/")[0]
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)

        # Extract relative key path from bucket and local path
        s3_prefix = os.path.join(s3_str, bucket_name)
        s3_key = file_path.replace(s3_prefix, "").lstrip("/")
        local_path = os.path.join(dl_dir, os.path.basename(s3_key))

        # Get local directory and create folders if they dont exist
        local_dir = os.path.dirname(local_path)
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # Download file
        try:
            bucket.download_file(Key=s3_key, Filename=local_path)
        except botocore.exceptions.ClientError as exc:
            error_code = int(exc.response["Error"]["Code"])
            if error_code == 403:
                err_msg = (
                    'Access to bucket: "%s" is denied; using credentials '
                    'in subject list: "%s"; cannot access the file "%s"'
                    % (bucket_name, creds_path, file_path)
                )
                raise Exception(err_msg)
            elif error_code == 404:
                err_msg = 'Bucket: "%s" does not exist; check spelling and try ' "again" % bucket_name
                raise Exception(err_msg)
            else:
                # NOTE(review): err_msg is assigned but never raised here,
                # so other ClientError codes pass silently -- confirm
                # whether a raise is missing.
                err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s' % (bucket_name, exc)
        except Exception as exc:
            err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s' % (bucket_name, exc)
            raise Exception(err_msg)

    # Otherwise just return what was passed in
    else:
        local_path = file_path

    # Check image dimensionality
    # (unlike the sibling variants, nib.load is not wrapped in try/except
    # here: a file nibabel cannot read will raise)
    img_nii = nib.load(local_path)
    if img_type == "anat":
        if len(img_nii.shape) != 3:
            raise IOError(
                "File: %s must be an anatomical image with 3 "
                "dimensions but %d dimensions found!" % (local_path, len(img_nii.shape))
            )
    elif img_type == "func":
        if len(img_nii.shape) != 4:
            raise IOError(
                "File: %s must be a functional image with 4 "
                "dimensions but %d dimensions found!" % (local_path, len(img_nii.shape))
            )

    # Return the local path
    return local_path
def collect_bids_files_configs(bids_dir, aws_input_creds=''):
    """Walk a BIDS dataset (local or on S3) collecting image paths and
    sidecar JSON configurations.

    :param bids_dir: Local directory or "s3://bucket/prefix" path to the
        BIDS dataset.
    :param aws_input_creds: Optional AWS credentials filepath for S3 access.
    :return: Tuple (file_paths, config_dict) -- NIfTI paths relative to
        bids_dir, and {relative json path: parsed json} for the sidecars.
    :raises IOError: When neither files nor configs are found.
    """
    file_paths = []
    config_dict = {}

    # BIDS entity suffixes we care about
    suffixes = ['T1w', 'bold', '_epi', 'phasediff', 'magnitude',
                'magnitude1', 'magnitude2']

    if bids_dir.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        bucket_name = bids_dir.split('/')[2]
        s3_prefix = '/'.join(bids_dir.split('/')[:3])
        prefix = bids_dir.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds:
            if not os.path.isfile(aws_input_creds):
                raise IOError("Could not find aws_input_creds (%s)" %
                              (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)

        print(f"gathering files from S3 bucket ({bucket}) for {prefix}")

        for s3_obj in bucket.objects.filter(Prefix=prefix):
            for suf in suffixes:
                if suf in str(s3_obj.key):
                    if str(s3_obj.key).endswith("json"):
                        try:
                            config_dict[s3_obj.key.replace(prefix, "").lstrip('/')] \
                                = json.loads(s3_obj.get()["Body"].read())
                        except Exception as e:
                            # Bug fix: Exception.message does not exist in
                            # Python 3 (this function already requires 3.6+
                            # for its f-string); report the exception itself.
                            print("Error retrieving %s (%s)" %
                                  (s3_obj.key.replace(prefix, ""), e))
                            raise
                    elif 'nii' in str(s3_obj.key):
                        file_paths.append(str(s3_obj.key)
                                          .replace(prefix, '').lstrip('/'))
    else:
        for root, dirs, files in os.walk(bids_dir, topdown=False):
            if files:
                for f in files:
                    for suf in suffixes:
                        if 'nii' in f and suf in f:
                            file_paths += [os.path.join(root, f)
                                           .replace(bids_dir, '').lstrip('/')]
                        if f.endswith('json') and suf in f:
                            try:
                                config_dict.update(
                                    {os.path.join(root.replace(bids_dir, '').lstrip('/'), f):
                                     json.load(open(os.path.join(root, f), 'r'))})
                            except UnicodeDecodeError:
                                raise Exception("Could not decode {0}".format(
                                    os.path.join(root, f)))

    if not file_paths and not config_dict:
        raise IOError("Didn't find any files in {0}. Please verify that the "
                      "path is typed correctly, that you have read access to "
                      "the directory, and that it is not "
                      "empty.".format(bids_dir))

    return file_paths, config_dict
def check_for_s3(file_path, creds_path=None, dl_dir=None, img_type='other'): # Import packages import os import nibabel as nib import botocore.exceptions from indi_aws import fetch_creds # Init variables s3_str = 's3://' if creds_path: if "None" in creds_path or "none" in creds_path or \ "null" in creds_path: creds_path = None if dl_dir is None: dl_dir = os.getcwd() if file_path is None: # in case it's something like scan parameters or field map files, but # we don't have any local_path = file_path return local_path # TODO: remove this once scan parameter input as dictionary is phased out if isinstance(file_path, dict): # if this is a dictionary, just skip altogether local_path = file_path return local_path # Explicitly lower-case the "s3" if file_path.lower().startswith(s3_str): file_path = s3_str + file_path[len(s3_str):] # Get bucket name and bucket object bucket_name = file_path[len(s3_str):].split('/')[0] bucket = fetch_creds.return_bucket(creds_path, bucket_name) # Extract relative key path from bucket and local path s3_prefix = s3_str + bucket_name s3_key = file_path[len(s3_prefix) + 1:] local_path = os.path.join(dl_dir, bucket_name, s3_key) # Get local directory and create folders if they dont exist local_dir = os.path.dirname(local_path) if not os.path.exists(local_dir): os.makedirs(local_dir) # Download file try: print("Attempting to download from AWS S3: {0}".format(file_path)) bucket.download_file(Key=s3_key, Filename=local_path) except botocore.exceptions.ClientError as exc: error_code = int(exc.response['Error']['Code']) if error_code == 403: err_msg = 'Access to bucket: "%s" is denied; using credentials '\ 'in subject list: "%s"; cannot access the file "%s"'\ % (bucket_name, creds_path, file_path) raise Exception(err_msg) elif error_code == 404: err_msg = 'File: {0} does not exist; check spelling and try '\ 'again'.format(os.path.join(bucket_name, s3_key)) raise Exception(err_msg) else: err_msg = 'Unable to connect to bucket: "%s". 
Error message:\n%s'\ % (bucket_name, exc) except Exception as exc: err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\ % (bucket_name, exc) raise Exception(err_msg) # Otherwise just return what was passed in else: local_path = file_path # Check image dimensionality if local_path.endswith('.nii') or local_path.endswith('.nii.gz'): try: img_nii = nib.load(local_path) except Exception as e: # TODO: come up with a better option for handling rogue S3 files # TODO: that Nibabel chokes on print(str(e)) return local_path if img_type == 'anat': if len(img_nii.shape) != 3: raise IOError('File: %s must be an anatomical image with 3 '\ 'dimensions but %d dimensions found!' % (local_path, len(img_nii.shape))) elif img_type == 'func': if len(img_nii.shape) != 4: raise IOError('File: %s must be a functional image with 4 '\ 'dimensions but %d dimensions found!' % (local_path, len(img_nii.shape))) # Return the local path return local_path
def testConfig(self, event):
    '''
    This function runs when the user clicks the "Test Configuration"
    button in the pipeline configuration window.  It prompts the user
    for a sample subject list (i.e. one that they will be using with
    the config they are building). Then it builds the pipeline but does
    not run it. It then reports whether or not the config will run or
    not depending on if the pipeline gets built successfully.
    '''

    # Import packages
    import os
    import yaml
    from CPAC.utils import Configuration
    from CPAC.pipeline.cpac_pipeline import prep_workflow
    from CPAC.pipeline.cpac_runner import build_strategies

    # Helper: show an error dialog and (optionally) highlight the
    # offending control in pink so the user can locate it
    def display(win, msg, changeBg=True):
        wx.MessageBox(msg, "Error")
        if changeBg:
            win.SetBackgroundColour("pink")
        win.SetFocus()
        win.Refresh()

    # Collect a sample subject list and parse it in
    testDlg0 = wx.MessageDialog(
        self, 'This tool will run a quick check on the current pipeline '
              'configuration. Click OK to provide a subject list you '
              'will be using with this setup.',
        'Subject List',
        wx.OK | wx.ICON_INFORMATION)
    testDlg0.ShowModal()
    testDlg0.Destroy()

    dlg = wx.FileDialog(
        self, message="Choose the CPAC Subject list file",
        defaultDir=os.getcwd(),
        defaultFile="CPAC_subject_list.yml",
        wildcard="YAML files(*.yaml, *.yml)|*.yaml;*.yml",
        style=wx.OPEN | wx.CHANGE_DIR)

    if dlg.ShowModal() == wx.ID_OK:
        subListPath = dlg.GetPath()

        # Load and test the subject list
        # NOTE(review): yaml.load is called without an explicit Loader;
        # the subject list is a user-chosen file — confirm this is the
        # intended trust model
        print 'Checking subject list: %s...' % subListPath
        sublist = yaml.load(open(os.path.realpath(subListPath), 'r'))
        sub_flg = self.test_sublist(sublist)
        if not sub_flg:
            raise Exception
        print 'Subject list looks good!'

        # Following code reads in the parameters and selections from the
        # pipeline configuration window and populate the config_list
        config_list = []
        wf_counter = []

        for page in self.nb.get_page_list():
            switch = page.page.get_switch()
            ctrl_list = page.page.get_ctrl_list()
            validate = False

            # only validate a page's controls if its on/off switch is on
            if switch:
                switch_val = str(switch.get_selection()).lower()
                if switch_val == 'on' or switch_val == 'true' or \
                        switch_val == '1':
                    validate = True
                    wf_counter.append(page.get_counter())

            for ctrl in ctrl_list:
                # option_name will be the selection name as it is written
                # as the dictionary key of the config.yml dictionary
                option_name = ctrl.get_name()

                #validating
                if (switch == None or validate) and ctrl.get_validation() \
                    and (option_name != 'derivativeList') and \
                        (option_name != 'modelConfigs'):

                    win = ctrl.get_ctrl()

                    if isinstance(ctrl.get_selection(), list):
                        value = ctrl.get_selection()
                        if not value:
                            display(
                                win, "%s field is empty or the items are " \
                                     "not checked!" % ctrl.get_name(), False)
                            return
                    elif (option_name == "tsa_roi_paths") or \
                            (option_name == "sca_roi_paths"):
                        # fires if the control is the checkbox grid for
                        # multiple paths assigned to multiple options
                        # (i.e. timeseries analysis)
                        config_list.append(ctrl)
                        continue
                    else:
                        value = str(ctrl.get_selection())

                    if len(value) == 0:
                        display(win, "%s field is empty!" % ctrl.get_name())
                        return

                    # values containing '/' are treated as paths unless
                    # they contain an environment-style '$' placeholder
                    if '/' in value and '$' not in value and not \
                            isinstance(value, list):
                        if not os.path.exists(ctrl.get_selection()) and \
                                value != 'On/Off':
                            display(
                                win, "%s field contains incorrect path. " \
                                     "Please update the path!"
                                % ctrl.get_name())
                            return

                config_list.append(ctrl)

        # Write out a pipeline_config file, read it in and then delete it
        # (Will revise the data structure of the config files later so this
        # can just pass the data structure instead of doing it this way)
        try:
            test_cfg_yml = '/tmp/test_config.yml'
            self.write(test_cfg_yml, config_list)
            c = Configuration(yaml.load(open(os.path.realpath(test_cfg_yml), 'r')))
            os.remove(test_cfg_yml)
        except:
            # NOTE(review): bare except hides the original error, and 'c'
            # is left undefined below if this branch fires — confirm this
            # path cannot continue past the dialog
            errDlg2 = wx.MessageDialog(
                self, 'A problem occurred with preparing the pipeline test run. \n\n' \
                      'Please ensure you have rights access to the directories you' \
                      ' have chosen for the CPAC working, crash, and output folders.',
                'Test Configuration Error',
                wx.OK | wx.ICON_ERROR)
            errDlg2.ShowModal()
            errDlg2.Destroy()

        # Build nuisance regression strategies only when nuisance correction
        # is on or regressors are configured
        if (1 in c.runNuisance) or (c.Regressors != None):
            strategies = sorted(build_strategies(c))
        else:
            strategies = None

        # Run the actual pipeline building prep and see if it works or not
        testDlg1 = wx.MessageDialog(
            self, 'Click OK to run the test. This should take only a few seconds.',
            'Running Test',
            wx.OK | wx.ICON_INFORMATION)
        testDlg1.ShowModal()

        # Check file paths first

        # Just getting proper names of config file parameters
        # NOTE(review): 'p' is presumably pkg_resources imported at module
        # level — confirm against the file header
        try:
            params_file = open(p.resource_filename('CPAC', 'GUI/resources/config_parameters.txt'), "r")
        except:
            print "Error: Could not open configuration parameter file.", "\n"
            raise Exception

        paramInfo = params_file.read().split('\n')
        paramList = []
        for param in paramInfo:
            if param != '':
                paramList.append(param.split(','))

        # function for file path checking
        # 'switch' is the pipeline on/off list gating whether the file is
        # actually needed; any read failure pops an error dialog naming the
        # parameter and its GUI tab (looked up from paramList)
        def testFile(filepath, paramName, switch):
            try:
                if (1 in switch) and (filepath != None):
                    fileTest = open(filepath)
                    fileTest.close()
            except:
                testDlg1.Destroy()
                for param in paramList:
                    if param[0] == paramName:
                        paramTitle = param[1]
                        paramGroup = param[2]
                        break
                errDlgFileTest = wx.MessageDialog(
                    self, 'Error reading file - either it does not exist or '\
                          'you do not have read access. \n\n' \
                          'Parameter: %s \n' \
                          'In tab: %s \n\n' \
                          'Path: %s' % (paramTitle, paramGroup, filepath),
                    'Pipeline Not Ready',
                    wx.OK | wx.ICON_ERROR)
                errDlgFileTest.ShowModal()
                errDlgFileTest.Destroy()

        # Check S3 output bucket access if writing to S3
        output_dir = c.outputDirectory
        s3_str = 's3://'
        if output_dir.lower().startswith(s3_str):
            # normalize the scheme to lower-case before the check below
            output_dir_sp = output_dir.split('/')
            output_dir_sp[0] = output_dir_sp[0].lower()
            output_dir = '/'.join(output_dir_sp)

        if type(output_dir) is str and output_dir.lower().startswith(s3_str):
            from indi_aws import fetch_creds
            creds_path = c.awsOutputBucketCredentials
            bucket_name = output_dir.split(s3_str)[1].split('/')[0]
            try:
                bucket = fetch_creds.return_bucket(creds_path, bucket_name)
                print 'Connection with output bucket "%s" successful!' % bucket_name
            except Exception as exc:
                err_msg = 'Unable to access output S3 bucket: "%s" with '\
                          'credentials in: "%s". Check bucket name '\
                          'and credentials file and try again'\
                          % (bucket_name, creds_path)
                testDlg1.Destroy()
                errDlg1 = wx.MessageDialog(self, err_msg,
                                           'Pipeline Not Ready',
                                           wx.OK | wx.ICON_ERROR)
                errDlg1.ShowModal()
                errDlg1.Destroy()
                return

        # Verify each template/prior file referenced by the configuration,
        # gated on the switch that enables the step which uses it
        testFile(c.template_brain_only_for_anat, \
                 'template_brain_only_for_anat',[1])
        testFile(c.template_skull_for_anat,'template_skull_for_anat',[1])
        testFile(c.PRIORS_WHITE,'PRIORS_WHITE',c.runSegmentationPreprocessing)
        testFile(c.PRIORS_GRAY,'PRIORS_GRAY',c.runSegmentationPreprocessing)
        testFile(c.PRIORS_CSF,'PRIORS_CSF',c.runSegmentationPreprocessing)
        testFile(c.template_brain_only_for_func, \
                 'template_brain_only_for_func',c.runRegisterFuncToMNI)
        testFile(c.template_skull_for_func,'template_skull_for_func', \
                 c.runRegisterFuncToMNI)
        testFile(c.identityMatrix,'identityMatrix',c.runRegisterFuncToMNI)
        testFile(c.boundaryBasedRegistrationSchedule, \
                 'boundaryBasedRegistrationSchedule', \
                 c.runRegisterFuncToAnat)
        testFile(c.lateral_ventricles_mask,'lateral_ventricles_mask', \
                 c.runNuisance)
        testFile(c.template_symmetric_brain_only, \
                 'template_symmetric_brain_only',c.runVMHC)
        testFile(c.template_symmetric_skull,'template_symmetric_skull', \
                 c.runVMHC)
        testFile(c.dilated_symmetric_brain_mask, \
                 'dilated_symmetric_brain_mask',c.runVMHC)
        testFile(c.configFileTwomm,'configFileTwomm',c.runVMHC)
        testFile(c.templateSpecificationFile,'templateSpecificationFile', \
                 c.runNetworkCentrality)

        # ROI path grids store {path: options} dicts as the first element
        if c.tsa_roi_paths and type(c.tsa_roi_paths[0]) == dict:
            for roi_path in c.tsa_roi_paths[0].keys():
                testFile(roi_path, "tsa_roi_paths", c.runROITimeseries)

        if c.sca_roi_paths and type(c.sca_roi_paths[0]) == dict:
            for roi_path in c.sca_roi_paths[0].keys():
                testFile(roi_path, "sca_roi_paths", c.runSCA)

        try:
            # Run the pipeline building
            prep_workflow(sublist[0], c, strategies, 0)
        except Exception as xxx:
            print xxx
            print "an exception occurred"
            testDlg1.Destroy()
            errDlg1 = wx.MessageDialog(
                self, 'There are issues with the current configuration ' \
                      'which need to be resolved - please check to make ' \
                      'sure the options you are running have the proper ' \
                      'pre-requisites selected.\n\nIssue Info:\n%s' \
                      % str(xxx),
                'Pipeline Not Ready',
                wx.OK | wx.ICON_ERROR)
            errDlg1.ShowModal()
            errDlg1.Destroy()
        else:
            testDlg1.Destroy()
            okDlg1 = wx.MessageDialog(
                self, 'The current configuration will run successfully. You '\
                      'can safely save and run this setup!',
                'Pipeline Ready',
                wx.OK | wx.ICON_INFORMATION)
            okDlg1.ShowModal()
            okDlg1.Destroy()
def return_bids_template(base_dir, scan_type, creds_path=None):
    '''
    Function that returns the path template of the desired scan type
    from a BIDS dataset

    Parameters
    ----------
    base_dir : string
        base directory of the BIDS dataset
    scan_type : string
        type of scan; e.g. 'anat', 'func', etc.
    creds_path : string (optional); default=None
        filepath to a set of AWS credentials to access a BIDS dataset
        stored on S3 that isn't public

    Returns
    -------
    file_template : string
        regular expression-compatible file template indicating data
        path organization
    '''

    # Import packages
    import os

    # Init variables
    s3_str = 's3://'
    file_path = None

    # If base directory is in S3
    if base_dir.startswith(s3_str):
        # Lazy import: local datasets should not require indi_aws
        from indi_aws import fetch_creds

        bucket_name = base_dir.split('/')[2]
        s3_prefix = '/'.join(base_dir.split('/')[:3])

        # Extract base prefix to search through in S3
        prefix = base_dir.split('*')[0].replace(s3_prefix, '').lstrip('/')

        # Attempt to get bucket
        try:
            bucket = fetch_creds.return_bucket(creds_path, bucket_name)
        except Exception as exc:
            err_msg = 'There was an error in retrieving S3 bucket: %s.\nError: %s'\
                      % (bucket_name, exc)
            raise Exception(err_msg)

        # Get filepaths from S3 with prefix
        # BUGFIX: Python 2 print statement converted to a function call —
        # a print statement is a SyntaxError under Python 3, which the
        # rest of this file targets
        print('Gathering files from S3 to parse...')
        for s3_obj in bucket.objects.filter(Prefix=prefix):
            file_path = s3_obj.key
            scan_dir = file_path.split('/')[-2]
            if scan_dir == scan_type:
                break

    # Else, the base directory is locally stored
    else:
        for root, dirs, files in os.walk(base_dir):
            if file_path:
                break
            for fil in files:
                file_path = os.path.join(root, fil)
                scan_dir = file_path.split('/')[-2]
                if fil.endswith('.nii.gz') and scan_dir == scan_type:
                    break
                else:
                    file_path = None

    # Now replace file_path intermediate dirs with *
    if file_path:
        rel_path = file_path.replace(base_dir, '').lstrip('/')
        interm_dirs = rel_path.split('/')[:-2]
        for imd in interm_dirs:
            file_path = file_path.replace(imd, '*')
    else:
        err_msg = 'Could not find any files in directory, check files!'
        raise Exception(err_msg)

    # Set template as any file * in the matched scan directory
    file_template = os.path.join(os.path.dirname(file_path), '*.nii.gz')

    # Return file pattern template
    return file_template
def check_for_s3(file_path, creds_path, dl_dir=None, img_type='anat'):
    """Resolve ``file_path`` to a local file, downloading it from S3 first
    when it is an ``s3://`` URI.

    Parameters
    ----------
    file_path : str, dict, or None
        local path, ``s3://bucket/key`` URI, scan-parameter dict (passed
        through unchanged), or None (passed through unchanged)
    creds_path : str or None
        path to AWS credentials used for the S3 bucket lookup
    dl_dir : str, optional
        download directory; defaults to the current working directory
    img_type : str, optional
        'anat' or 'func' to validate NIfTI dimensionality; 'other'
        skips the check

    Returns
    -------
    local_path : str, dict, or None
        local filesystem path of the (possibly downloaded) file, or the
        dict/None input unchanged

    Raises
    ------
    Exception
        on S3 access/download errors
    IOError
        when the image dimensionality does not match ``img_type``
    """
    # Import packages
    import os

    # Init variables
    s3_str = 's3://'
    if dl_dir is None:
        dl_dir = os.getcwd()

    if file_path is None:
        # in case it's something like scan parameters or field map files, but
        # we don't have any
        local_path = file_path
        return local_path

    # TODO: remove this once scan parameter input as dictionary is phased out
    if isinstance(file_path, dict):
        # if this is a dictionary, just skip altogether
        local_path = file_path
        return local_path

    # Explicitly lower-case the "s3"
    if file_path.lower().startswith(s3_str):
        file_path_sp = file_path.split('/')
        file_path_sp[0] = file_path_sp[0].lower()
        file_path = '/'.join(file_path_sp)

    # Check for s3 string in filepaths
    if file_path.startswith(s3_str):
        # Lazy imports: AWS machinery only needed for s3:// paths
        import botocore.exceptions
        from indi_aws import fetch_creds

        # Get bucket name and bucket object
        bucket_name = file_path.replace(s3_str, '').split('/')[0]
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)

        # Extract relative key path from bucket and local path
        s3_prefix = os.path.join(s3_str, bucket_name)
        s3_key = file_path.replace(s3_prefix, '').lstrip('/')
        local_path = os.path.join(dl_dir, s3_key)

        # Get local directory and create folders if they dont exist
        local_dir = os.path.dirname(local_path)
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # Download file
        try:
            print("Attempting to download from AWS S3: {0}".format(file_path))
            bucket.download_file(Key=s3_key, Filename=local_path)
        except botocore.exceptions.ClientError as exc:
            error_code = int(exc.response['Error']['Code'])
            if error_code == 403:
                err_msg = 'Access to bucket: "%s" is denied; using credentials '\
                          'in subject list: "%s"; cannot access the file "%s"'\
                          % (bucket_name, creds_path, file_path)
                raise Exception(err_msg)
            elif error_code == 404:
                err_msg = 'Bucket: "%s" does not exist; check spelling and try '\
                          'again' % bucket_name
                raise Exception(err_msg)
            else:
                err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                          % (bucket_name, exc)
                # BUGFIX: this branch previously built err_msg but never
                # raised, letting a failed download fall through and return
                # a path to a file that was never written
                raise Exception(err_msg)
        except Exception as exc:
            err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                      % (bucket_name, exc)
            raise Exception(err_msg)

    # Otherwise just return what was passed in
    else:
        local_path = file_path

    # Check image dimensionality
    if '.nii' in local_path:
        # nibabel only needed when validating NIfTI files
        import nibabel as nib
        try:
            img_nii = nib.load(local_path)
        except Exception as e:
            # TODO: come up with a better option for handling rogue S3 files
            # TODO: that Nibabel chokes on
            print(str(e))
            return local_path
        if img_type == 'anat':
            if len(img_nii.shape) != 3:
                raise IOError('File: %s must be an anatomical image with 3 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))
        elif img_type == 'func':
            if len(img_nii.shape) != 4:
                raise IOError('File: %s must be a functional image with 4 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))
        elif img_type == "other":
            pass

    # Return the local path
    return local_path
def return_s3_filepaths(path_template, creds_path=None, bids_flag=False):
    '''
    Function to return the filepaths from an S3 bucket given a file
    pattern template and, optionally, credentials

    Parameters
    ----------
    path_template : string
        filepath template in the form of:
        's3://bucket_name/base_dir/{site}/{participant}/{session}/..
        ../file.nii.gz'; if bids_flag is set, path_template is just the
        base directory of the BIDS data set
    creds_path : string (optional); default=None
        filepath to a credentials file containing the AWS credentials
        to access the S3 bucket objects
    bids_flag : boolean (optional); default=False
        flag to indicate if the dataset to gather is organized to the
        BIDS standard

    Returns
    -------
    matched_s3_paths : list
        a list of strings of the filepaths from the S3 bucket

    Raises
    ------
    Exception
        when the template is missing the required placeholders, or when
        the S3 bucket cannot be retrieved
    '''

    # Import packages
    import fnmatch
    import logging
    import os

    # Check for errors
    if not bids_flag:
        if not ('{site}' in path_template and '{participant}' in path_template):
            # BUGFIX: message previously misspelled '{particpant}'
            err_msg = 'Please provide \'{site}\' and \'{participant}\' in '\
                      'filepath template where site and participant-level '\
                      'directories are present'
            raise Exception(err_msg)

    # Init variables
    bucket_name = path_template.split('/')[2]
    s3_prefix = '/'.join(path_template.split('/')[:3])

    # Get logger
    logger = logging.getLogger('sublist_builder')

    # Extract base prefix to search through in S3
    if bids_flag:
        prefix = path_template.split('*')[0].replace(s3_prefix, '').lstrip('/')
    else:
        prefix = path_template.split('{site}')[0].replace(s3_prefix, '').lstrip('/')

    # Attempt to get bucket (lazy import keeps indi_aws optional until
    # S3 access is actually attempted)
    try:
        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    except Exception as exc:
        err_msg = 'There was an error in retrieving S3 bucket: %s.\nError: %s'\
                  % (bucket_name, exc)
        logger.error(err_msg)
        raise Exception(err_msg)

    # Get filepaths from S3 with prefix
    logger.info('Gathering files from S3 to parse...')
    s3_filepaths = []
    for s3_obj in bucket.objects.filter(Prefix=prefix):
        s3_filepaths.append(str(s3_obj.key))

    # Prepend 's3://bucket_name/' on found paths
    s3_filepaths = [os.path.join(s3_prefix, s3_fp) for s3_fp in s3_filepaths]

    # File pattern filter
    if bids_flag:
        file_pattern = path_template
    else:
        file_pattern = path_template.replace('{site}', '*').\
            replace('{participant}', '*').replace('{session}', '*')

    # Get only matching s3 paths
    s3_filepaths = fnmatch.filter(s3_filepaths, file_pattern)

    # Restrict filepaths and pattern to be of same directory depth
    # as fnmatch will expand /*/ recursively to .../*/*/...
    matched_s3_paths = []
    for s3fp in s3_filepaths:
        s3_split = s3fp.split('/')
        fp_split = file_pattern.split('/')
        if len(s3_split) == len(fp_split):
            matched_s3_paths.append(s3fp)

    # Print how many found
    num_s3_files = len(matched_s3_paths)
    logger.info('Found %d files!' % num_s3_files)

    # Return the filepaths as a list
    return matched_s3_paths
def check_for_s3(file_path, creds_path=None, dl_dir=None, img_type='other',
                 verbose=False):
    """Resolve ``file_path`` to a local file, downloading it from S3 first
    when it is an ``s3://`` URI, and verify the result exists.

    Parameters
    ----------
    file_path : str, dict, or None
        local path, ``s3://bucket/key`` URI, scan-parameter dict (passed
        through unchanged), or None (passed through unchanged)
    creds_path : str, optional
        path to AWS credentials; placeholder strings containing "None",
        "none" or "null" are treated as no credentials
    dl_dir : str, optional
        download directory; defaults to the current working directory
    img_type : str, optional
        'anat' or 'func' to validate NIfTI dimensionality; any other
        value skips the check
    verbose : bool, optional
        print the resolved path when True

    Returns
    -------
    local_path : str, dict, or None
        local filesystem path of the (possibly downloaded) file, or the
        dict/None input unchanged

    Raises
    ------
    Exception
        on S3 access/download errors
    FileNotFoundError
        when the resolved path does not exist (with a dedicated hint for
        outdated Neuroparc v0 atlas paths)
    IOError
        when the image dimensionality does not match ``img_type``
    """
    # Import packages
    import os

    # Init variables
    s3_str = 's3://'
    if creds_path:
        # placeholder strings mean "no credentials"
        if "None" in creds_path or "none" in creds_path or \
                "null" in creds_path:
            creds_path = None

    if dl_dir is None:
        dl_dir = os.getcwd()

    if file_path is None:
        # in case it's something like scan parameters or field map files, but
        # we don't have any
        return None

    # TODO: remove this once scan parameter input as dictionary is phased out
    if isinstance(file_path, dict):
        # if this is a dictionary, just skip altogether
        local_path = file_path
        return local_path

    if file_path.lower().startswith(s3_str):
        # Lazy imports: AWS machinery only needed for s3:// paths, so
        # local-path resolution does not require botocore/indi_aws
        import botocore.exceptions
        from indi_aws import fetch_creds

        file_path = s3_str + file_path[len(s3_str):]

        # Get bucket name and bucket object
        bucket_name = file_path[len(s3_str):].split('/')[0]

        # Extract relative key path from bucket and local path
        s3_prefix = s3_str + bucket_name
        s3_key = file_path[len(s3_prefix) + 1:]
        local_path = os.path.join(dl_dir, bucket_name, s3_key)

        # Create the local folders if they don't exist; exist_ok avoids a
        # race between an existence check and the creation
        local_dir = os.path.dirname(local_path)
        os.makedirs(local_dir, exist_ok=True)

        if os.path.exists(local_path):
            print("{0} already exists- skipping download.".format(local_path))
        else:
            # Download file
            try:
                bucket = fetch_creds.return_bucket(creds_path, bucket_name)
                print("Attempting to download from AWS S3: {0}".format(
                    file_path))
                bucket.download_file(Key=s3_key, Filename=local_path)
            except botocore.exceptions.ClientError as exc:
                error_code = int(exc.response['Error']['Code'])
                err_msg = str(exc)
                if error_code == 403:
                    err_msg = 'Access to bucket: "%s" is denied; using credentials ' \
                              'in subject list: "%s"; cannot access the file "%s"' \
                              % (bucket_name, creds_path, file_path)
                elif error_code == 404:
                    err_msg = 'File: {0} does not exist; check spelling and try ' \
                              'again'.format(
                                  os.path.join(bucket_name, s3_key))
                else:
                    err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s' \
                              % (bucket_name, exc)
                raise Exception(err_msg)
            except Exception as exc:
                err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s' \
                          % (bucket_name, exc)
                raise Exception(err_msg)

    # Otherwise just return what was passed in
    else:
        local_path = file_path

    # Check if it exists or it is successfully downloaded
    if not os.path.exists(local_path):
        import csv

        # alert users to 2020-07-20 Neuroparc atlas update (v0 to v1)
        ndmg_atlases = {}
        with open(
            os.path.join(os.path.dirname(os.path.dirname(__file__)),
                         'resources/templates/ndmg_atlases.csv')
        ) as ndmg_atlases_file:
            ndmg_atlases['v0'], ndmg_atlases['v1'] = zip(
                *[(f'/ndmg_atlases/label/Human/{atlas[0]}',
                   f'/ndmg_atlases/label/Human/{atlas[1]}')
                  for atlas in csv.reader(ndmg_atlases_file)])
        if local_path in ndmg_atlases['v0']:
            raise FileNotFoundError(''.join([
                'Neuroparc atlas paths were updated on July 20, 2020. '
                'C-PAC configuration files using Neuroparc v0 atlas paths '
                '(including C-PAC default and preconfigured pipeline '
                'configurations from v1.6.2a and earlier) need to be '
                'updated to use Neuroparc atlases. Your current '
                'configuration includes the Neuroparc v0 path '
                f'{local_path} which needs to be updated to ',
                ndmg_atlases['v1'][ndmg_atlases['v0'].index(local_path)],
                '. For a full list such paths, see https://fcp-indi.'
                'github.io/docs/nightly/user/ndmg_atlases'
            ]))
        else:
            raise FileNotFoundError(f'File {local_path} does not exist!')

    if verbose:
        print("Downloaded file:\n{0}\n".format(local_path))

    # Check image dimensionality
    if local_path.endswith('.nii') or local_path.endswith('.nii.gz'):
        # nibabel only needed when validating NIfTI files
        import nibabel as nib
        img_nii = nib.load(local_path)
        if img_type == 'anat':
            if len(img_nii.shape) != 3:
                raise IOError('File: %s must be an anatomical image with 3 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))
        elif img_type == 'func':
            if len(img_nii.shape) != 4:
                raise IOError('File: %s must be a functional image with 4 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))

    return local_path
def run_workflow(sub_dict, c, run, pipeline_timing_info=None, p_name=None, plugin='MultiProc', plugin_args=None, test_config=False): ''' Function to prepare and, optionally, run the C-PAC workflow Parameters ---------- sub_dict : dictionary subject dictionary with anatomical and functional image paths c : Configuration object CPAC pipeline configuration dictionary object run : boolean flag to indicate whether to run the prepared workflow pipeline_timing_info : list (optional); default=None list of pipeline info for reporting timing information p_name : string (optional); default=None name of pipeline plugin : string (optional); defaule='MultiProc' nipype plugin to utilize when the workflow is ran plugin_args : dictionary (optional); default=None plugin-specific arguments for the workflow plugin Returns ------- workflow : nipype workflow the prepared nipype workflow object containing the parameters specified in the config ''' # Assure that changes on config will not affect other parts c = copy.copy(c) subject_id = sub_dict['subject_id'] if sub_dict['unique_id']: subject_id += "_" + sub_dict['unique_id'] log_dir = os.path.join(c.pipeline_setup['log_directory']['path'], f'pipeline_{c.pipeline_setup["pipeline_name"]}', subject_id) if not os.path.exists(log_dir): os.makedirs(os.path.join(log_dir)) # TODO ASH Enforce c.run_logging to be boolean # TODO ASH Schema validation config.update_config({ 'logging': { 'log_directory': log_dir, 'log_to_file': bool(getattr(c.pipeline_setup['log_directory'], 'run_logging', True)) }, 'execution': { 'crashfile_format': 'txt' } }) config.enable_resource_monitor() logging.update_logging(config) # Start timing here pipeline_start_time = time.time() # at end of workflow, take timestamp again, take time elapsed and check # tempfile add time to time data structure inside tempfile, and increment # number of subjects # Check pipeline config resources sub_mem_gb, num_cores_per_sub, num_ants_cores, num_omp_cores = check_config_resources( c) if 
not plugin: plugin = 'MultiProc' if plugin_args: plugin_args['memory_gb'] = sub_mem_gb plugin_args['n_procs'] = num_cores_per_sub else: plugin_args = {'memory_gb': sub_mem_gb, 'n_procs': num_cores_per_sub} # perhaps in future allow user to set threads maximum # this is for centrality mostly # import mkl os.environ['OMP_NUM_THREADS'] = str(num_omp_cores) os.environ['MKL_NUM_THREADS'] = '1' # str(num_cores_per_sub) os.environ['ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS'] = str(num_ants_cores) # TODO: TEMPORARY # TODO: solve the UNet model hanging issue during MultiProc if "UNet" in c.anatomical_preproc['brain_extraction']['using']: c.pipeline_setup['system_config']['max_cores_per_participant'] = 1 logger.info("\n\n[!] LOCKING CPUs PER PARTICIPANT TO 1 FOR U-NET " "MODEL.\n\nThis is a temporary measure due to a known " "issue preventing Nipype's parallelization from running " "U-Net properly.\n\n") # calculate maximum potential use of cores according to current pipeline # configuration max_core_usage = int( c.pipeline_setup['system_config']['max_cores_per_participant']) * \ int(c.pipeline_setup['system_config'][ 'num_participants_at_once']) try: creds_path = sub_dict['creds_path'] if creds_path and 'none' not in creds_path.lower(): if os.path.exists(creds_path): input_creds_path = os.path.abspath(creds_path) else: err_msg = 'Credentials path: "%s" for subject "%s" was not ' \ 'found. Check this path and try again.' 
% ( creds_path, subject_id) raise Exception(err_msg) else: input_creds_path = None except KeyError: input_creds_path = None # TODO enforce value with schema validation try: encrypt_data = bool( config.pipeline_setup['Amazon-AWS']['s3_encryption']) except: encrypt_data = False information = """ C-PAC version: {cpac_version} Setting maximum number of cores per participant to {cores} Setting number of participants at once to {participants} Setting OMP_NUM_THREADS to {omp_threads} Setting MKL_NUM_THREADS to 1 Setting ANTS/ITK thread usage to {ants_threads} Maximum potential number of cores that might be used during this run: {max_cores} """ execution_info = """ End of subject workflow {workflow} CPAC run complete: Pipeline configuration: {pipeline} Subject workflow: {workflow} Elapsed run time (minutes): {elapsed} Timing information saved in {log_dir}/cpac_individual_timing_{pipeline}.csv System time of start: {run_start} System time of completion: {run_finish} """ logger.info(information.format( cpac_version=CPAC.__version__, cores=c.pipeline_setup['system_config']['max_cores_per_participant'], participants=c.pipeline_setup['system_config'][ 'num_participants_at_once'], omp_threads=c.pipeline_setup['system_config']['num_OMP_threads'], ants_threads=c.pipeline_setup['system_config']['num_ants_threads'], max_cores=max_core_usage )) subject_info = {} subject_info['subject_id'] = subject_id subject_info['start_time'] = pipeline_start_time check_centrality_degree = c.network_centrality['run'] and \ (len(c.network_centrality['degree_centrality'][ 'weight_options']) != 0 or \ len(c.network_centrality[ 'eigenvector_centrality'][ 'weight_options']) != 0) check_centrality_lfcd = c.network_centrality['run'] and \ len(c.network_centrality[ 'local_functional_connectivity_density'][ 'weight_options']) != 0 # Check system dependencies check_ica_aroma = c.nuisance_corrections['1-ICA-AROMA']['run'] if isinstance(check_ica_aroma, list): check_ica_aroma = True in check_ica_aroma 
check_system_deps(check_ants='ANTS' in c.registration_workflows[ 'anatomical_registration']['registration']['using'], check_ica_aroma=check_ica_aroma, check_centrality_degree=check_centrality_degree, check_centrality_lfcd=check_centrality_lfcd) # absolute paths of the dirs c.pipeline_setup['working_directory']['path'] = os.path.abspath( c.pipeline_setup['working_directory']['path']) if 's3://' not in c.pipeline_setup['output_directory']['path']: c.pipeline_setup['output_directory']['path'] = os.path.abspath( c.pipeline_setup['output_directory']['path']) workflow = build_workflow( subject_id, sub_dict, c, p_name, num_ants_cores ) if test_config: logger.info('This has been a test of the pipeline configuration ' 'file, the pipeline was built successfully, but was ' 'not run') else: working_dir = os.path.join( c.pipeline_setup['working_directory']['path'], workflow.name) # if c.write_debugging_outputs: # with open(os.path.join(working_dir, 'resource_pool.pkl'), 'wb') as f: # pickle.dump(strat_list, f) # if c.pipeline_setup['working_directory']['regenerate_outputs'] is True: # erasable = list(find_files(working_dir, '*sink*')) + \ # list(find_files(working_dir, '*link*')) + \ # list(find_files(working_dir, '*log*')) # for f in erasable: # if os.path.isfile(f): # os.remove(f) # else: # shutil.rmtree(f) if hasattr(c, 'trim') and c.trim: logger.warn(""" Trimming is an experimental feature, and if used wrongly, it can lead to unreproducible results. It is useful for performance optimization, but only if used correctly. Please, make yourself aware of how it works and its assumptions: - The pipeline configuration has not changed; - The data configuration / BIDS directory has not changed; - The files from the output directory has not changed; - Your softwares versions has not changed; - Your C-PAC version has not changed; - You do not have access to the working directory. 
""") workflow, _ = the_trimmer( workflow, output_dir=c.pipeline_setup['output_directory']['path'], s3_creds_path=input_creds_path, ) pipeline_start_datetime = strftime("%Y-%m-%d %H:%M:%S") try: subject_info['resource_pool'] = [] # for strat_no, strat in enumerate(strat_list): # strat_label = 'strat_%d' % strat_no # subject_info[strat_label] = strat.get_name() # subject_info['resource_pool'].append(strat.get_resource_pool()) subject_info['status'] = 'Running' # Create callback logger cb_log_filename = os.path.join(log_dir, 'callback.log') try: if not os.path.exists(os.path.dirname(cb_log_filename)): os.makedirs(os.path.dirname(cb_log_filename)) except IOError: pass # Add handler to callback log file cb_logger = cb_logging.getLogger('callback') cb_logger.setLevel(cb_logging.DEBUG) handler = cb_logging.FileHandler(cb_log_filename) cb_logger.addHandler(handler) # Log initial information from all the nodes log_nodes_initial(workflow) # Add status callback function that writes in callback log if nipype.__version__ not in ('1.5.1'): err_msg = "This version of Nipype may not be compatible with " \ "CPAC v%s, please install Nipype version 1.5.1\n" \ % (CPAC.__version__) logger.error(err_msg) else: plugin_args['status_callback'] = log_nodes_cb if plugin_args['n_procs'] == 1: plugin = 'Linear' try: # Actually run the pipeline now, for the current subject workflow.run(plugin=plugin, plugin_args=plugin_args) except UnicodeDecodeError: raise EnvironmentError( "C-PAC migrated from Python 2 to Python 3 in v1.6.2 (see " "release notes). Your working directory contains Python 2 " "pickles, probably from an older version of C-PAC. 
If you " "want to continue to use this working directory, run\n\n" "docker run -i --rm --user $(id -u):$(id -g) " "-v /path/to/working_dir:/working " "fcpindi/c-pac:latest /bids_dir /outputs cli -- " "utils repickle /working\n" "\nor\n\n" "singularity run " "C-PAC_latest.sif /bids_dir /outputs cli -- " "utils repickle /path/to/working_dir\n\n" "before running C-PAC >=v1.6.2" ) # PyPEER kick-off # if c.PyPEER['run']: # from CPAC.pypeer.peer import prep_for_pypeer # prep_for_pypeer(c.PyPEER['eye_scan_names'], c.PyPEER['data_scan_names'], # c.PyPEER['eye_mask_path'], c.pipeline_setup['output_directory']['path'], subject_id, # pipeline_ids, c.PyPEER['stimulus_path'], c.PyPEER['minimal_nuisance_correction']['peer_gsr'], # c.PyPEER['minimal_nuisance_correction']['peer_scrub'], c.PyPEER['minimal_nuisance_correction']['scrub_thresh']) # Dump subject info pickle file to subject log dir subject_info['status'] = 'Completed' subject_info_file = os.path.join( log_dir, 'subject_info_%s.pkl' % subject_id ) with open(subject_info_file, 'wb') as info: pickle.dump(list(subject_info), info) # have this check in case the user runs cpac_runner from terminal and # the timing parameter list is not supplied as usual by the GUI if pipeline_timing_info != None: # pipeline_timing_info list: # [0] - unique pipeline ID # [1] - pipeline start time stamp (first click of 'run' from GUI) # [2] - number of subjects in subject list unique_pipeline_id = pipeline_timing_info[0] pipeline_start_stamp = pipeline_timing_info[1] num_subjects = pipeline_timing_info[2] # elapsed time data list: # [0] - elapsed time in minutes elapsed_time_data = [] elapsed_time_data.append( int(((time.time() - pipeline_start_time) / 60))) # elapsedTimeBin list: # [0] - cumulative elapsed time (minutes) across all subjects # [1] - number of times the elapsed time has been appended # (effectively a measure of how many subjects have run) # TODO # write more doc for all this # warning in .csv that some runs may be partial # code 
to delete .tmp file timing_temp_file_path = os.path.join( c.pipeline_setup['log_directory']['path'], '%s_pipeline_timing.tmp' % unique_pipeline_id) if not os.path.isfile(timing_temp_file_path): elapsedTimeBin = [] elapsedTimeBin.append(0) elapsedTimeBin.append(0) with open(timing_temp_file_path, 'wb') as handle: pickle.dump(elapsedTimeBin, handle) with open(timing_temp_file_path, 'rb') as handle: elapsedTimeBin = pickle.loads(handle.read()) elapsedTimeBin[0] = elapsedTimeBin[0] + elapsed_time_data[0] elapsedTimeBin[1] = elapsedTimeBin[1] + 1 with open(timing_temp_file_path, 'wb') as handle: pickle.dump(elapsedTimeBin, handle) # this happens once the last subject has finished running! if elapsedTimeBin[1] == num_subjects: pipelineTimeDict = {} pipelineTimeDict['Pipeline'] = c.pipeline_setup[ 'pipeline_name'] pipelineTimeDict['Cores_Per_Subject'] = \ c.pipeline_setup['system_config'][ 'max_cores_per_participant'] pipelineTimeDict['Simultaneous_Subjects'] = \ c.pipeline_setup['system_config'][ 'num_participants_at_once'] pipelineTimeDict['Number_of_Subjects'] = num_subjects pipelineTimeDict['Start_Time'] = pipeline_start_stamp pipelineTimeDict['End_Time'] = strftime( "%Y-%m-%d_%H:%M:%S") pipelineTimeDict['Elapsed_Time_(minutes)'] = \ elapsedTimeBin[0] pipelineTimeDict['Status'] = 'Complete' gpaTimeFields = [ 'Pipeline', 'Cores_Per_Subject', 'Simultaneous_Subjects', 'Number_of_Subjects', 'Start_Time', 'End_Time', 'Elapsed_Time_(minutes)', 'Status' ] timeHeader = dict(zip(gpaTimeFields, gpaTimeFields)) with open(os.path.join( c.pipeline_setup['log_directory']['path'], 'cpac_individual_timing_%s.csv' % c.pipeline_setup['pipeline_name'] ), 'a') as timeCSV, open(os.path.join( c.pipeline_setup['log_directory']['path'], 'cpac_individual_timing_%s.csv' % c.pipeline_setup['pipeline_name'] ), 'r') as readTimeCSV: timeWriter = csv.DictWriter(timeCSV, fieldnames=gpaTimeFields) timeReader = csv.DictReader(readTimeCSV) headerExists = False for line in timeReader: if 'Start_Time' in 
line: headerExists = True if headerExists == False: timeWriter.writerow(timeHeader) timeWriter.writerow(pipelineTimeDict) # remove the temp timing file now that it is no longer needed os.remove(timing_temp_file_path) # Upload logs to s3 if s3_str in output directory if c.pipeline_setup['output_directory'][ 'path'].lower().startswith('s3://'): try: # Store logs in s3 output director/logs/... s3_log_dir = os.path.join( c.pipeline_setup['output_directory']['path'], 'logs', os.path.basename(log_dir) ) bucket_name = \ c.pipeline_setup['output_directory']['path'].split('/')[2] bucket = fetch_creds.return_bucket(creds_path, bucket_name) # Collect local log files local_log_files = [] for root, _, files in os.walk(log_dir): local_log_files.extend([os.path.join(root, fil) for fil in files]) # Form destination keys s3_log_files = [loc.replace(log_dir, s3_log_dir) for loc in local_log_files] # Upload logs aws_utils.s3_upload(bucket, (local_log_files, s3_log_files), encrypt=encrypt_data) # Delete local log files for log_f in local_log_files: os.remove(log_f) except Exception as exc: err_msg = 'Unable to upload CPAC log files in: %s.\nError: %s' logger.error(err_msg, log_dir, exc) except Exception as e: import traceback; traceback.print_exc() execution_info = """ Error of subject workflow {workflow} CPAC run error: Pipeline configuration: {pipeline} Subject workflow: {workflow} Elapsed run time (minutes): {elapsed} Timing information saved in {log_dir}/cpac_individual_timing_{pipeline}.csv System time of start: {run_start} """ finally: if workflow: resource_report(cb_log_filename, num_cores_per_sub, logger) logger.info(execution_info.format( workflow=workflow.name, pipeline=c.pipeline_setup['pipeline_name'], log_dir=c.pipeline_setup['log_directory']['path'], elapsed=(time.time() - pipeline_start_time) / 60, run_start=pipeline_start_datetime, run_finish=strftime("%Y-%m-%d %H:%M:%S") )) # Remove working directory when done if c.pipeline_setup['working_directory'][ 
'remove_working_dir']: try: if os.path.exists(working_dir): logger.info("Removing working dir: %s", working_dir) shutil.rmtree(working_dir) except (FileNotFoundError, PermissionError): logger.warn('Could not remove working directory %s', working_dir)
def test_bucket_access(creds_path, output_directory):
    """
    Function to test write-access to an S3 bucket.

    Parameters
    ----------
    :param creds_path : string
        path to the csv file downloaded from AWS; can either be root
        or user credentials
    :param output_directory : string
        directory to path on S3 where write-access should be tested;
        e.g. 's3://bucket_name/path/to/outputdir'

    Returns
    -------
    :return: s3_write_access : boolean
        flag indicating whether user credentials grant write-access to
        specified output directory in S3 bucket
    """

    # Import packages
    import os
    import tempfile
    import botocore.exceptions as bexc
    from indi_aws import fetch_creds

    # Init variables
    s3_str = 's3://'

    # Explicitly lower-case the "s3" scheme prefix
    if output_directory.lower().startswith(s3_str):
        out_dir_sp = output_directory.split('/')
        out_dir_sp[0] = out_dir_sp[0].lower()
        output_directory = '/'.join(out_dir_sp)

    # Get bucket name
    bucket_name = output_directory.replace(s3_str, '').split('/')[0]

    # Get bucket
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)

    # Create the local probe file with mkstemp (atomic create) instead of
    # the deprecated, race-prone mktemp; ensure it is removed afterwards
    fd, test_file = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as f:
            f.write('test123')

        # Formulate test output key in bucket path output directory
        rel_key_path = output_directory.replace(
            os.path.join(s3_str, bucket_name), '').lstrip('/')
        write_test_key = os.path.join(rel_key_path,
                                      os.path.basename(test_file))

        # Attempt a write to bucket; a successful upload is deleted again
        # so no residue is left in the bucket
        try:
            bucket.upload_file(test_file, write_test_key)
            print('S3 write access confirmed!')
            test_key = bucket.Object(key=write_test_key)
            test_key.delete()
            s3_write_access = True
        # Otherwise we set the access flag to false
        except bexc.ClientError:
            print('S3 write access is not available!')
            s3_write_access = False
    finally:
        # Clean up the local probe file in all cases
        os.remove(test_file)

    # Return the access flag
    return s3_write_access
def testConfig(self, event):
    '''
    This function runs when the user clicks the "Test Configuration"
    button in the pipeline configuration window. It prompts the user
    for a sample subject list (i.e. one that they will be using with
    the config they are building). Then it builds the pipeline but does
    not run it. It then reports whether or not the config will run or
    not depending on if the pipeline gets built successfully.
    '''

    # Import packages
    import os
    import yaml
    from CPAC.utils import Configuration
    from CPAC.pipeline.cpac_pipeline import prep_workflow
    from CPAC.pipeline.cpac_runner import build_strategies

    # Helper: show an error dialog and (optionally) highlight the
    # offending control in pink so the user can find it
    def display(win, msg, changeBg=True):
        wx.MessageBox(msg, "Error")
        if changeBg:
            win.SetBackgroundColour("pink")
        win.SetFocus()
        win.Refresh()

    # Collect a sample subject list and parse it in
    testDlg0 = wx.MessageDialog(
        self, 'This tool will run a quick check on the current pipeline '
        'configuration. Click OK to provide a subject list you '
        'will be using with this setup.',
        'Subject List',
        wx.OK | wx.ICON_INFORMATION)
    testDlg0.ShowModal()
    testDlg0.Destroy()

    dlg = wx.FileDialog(self, message="Choose the CPAC Subject list file",
                        defaultDir=os.getcwd(),
                        defaultFile="CPAC_subject_list.yml",
                        wildcard="YAML files(*.yaml, *.yml)|*.yaml;*.yml",
                        style=wx.OPEN | wx.CHANGE_DIR)

    if dlg.ShowModal() == wx.ID_OK:
        subListPath = dlg.GetPath()

    # Load and test the subject list
    # NOTE(review): if the file dialog is cancelled, subListPath is
    # unbound here — presumably the dialog is always confirmed; verify
    print 'Checking subject list: %s...' % subListPath

    sublist = yaml.load(open(os.path.realpath(subListPath), 'r'))

    sub_flg = self.test_sublist(sublist)

    if not sub_flg:
        raise Exception

    print 'Subject list looks good!'

    # Following code reads in the parameters and selections from the
    # pipeline configuration window and populate the config_list
    config_list = []
    wf_counter = []

    for page in self.nb.get_page_list():
        switch = page.page.get_switch()
        ctrl_list = page.page.get_ctrl_list()
        validate = False

        # A page only needs validating if its on/off switch is enabled
        if switch:
            switch_val = str(switch.get_selection()).lower()
            if switch_val == 'on' or switch_val == 'true' or \
                    switch_val == '1':
                validate = True
                wf_counter.append(page.get_counter())

        for ctrl in ctrl_list:
            # option_name will be the selection name as it is written
            # as the dictionary key of the config.yml dictionary
            option_name = ctrl.get_name()

            # validating
            if (switch == None or validate) and ctrl.get_validation() \
                    and (option_name != 'derivativeList') and \
                    (option_name != 'modelConfigs'):

                win = ctrl.get_ctrl()

                if isinstance(ctrl.get_selection(), list):
                    value = ctrl.get_selection()
                    if not value:
                        display(
                            win, "%s field is empty or the items are "
                            "not checked!" % ctrl.get_name(), False)
                        return
                elif (option_name == "tsa_roi_paths") or \
                        (option_name == "sca_roi_paths"):
                    # fires if the control is the checkbox grid for
                    # multiple paths assigned to multiple options
                    # (i.e. timeseries analysis)
                    config_list.append(ctrl)
                    continue
                else:
                    value = str(ctrl.get_selection())

                if len(value) == 0:
                    display(win, "%s field is empty!" % ctrl.get_name())
                    return

                # Values containing '/' (and no '$' template variable)
                # are treated as paths and must exist on disk
                if '/' in value and '$' not in value and not \
                        isinstance(value, list):
                    if not os.path.exists(ctrl.get_selection()) and \
                            value != 'On/Off':
                        display(
                            win, "%s field contains incorrect path. "
                            "Please update the path!" % ctrl.get_name())
                        return

            config_list.append(ctrl)

    # Write out a pipeline_config file, read it in and then delete it
    # (Will revise the data structure of the config files later so this
    # can just pass the data structure instead of doing it this way)
    try:
        test_cfg_yml = '/tmp/test_config.yml'
        self.write(test_cfg_yml, config_list)
        c = Configuration(
            yaml.load(open(os.path.realpath(test_cfg_yml), 'r')))
        os.remove(test_cfg_yml)
    except:
        errDlg2 = wx.MessageDialog(
            self,
            'A problem occurred with preparing the pipeline test run. \n\n'
            'Please ensure you have rights access to the directories you'
            ' have chosen for the CPAC working, crash, and output folders.',
            'Test Configuration Error',
            wx.OK | wx.ICON_ERROR)
        errDlg2.ShowModal()
        errDlg2.Destroy()

    # Nuisance regression requires the strategies to be pre-built
    if (1 in c.runNuisance) or (c.Regressors != None):
        strategies = sorted(build_strategies(c))
    else:
        strategies = None

    # Run the actual pipeline building prep and see if it works or not
    testDlg1 = wx.MessageDialog(
        self,
        'Click OK to run the test. This should take only a few seconds.',
        'Running Test',
        wx.OK | wx.ICON_INFORMATION)
    testDlg1.ShowModal()

    # Check file paths first

    # Just getting proper names of config file parameters
    try:
        params_file = open(
            p.resource_filename('CPAC',
                                'GUI/resources/config_parameters.txt'),
            "r")
    except:
        print "Error: Could not open configuration parameter file.", "\n"
        raise Exception

    paramInfo = params_file.read().split('\n')
    paramList = []

    # Each non-empty row of config_parameters.txt is a comma-separated
    # (name, title, tab) record
    for param in paramInfo:
        if param != '':
            paramList.append(param.split(','))

    # function for file path checking
    def testFile(filepath, paramName, switch):
        try:
            # Only probe the path if the owning feature is switched on
            if (1 in switch) and (filepath != None):
                fileTest = open(filepath)
                fileTest.close()
        except:
            testDlg1.Destroy()
            # Look up the human-readable title and tab for the parameter
            for param in paramList:
                if param[0] == paramName:
                    paramTitle = param[1]
                    paramGroup = param[2]
                    break
            errDlgFileTest = wx.MessageDialog(
                self,
                'Error reading file - either it does not exist or '
                'you do not have read access. \n\n'
                'Parameter: %s \n'
                'In tab: %s \n\n'
                'Path: %s' % (paramTitle, paramGroup, filepath),
                'Pipeline Not Ready',
                wx.OK | wx.ICON_ERROR)
            errDlgFileTest.ShowModal()
            errDlgFileTest.Destroy()

    # Check S3 output bucket access if writing to S3
    output_dir = c.outputDirectory
    s3_str = 's3://'
    # Explicitly lower-case the "s3" scheme prefix
    if output_dir.lower().startswith(s3_str):
        output_dir_sp = output_dir.split('/')
        output_dir_sp[0] = output_dir_sp[0].lower()
        output_dir = '/'.join(output_dir_sp)

    if type(output_dir) is str and output_dir.lower().startswith(s3_str):
        from indi_aws import fetch_creds
        creds_path = c.awsOutputBucketCredentials
        bucket_name = output_dir.split(s3_str)[1].split('/')[0]
        try:
            bucket = fetch_creds.return_bucket(creds_path, bucket_name)
            print 'Connection with output bucket "%s" successful!' % bucket_name
        except Exception as exc:
            err_msg = 'Unable to access output S3 bucket: "%s" with '\
                      'credentials in: "%s". Check bucket name '\
                      'and credentials file and try again'\
                      % (bucket_name, creds_path)
            testDlg1.Destroy()
            errDlg1 = wx.MessageDialog(self, err_msg, 'Pipeline Not Ready',
                                       wx.OK | wx.ICON_ERROR)
            errDlg1.ShowModal()
            errDlg1.Destroy()
            return

    # Probe each template/prior path that the enabled features require
    testFile(c.template_brain_only_for_anat,
             'template_brain_only_for_anat', [1])
    testFile(c.template_skull_for_anat, 'template_skull_for_anat', [1])
    testFile(c.PRIORS_WHITE, 'PRIORS_WHITE', c.runSegmentationPreprocessing)
    testFile(c.PRIORS_GRAY, 'PRIORS_GRAY', c.runSegmentationPreprocessing)
    testFile(c.PRIORS_CSF, 'PRIORS_CSF', c.runSegmentationPreprocessing)
    testFile(c.template_brain_only_for_func,
             'template_brain_only_for_func', c.runRegisterFuncToMNI)
    testFile(c.template_skull_for_func, 'template_skull_for_func',
             c.runRegisterFuncToMNI)
    testFile(c.identityMatrix, 'identityMatrix', c.runRegisterFuncToMNI)
    testFile(c.boundaryBasedRegistrationSchedule,
             'boundaryBasedRegistrationSchedule',
             c.runRegisterFuncToAnat)
    testFile(c.lateral_ventricles_mask, 'lateral_ventricles_mask',
             c.runNuisance)
    testFile(c.template_symmetric_brain_only,
             'template_symmetric_brain_only', c.runVMHC)
    testFile(c.template_symmetric_skull, 'template_symmetric_skull',
             c.runVMHC)
    testFile(c.dilated_symmetric_brain_mask,
             'dilated_symmetric_brain_mask', c.runVMHC)
    testFile(c.configFileTwomm, 'configFileTwomm', c.runVMHC)
    testFile(c.templateSpecificationFile, 'templateSpecificationFile',
             c.runNetworkCentrality)

    # ROI path dictionaries map individual files to analysis options
    if c.tsa_roi_paths and type(c.tsa_roi_paths[0]) == dict:
        for roi_path in c.tsa_roi_paths[0].keys():
            testFile(roi_path, "tsa_roi_paths", c.runROITimeseries)

    if c.sca_roi_paths and type(c.sca_roi_paths[0]) == dict:
        for roi_path in c.sca_roi_paths[0].keys():
            testFile(roi_path, "sca_roi_paths", c.runSCA)

    try:
        # Run the pipeline building
        prep_workflow(sublist[0], c, strategies, 0)
    except Exception as xxx:
        print xxx
        print "an exception occurred"

        testDlg1.Destroy()

        errDlg1 = wx.MessageDialog(
            self, 'There are issues with the current configuration '
            'which need to be resolved - please check to make '
            'sure the options you are running have the proper '
            'pre-requisites selected.\n\nIssue Info:\n%s'
            % str(xxx),
            'Pipeline Not Ready',
            wx.OK | wx.ICON_ERROR)
        errDlg1.ShowModal()
        errDlg1.Destroy()
    else:
        testDlg1.Destroy()

        okDlg1 = wx.MessageDialog(
            self, 'The current configuration will run successfully. You '
            'can safely save and run this setup!',
            'Pipeline Ready',
            wx.OK | wx.ICON_INFORMATION)
        okDlg1.ShowModal()
        okDlg1.Destroy()
def check_for_s3(file_path, creds_path=None, dl_dir=None, img_type='other'):
    """Resolve a possibly-S3 file path to a local path, downloading if needed.

    Parameters
    ----------
    file_path : str, dict or None
        local path, 's3://bucket/key' URI, a dict (scan-parameter
        dictionaries are passed through unchanged), or None
    creds_path : str, optional
        path to AWS credentials csv; the string placeholders "None",
        "none" and "null" are treated as no credentials
    dl_dir : str, optional
        directory to download S3 files into (default: current directory)
    img_type : str, optional
        'anat' or 'func' to additionally validate NIfTI dimensionality

    Returns
    -------
    local_path : str, dict or None
        local path of the (possibly downloaded) file, or the input
        dict/None passed through

    Raises
    ------
    IOError
        if the resolved file does not exist, or a NIfTI file has the
        wrong number of dimensions for img_type
    Exception
        on S3 access/download failures
    """

    # Import packages
    import os

    # Init variables
    s3_str = 's3://'

    # Data configs may carry the literal strings "None"/"none"/"null"
    # instead of a real credentials path; treat those as no credentials
    if creds_path:
        if "None" in creds_path or "none" in creds_path or \
                "null" in creds_path:
            creds_path = None

    if dl_dir is None:
        dl_dir = os.getcwd()

    if file_path is None:
        # in case it's something like scan parameters or field map files,
        # but we don't have any
        return None

    # TODO: remove this once scan parameter input as dictionary is phased out
    if isinstance(file_path, dict):
        # if this is a dictionary, just skip altogether
        return file_path

    # Explicitly lower-case the "s3" scheme
    if file_path.lower().startswith(s3_str):
        # Lazy imports: AWS dependencies are only required for S3 paths,
        # so purely local usage works without botocore/indi_aws installed
        import botocore.exceptions
        from indi_aws import fetch_creds

        file_path = s3_str + file_path[len(s3_str):]

        # Get bucket name and bucket object
        bucket_name = file_path[len(s3_str):].split('/')[0]
        bucket = fetch_creds.return_bucket(creds_path, bucket_name)

        # Extract relative key path from bucket and local path
        s3_prefix = s3_str + bucket_name
        s3_key = file_path[len(s3_prefix) + 1:]
        local_path = os.path.join(dl_dir, bucket_name, s3_key)

        # Get local directory and create folders if they don't exist
        local_dir = os.path.dirname(local_path)
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)

        # Download file
        try:
            print("Attempting to download from AWS S3: {0}".format(file_path))
            bucket.download_file(Key=s3_key, Filename=local_path)
        except botocore.exceptions.ClientError as exc:
            # Map the common S3 error codes to actionable messages
            error_code = int(exc.response['Error']['Code'])
            if error_code == 403:
                err_msg = 'Access to bucket: "%s" is denied; using credentials '\
                          'in subject list: "%s"; cannot access the file "%s"'\
                          % (bucket_name, creds_path, file_path)
            elif error_code == 404:
                err_msg = 'File: {0} does not exist; check spelling and try '\
                          'again'.format(os.path.join(bucket_name, s3_key))
            else:
                err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                          % (bucket_name, exc)
            raise Exception(err_msg)
        except Exception as exc:
            err_msg = 'Unable to connect to bucket: "%s". Error message:\n%s'\
                      % (bucket_name, exc)
            raise Exception(err_msg)

    # Otherwise just return what was passed in
    else:
        local_path = file_path

    # Check if it exists or was successfully downloaded
    if not os.path.exists(local_path):
        raise IOError('File %s does not exist!' % (local_path))

    # Check image dimensionality
    if local_path.endswith('.nii') or local_path.endswith('.nii.gz'):
        # Lazy import: nibabel is only needed when validating NIfTI files
        import nibabel as nib
        img_nii = nib.load(local_path)
        if img_type == 'anat':
            if len(img_nii.shape) != 3:
                raise IOError('File: %s must be an anatomical image with 3 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))
        elif img_type == 'func':
            if len(img_nii.shape) != 4:
                raise IOError('File: %s must be a functional image with 4 '
                              'dimensions but %d dimensions found!'
                              % (local_path, len(img_nii.shape)))

    return local_path
# --- Script configuration ----------------------------------------------
# S3 bucket and credentials used to pull the ADHD200 BIDS data
s3_bucket = "fcp-indi"
s3_creds = "/Users/cameron.craddock/AWS/ccraddock-fcp-indi-keys2.csv"
# prefix within the bucket where the per-site directories live
s3_prefix = "data/Projects/ADHD200/RawDataBIDS"
s3_sitedirs = ["Brown", "KKI", "NeuroIMAGE", "NYU", "OHSU", "Peking_1",
               "Peking_2", "Peking_3", "Pittsburgh", "WashU"]
# local prefix that mirrors the S3 layout
out_prefix = "data/ADHD200/RawDataBIDS"
# NOTE(review): max_subjs appears unused in this chunk — verify downstream
max_subjs = 4

# Validate that the credentials file exists before trying to connect
if s3_creds:
    if not os.path.isfile(s3_creds):
        # NOTE(review): "filed" looks like a typo for "find" in this
        # message — left unchanged here (runtime string)
        raise IOError("Could not filed aws_input_creds (%s)" % (s3_creds))

from indi_aws import fetch_creds
bucket = fetch_creds.return_bucket(s3_creds, s3_bucket)

# Walk each site's prefix and inspect T1w / BOLD keys
for site in s3_sitedirs:
    subjects = []
    prefix = os.path.join(s3_prefix, site)
    print "gathering files from S3 bucket (%s) for %s" % (bucket, prefix)
    for s3_obj in bucket.objects.filter(Prefix=prefix):
        if 'T1w' in str(s3_obj.key) or 'bold' in str(s3_obj.key):
            fname = os.path.basename(str(s3_obj.key))
            # keys whose basename lacks "sub-" are site-level sidecar files
            if "sub-" not in fname:
                # mirror the S3 directory structure locally if missing
                if not os.path.exists(os.path.dirname(s3_obj.key).replace(
                        s3_prefix, out_prefix)):
                    print "making the directory"
def collect_bids_files_configs(bids_dir, aws_input_creds=''):
    """Collect T1w/BOLD NIfTI paths and sidecar JSON configs from a BIDS tree.

    :param bids_dir: path to the BIDS directory, either a local path or an
        's3://bucket/prefix' URI
    :param aws_input_creds: path to a csv file with AWS credentials, or ''
        for anonymous access
    :return: tuple (file_paths, config_dict); file_paths are T1w/bold NIfTI
        paths relative to bids_dir, config_dict maps relative JSON paths to
        their parsed contents
    :raises IOError: if no matching files are found, or the credentials
        file does not exist
    """
    # Local imports keep the function usable in isolation
    import json
    import os

    file_paths = []
    config_dict = {}

    if bids_dir.lower().startswith("s3://"):
        # s3 paths begin with s3://bucket/
        bucket_name = bids_dir.split('/')[2]
        s3_prefix = '/'.join(bids_dir.split('/')[:3])
        prefix = bids_dir.replace(s3_prefix, '').lstrip('/')

        if aws_input_creds:
            if not os.path.isfile(aws_input_creds):
                raise IOError("Could not find aws_input_creds (%s)"
                              % (aws_input_creds))

        from indi_aws import fetch_creds
        bucket = fetch_creds.return_bucket(aws_input_creds, bucket_name)

        print("gathering files from S3 bucket (%s) for %s" % (bucket, prefix))

        for s3_obj in bucket.objects.filter(Prefix=prefix):
            # we only know how to handle T1w and BOLD files, for now
            if 'T1w' in str(s3_obj.key) or 'bold' in str(s3_obj.key):
                if str(s3_obj.key).endswith("json"):
                    try:
                        config_dict[s3_obj.key.replace(prefix, "").lstrip('/')] \
                            = json.loads(s3_obj.get()["Body"].read())
                    except Exception as e:
                        # str(e) instead of the Python-2-only e.message
                        print("Error retrieving %s (%s)"
                              % (s3_obj.key.replace(prefix, ""), e))
                        raise
                elif 'nii' in str(s3_obj.key):
                    file_paths.append(str(s3_obj.key)
                                      .replace(prefix, '').lstrip('/'))
    else:
        # Local directory: collect matching NIfTIs and parse sidecar JSONs,
        # keys/paths are stored relative to bids_dir
        for root, dirs, files in os.walk(bids_dir, topdown=False):
            if files:
                file_paths += [os.path.join(root, f).replace(bids_dir, '')
                               .lstrip('/') for f in files
                               if 'nii' in f and ('T1w' in f or 'bold' in f)]
                config_dict.update(
                    {os.path.join(root.replace(bids_dir, '').lstrip('/'), f):
                         json.load(open(os.path.join(root, f), 'r'))
                     for f in files
                     if f.endswith('json') and ('T1w' in f or 'bold' in f)})

    if not file_paths and not config_dict:
        # Fixed: the original used a %s placeholder with .format(), so the
        # directory name was never substituted into the message
        raise IOError("Didn't find any files in {0}. Please verify that the"
                      " path is typed correctly, that you have read access"
                      " to the directory, and that it is not"
                      " empty.".format(bids_dir))

    return file_paths, config_dict
dryrun = True # Prefixes for reference files to copy from. peerone = 'data/Projects/RocklandSample/RawDataBIDS/sub-A00064081/ses-NFB3/func/sub-A00064081_ses-NFB3_task-PEER1_events.tsv' peertwo = 'data/Projects/RocklandSample/RawDataBIDS/sub-A00064081/ses-NFB3/func/sub-A00064081_ses-NFB3_task-PEER2_events.tsv' checkerboardone = 'data/Projects/RocklandSample/RawDataBIDS/sub-A00064416/ses-DSA/func/sub-A00064416_ses-DSA_task-CHECKERBOARD_acq-1400_events.tsv' checkerboardtwo = 'data/Projects/RocklandSample/RawDataBIDS/sub-A00064416/ses-DSA/func/sub-A00064416_ses-DSA_task-CHECKERBOARD_acq-645_events.tsv' breathhold = 'data/Projects/RocklandSample/RawDataBIDS/sub-A00064416/ses-DSA/func/sub-A00064416_ses-DSA_task-BREATHHOLD_acq-1400_events.tsv' # Create bucket object s3_bucket_name = 'fcp-indi' s3_prefix = 'data/Projects/RocklandSample/RawDataBIDS' s3 = boto3.resource('s3') s3_creds_path = '/path/to/jpellman-fcp-indi-keys.csv' bucket = fetch_creds.return_bucket(s3_creds_path, s3_bucket_name) s3_keys = bucket.objects.filter(Prefix=s3_prefix) # Get the keys for NifTIs without events TSVs. keylist = [key.key for key in s3_keys] peerone_keylist = [ key for key in keylist if 'PEER1' in key and '.nii.gz' in key and key.replace('_bold.nii.gz', '_events.tsv') not in keylist ] peertwo_keylist = [ key for key in keylist if 'PEER2' in key and '.nii.gz' in key and key.replace('_bold.nii.gz', '_events.tsv') not in keylist ] checkerboardone_keylist = [ key for key in keylist if 'CHECKERBOARD_acq-1400' in key and '.nii.gz' in key