def setUp(self):
    '''
    Method to instantiate input arguments for the
    cpac_pipeline.run() method via instance attributes

    Parameters
    ----------
    self : CPACPipelineRunTestCase
        a unittest.TestCase-inherited class

    Returns
    -------
    None
        this function does not return any values, but populates the
        instance attributes for:

        self.config_file : string
        self.sublist_file : string
        self.idx : integer
        self.config : CPAC.utils.configuration.Configuration object
        self.strategies : list [dict]
    '''

    # Import packages
    import os
    import yaml
    from CPAC.utils.configuration import Configuration

    # Init variables
    self.config_file = PIPELINE_CONFIG
    self.sublist_file = SUBJECT_LIST
    self.idx = 1

    # Init Configuration class from config_file
    self.config = Configuration(yaml.safe_load(open(self.config_file, 'r')))
    self.strategies = STRAT_FILE
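# A minimal sketch (not from the source) of a test method that could consume
# the attributes populated by setUp() above; it assumes the surrounding
# unittest.TestCase class and the module-level constants used in setUp().
def test_config_is_loaded(self):
    from CPAC.utils.configuration import Configuration

    # setUp() should have produced a Configuration instance and an index
    self.assertIsInstance(self.config, Configuration)
    self.assertEqual(self.idx, 1)
    self.assertIsInstance(self.config_file, str)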
def run(config, subject_infos):
    import subprocess
    subprocess.getoutput('source ~/.bashrc')

    import os
    import pickle
    import yaml
    import yamlordereddictloader

    c = Configuration(yaml.safe_load(open(os.path.realpath(config), 'r')))

    # pickled subject_infos must be opened in binary mode
    prep_cwas_workflow(c, pickle.load(open(subject_infos, 'rb')))
def test_motion_estimates_and_correction(run_value):
    '''Test that any truthy forkable option for 'run' raises the custom
    human-readable exception for an invalid motion_estimate_filter.
    '''
    d = {
        'FROM': 'default',
        'functional_preproc': {
            'motion_estimates_and_correction': {
                'motion_estimate_filter': {
                    'run': run_value,
                    'filter_type': 'notch',
                    'filter_order': 0,
                    'breathing_rate_min': None,
                    'breathing_rate_max': 101.5
                }
            }
        }
    }
    if bool(run_value) and run_value not in [[False], []]:
        with pytest.raises(Invalid) as e:
            Configuration(d)
        assert "func#motion_estimate_filter_valid_options" in str(e.value)
    else:
        Configuration(d)
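# A hedged sketch: the test above expects pytest to supply `run_value` via a
# parametrize decorator. The value list below is an illustrative assumption,
# not taken from the source, covering both truthy and falsy forkable options.
import pytest

forkable_run_values = [True, False, [True], [False], [True, False], []]


@pytest.mark.parametrize('run_value', forkable_run_values)
def test_forkable_values_have_expected_types(run_value):
    # each forkable value is either a bool or a (possibly empty) list of bools
    assert isinstance(run_value, (bool, list))
    if isinstance(run_value, list):
        assert all(isinstance(v, bool) for v in run_value)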
def runAnalysis1(self, pipeline, sublist, p):
    import os

    import yaml
    import yamlordereddictloader

    import CPAC.pipeline.cpac_runner
    from CPAC.utils import Configuration

    c = Configuration(yaml.load(open(os.path.realpath(pipeline), 'r'),
                                Loader=yamlordereddictloader.Loader))

    plugin_args = {
        'n_procs': c.maxCoresPerParticipant,
        'memory_gb': c.maximumMemoryPerParticipant
    }

    # TODO: make this work
    if self.pids:
        # print("THERE'S SOMETHING RUNNING!")
        pass

    CPAC.pipeline.cpac_runner.run(sublist, pipeline, p, plugin='MultiProc',
                                  plugin_args=plugin_args)
def test_trimmer():
    import os
    import tempfile
    from copy import copy

    import pkg_resources as p
    import yaml

    from CPAC.pipeline.cpac_pipeline import build_workflow
    from CPAC.utils.configuration import Configuration
    from CPAC.utils.trimmer import (the_trimmer, is_datasink,
                                    expand_workflow, compute_datasink_dirs)

    pipe_config = \
        p.resource_filename(
            "CPAC",
            os.path.join("resources", "configs",
                         "pipeline_config_template.yml")
        )

    data_config = \
        p.resource_filename(
            "CPAC",
            os.path.join("resources", "configs",
                         "data_config_S3-BIDS-ABIDE.yml")
        )

    data_config = yaml.safe_load(open(data_config, 'r'))
    sub_dict = data_config[0]

    c = Configuration(yaml.safe_load(open(pipe_config, 'r')))

    temp_dir = tempfile.mkdtemp()
    c.logDirectory = temp_dir
    c.workingDirectory = temp_dir
    c.outputDirectory = temp_dir
    c.crashLogDirectory = temp_dir

    # Disable functional, let only the anatomical workflow run
    c_anatomical = copy(c)
    c_anatomical.runFunctional = [0]

    wf, _, _ = build_workflow(sub_dict['subject_id'], sub_dict, c_anatomical)

    # Create fake files to trick THE TRIMMER
    exec_graph = expand_workflow(wf)
    datasinks = [n for n in exec_graph.nodes() if is_datasink(n)]
    anat_derivatives = {}
    for datasink in datasinks:
        paths = compute_datasink_dirs(exec_graph, datasink)
        anat_derivatives.update(paths)
        for (node, derivative), path in paths.items():
            os.makedirs(path)
            open(os.path.join(path, '%s.txt' % derivative), 'a').close()

    # Enable functional, so the workflow should only run this,
    # and enable trimming
    c_functional = copy(c)
    c_functional.runFunctional = [1]

    wf, _, _ = build_workflow(sub_dict['subject_id'], sub_dict, c_functional)
    exec_wf, _ = the_trimmer(wf)
    exec_graph = exec_wf._graph

    datasinks = [n for n in exec_graph.nodes() if is_datasink(n)]
    func_derivatives = {}
    for datasink in datasinks:
        paths = compute_datasink_dirs(exec_graph, datasink)
        func_derivatives.update(paths)

    # Assert that the functional pipeline removed all the anatomical nodes,
    # as they were already computed
    assert set(func_derivatives.keys()).intersection(
        set(anat_derivatives.keys())) == set()
def run_cpac_on_cluster(config_file, subject_list_file, cluster_files_dir):
    '''
    Function to build a batch job submission script for PBS, SGE, or SLURM
    and submit it to the job scheduler (via 'qsub' or 'sbatch')
    '''

    # Import packages
    import getpass
    import os
    import re
    import subprocess
    from time import strftime

    import yaml

    from CPAC.utils import Configuration
    from indi_schedulers import cluster_templates

    # Load in pipeline config
    try:
        pipeline_dict = yaml.safe_load(open(os.path.realpath(config_file),
                                            'r'))
        pipeline_config = Configuration(pipeline_dict)
    except:
        raise Exception('Pipeline config is not in proper YAML format. '
                        'Please check your file')

    # Load in the subject list
    try:
        sublist = yaml.safe_load(open(os.path.realpath(subject_list_file),
                                      'r'))
    except:
        raise Exception('Subject list is not in proper YAML format. '
                        'Please check your file')

    # Init variables
    timestamp = str(strftime("%Y_%m_%d_%H_%M_%S"))
    job_scheduler = pipeline_config.resourceManager.lower()

    # For SLURM time limit constraints only, hh:mm:ss
    hrs_limit = 8 * len(sublist)
    time_limit = '%d:00:00' % hrs_limit

    # Batch file variables
    shell = subprocess.getoutput('echo $SHELL')
    user_account = getpass.getuser()
    num_subs = len(sublist)

    # Run CPAC via python -c command
    python_cpac_str = ('python -c "from CPAC.pipeline.cpac_pipeline import run; '
                       'run(\'%(config_file)s\', \'%(subject_list_file)s\', '
                       '%(env_arr_idx)s, \'%(pipeline_name)s\', '
                       'plugin=\'MultiProc\', plugin_args=%(plugin_args)s)"')

    # Init plugin arguments
    plugin_args = {'n_procs': pipeline_config.maxCoresPerParticipant,
                   'memory_gb': pipeline_config.maximumMemoryPerParticipant}

    # Set up run command dictionary
    run_cmd_dict = {'config_file': config_file,
                    'subject_list_file': subject_list_file,
                    'pipeline_name': pipeline_config.pipelineName,
                    'plugin_args': plugin_args}

    # Set up config dictionary
    config_dict = {'timestamp': timestamp,
                   'shell': shell,
                   'job_name': 'CPAC_' + pipeline_config.pipelineName,
                   'num_tasks': num_subs,
                   'queue': pipeline_config.queue,
                   'par_env': pipeline_config.parallelEnvironment,
                   'cores_per_task': pipeline_config.maxCoresPerParticipant,
                   'user': user_account,
                   'work_dir': cluster_files_dir,
                   'time_limit': time_limit}

    # Get string template for job scheduler
    if job_scheduler == 'pbs':
        env_arr_idx = '$PBS_ARRAYID'
        batch_file_contents = cluster_templates.pbs_template
        confirm_str = r'(?<=Your job-array )\d+'
        exec_cmd = 'qsub'
    elif job_scheduler == 'sge':
        env_arr_idx = '$SGE_TASK_ID'
        batch_file_contents = cluster_templates.sge_template
        confirm_str = r'(?<=Your job-array )\d+'
        exec_cmd = 'qsub'
    elif job_scheduler == 'slurm':
        env_arr_idx = '$SLURM_ARRAY_TASK_ID'
        batch_file_contents = cluster_templates.slurm_template
        confirm_str = r'(?<=Submitted batch job )\d+'
        exec_cmd = 'sbatch'

    # Populate rest of dictionary
    config_dict['env_arr_idx'] = env_arr_idx
    run_cmd_dict['env_arr_idx'] = env_arr_idx
    config_dict['run_cmd'] = python_cpac_str % run_cmd_dict

    # Populate string from config dict values
    batch_file_contents = batch_file_contents % config_dict

    # Write file
    batch_filepath = os.path.join(cluster_files_dir,
                                  'cpac_submit_%s.%s' % (timestamp,
                                                         job_scheduler))
    with open(batch_filepath, 'w') as f:
        f.write(batch_file_contents)

    # Get output response from job submission
    out = subprocess.getoutput('%s %s' % (exec_cmd, batch_filepath))

    # Check for successful qsub submission
    if re.search(confirm_str, out) is None:
        err_msg = 'Error submitting C-PAC pipeline run to %s queue' \
                  % job_scheduler
        raise Exception(err_msg)

    # Get pid and send to pid file
    pid = re.search(confirm_str, out).group(0)
    pid_file = os.path.join(cluster_files_dir, 'pid.txt')
    with open(pid_file, 'w') as f:
        f.write(pid)
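# A minimal, self-contained sketch of the %-style template substitution that
# run_cpac_on_cluster() uses to render the batch script. The template text and
# values below are illustrative placeholders, not the real indi_schedulers
# templates.
_example_template = (
    "#!%(shell)s\n"
    "#SBATCH --job-name=%(job_name)s\n"
    "#SBATCH --array=1-%(num_tasks)d\n"
    "#SBATCH --time=%(time_limit)s\n"
    "%(run_cmd)s\n"
)
_example_values = {'shell': '/bin/bash',
                   'job_name': 'CPAC_example',
                   'num_tasks': 3,
                   'time_limit': '24:00:00',
                   'run_cmd': 'echo "run C-PAC here"'}
print(_example_template % _example_values)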
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None,
        debug=False, test_config=False):
    # Import packages
    import os
    import pickle
    import subprocess
    import time
    import warnings
    from multiprocessing import Process
    from time import strftime

    import yaml

    from CPAC.pipeline.cpac_pipeline import run_workflow
    from CPAC.utils.configuration import Configuration
    # track_run, run_condor_jobs and run_cpac_on_cluster are expected to be
    # defined or imported at module level

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources", "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    config_file = os.path.realpath(config_file)
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.safe_load(open(config_file, 'r')))
    except IOError:
        print("config file %s doesn't exist" % config_file)
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (e.problem, e.problem_mark.line)
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            " {1}"
            "\n\n".format(config_file, error_detail)
        )
    except Exception as e:
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            " {1}"
            "\n\n".format(config_file, e)
        )

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less than 70 characters. "
                      "Long paths might not work in your operating system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            sublist = yaml.safe_load(open(subject_list_file, 'r'))
    except:
        print("Subject list is not in proper YAML format. Please check "
              "your file")
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']
            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))
            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))
            sub_scan_map[s] = scan_ids
    except:
        print("\n\n" + "ERROR: Subject list file not in proper format - "
              "check if you loaded the correct file?" + "\n" +
              "Error name: cpac_runner_0001" + "\n\n")
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant', participants=len(sublist))
        except:
            pass

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:
        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)
    # Run on one computer
    else:
        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                run_workflow(sub, c, True, pipeline_timing_info,
                             p_name, plugin, plugin_args, test_config)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=run_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we're allocating more processes than there are subjects,
        # run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print(p.pid, file=pid)
        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print(p.pid, file=pid)
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print('found dead job ', job)
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()
upgrade_pipeline_to_1_8(updated_config)
c = load_yaml_config(updated_config, args.aws_input_creds)

overrides = {}
if hasattr(args, 'pipeline_override') and args.pipeline_override:
    overrides = {
        k: v for d in args.pipeline_override for k, v in d.items()
    }

c = update_nested_dict(c, overrides)

if args.anat_only:
    c = update_nested_dict(c, {'FROM': 'anat-only'})

c = Configuration(c)

# get the aws_input_credentials, if any are specified
if args.aws_input_creds:
    c['awsCredentialsFile'] = resolve_aws_credential(args.aws_input_creds)

if args.aws_output_creds:
    c['pipeline_setup']['Amazon-AWS'][
        'aws_output_bucket_credentials'] = resolve_aws_credential(
            args.aws_output_creds)

c['pipeline_setup']['output_directory']['path'] = os.path.join(
    args.output_dir, "output")

if "s3://" not in args.output_dir.lower():
    c['pipeline_setup']['log_directory']['path'] = os.path.join(
        args.output_dir, "log")
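# A hedged, self-contained sketch of the nested-dict override merge that
# update_nested_dict() is used for when applying args.pipeline_override above.
# This is an illustrative reimplementation, not C-PAC's actual function, and
# the config keys in the example are placeholders.
def _merge_nested(base, overrides):
    """Recursively overlay `overrides` onto `base` without mutating either."""
    merged = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = _merge_nested(merged[key], value)
        else:
            merged[key] = value
    return merged


print(_merge_nested(
    {'pipeline_setup': {'system_config': {'example_cores': 2}}},
    {'pipeline_setup': {'system_config': {'example_memory_gb': 8}}}))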
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None,
        debug=False, test_config=False):
    # Import packages
    import glob
    import os
    import pickle
    import subprocess
    import time
    import warnings
    from multiprocessing import Process
    from time import strftime

    import yaml

    from CPAC.pipeline.cpac_pipeline import run_workflow
    from CPAC.utils.configuration import Configuration
    # track_run, run_condor_jobs, run_cpac_on_cluster and
    # anat_longitudinal_wf are expected to be defined or imported at
    # module level

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources", "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    config_file = os.path.realpath(config_file)
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.safe_load(open(config_file, 'r')))
    except IOError:
        print("config file %s doesn't exist" % config_file)
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (e.problem, e.problem_mark.line)
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        " {1}"
                        "\n\n".format(config_file, error_detail))
    except Exception as e:
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        " {1}"
                        "\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less than 70 characters. "
                      "Long paths might not work in your operating system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            sublist = yaml.safe_load(open(subject_list_file, 'r'))
    except:
        print("Subject list is not in proper YAML format. Please check "
              "your file")
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']
            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))
            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))
            sub_scan_map[s] = scan_ids
    except:
        print("\n\n" + "ERROR: Subject list file not in proper format - "
              "check if you loaded the correct file?" + "\n" +
              "Error name: cpac_runner_0001" + "\n\n")
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant' if not test_config else 'test',
                      participants=len(sublist))
        except:
            print("Usage tracking failed for this run.")

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:
        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)
    # Run on one computer
    else:
        # Create working dir
        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)
        '''
        if not os.path.exists(c.logDirectory):
            try:
                os.makedirs(c.logDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the log " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.logDirectory
                raise Exception(err)
        '''

        # BEGIN LONGITUDINAL TEMPLATE PIPELINE
        if hasattr(c, 'run_longitudinal') and \
                ('anat' in c.run_longitudinal or
                 'func' in c.run_longitudinal):
            subject_id_dict = {}
            for sub in sublist:
                if sub['subject_id'] in subject_id_dict:
                    subject_id_dict[sub['subject_id']].append(sub)
                else:
                    subject_id_dict[sub['subject_id']] = [sub]

            # subject_id_dict has the subject_id as keys and a list of
            # sessions for each participant as value
            valid_longitudinal_data = False
            for subject_id, sub_list in subject_id_dict.items():
                if len(sub_list) > 1:
                    valid_longitudinal_data = True
                    if 'func' in c.run_longitudinal:
                        raise Exception(
                            "\n\n[!] Error: Functional longitudinal pipeline "
                            "is still in development and will be available "
                            "in next release. Please only run anatomical "
                            "longitudinal pipeline for now.\n\n")
                    if 'anat' in c.run_longitudinal:
                        strat_list = anat_longitudinal_wf(subject_id,
                                                          sub_list, c)
                elif len(sub_list) == 1:
                    warnings.warn(
                        "\n\nThere is only one anatomical session for "
                        "sub-%s. Longitudinal preprocessing will be skipped "
                        "for this subject.\n\n" % subject_id)

                # TODO
                # if 'func' in c.run_longitudinal:
                #     strat_list = func_preproc_longitudinal_wf(subject_id,
                #                                               sub_list, c)
                #     func_longitudinal_template_wf(subject_id, strat_list, c)

            if valid_longitudinal_data:
                rsc_file_list = []
                for dirpath, dirnames, filenames in os.walk(
                        c.outputDirectory):
                    for f in filenames:
                        # TODO is there a better way to check output folder
                        # name?
                        if f != '.DS_Store' and \
                                'pipeline_analysis_longitudinal' in dirpath:
                            rsc_file_list.append(os.path.join(dirpath, f))

                subject_specific_dict = {
                    subj: [] for subj in subject_id_dict.keys()
                }
                session_specific_dict = {
                    os.path.join(session['subject_id'],
                                 session['unique_id']): []
                    for session in sublist
                }
                for rsc_path in rsc_file_list:
                    key = [
                        s for s in session_specific_dict.keys()
                        if s in rsc_path
                    ]
                    if key:
                        session_specific_dict[key[0]].append(rsc_path)
                    else:
                        subj = [
                            s for s in subject_specific_dict.keys()
                            if s in rsc_path
                        ]
                        if subj:
                            subject_specific_dict[subj[0]].append(rsc_path)

                # update individual-specific outputs:
                # anatomical_brain, anatomical_brain_mask and
                # anatomical_reorient
                for key in session_specific_dict.keys():
                    for f in session_specific_dict[key]:
                        sub, ses = key.split('/')
                        ses_list = [
                            subj for subj in sublist
                            if sub in subj['subject_id'] and
                            ses in subj['unique_id']
                        ]
                        if len(ses_list) > 1:
                            raise Exception(
                                "There are several files containing " + f)
                        if len(ses_list) == 1:
                            ses = ses_list[0]
                            subj_id = ses['subject_id']
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(
                                            c.pipelineName + '_')[-1]: {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(
                                        c.pipelineName + '_')[-1]
                                    if ses['resource_pool'].get(
                                            strat_key) is None:
                                        ses['resource_pool'].update(
                                            {strat_key: {keys[-2]: f}})
                                    else:
                                        ses['resource_pool'][
                                            strat_key].update(
                                            {keys[-2]: f})

                for key in subject_specific_dict:
                    for f in subject_specific_dict[key]:
                        ses_list = [
                            subj for subj in sublist if key in subj['anat']
                        ]
                        for ses in ses_list:
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(
                                            c.pipelineName + '_')[-1]: {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(
                                        c.pipelineName + '_')[-1]
                                    if ses['resource_pool'].get(
                                            strat_key) is None:
                                        ses['resource_pool'].update(
                                            {strat_key: {keys[-2]: f}})
                                    else:
                                        if keys[-2] == 'anatomical_brain' or \
                                                keys[-2] == 'anatomical_brain_mask' or \
                                                keys[-2] == 'anatomical_skull_leaf':
                                            pass
                                        elif 'apply_warp_anat_longitudinal_to_standard' in keys[-2] or \
                                                'fsl_apply_xfm_longitudinal' in keys[-2]:
                                            # TODO update!!!
                                            # it assumes session id == last
                                            # key (ordered by session count
                                            # instead of session id) + 1
                                            # might cause problem if session
                                            # id is not continuous
                                            def replace_index(target1,
                                                              target2,
                                                              file_path):
                                                index1 = file_path.index(
                                                    target1) + len(target1)
                                                index2 = file_path.index(
                                                    target2) + len(target2)
                                                file_str_list = list(
                                                    file_path)
                                                file_str_list[index1] = "*"
                                                file_str_list[index2] = "*"
                                                file_path_updated = "".join(
                                                    file_str_list)
                                                file_list = glob.glob(
                                                    file_path_updated)
                                                file_list.sort()
                                                return file_list

                                            if ses['unique_id'] == str(
                                                    int(keys[-2][-1]) + 1):
                                                if keys[-3] == 'seg_probability_maps':
                                                    f_list = replace_index(
                                                        'seg_probability_maps_',
                                                        'segment_prob_', f)
                                                    ses['resource_pool'][
                                                        strat_key].update(
                                                        {keys[-3]: f_list})
                                                elif keys[-3] == 'seg_partial_volume_files':
                                                    f_list = replace_index(
                                                        'seg_partial_volume_files_',
                                                        'segment_pve_', f)
                                                    ses['resource_pool'][
                                                        strat_key].update(
                                                        {keys[-3]: f_list})
                                                else:
                                                    ses['resource_pool'][
                                                        strat_key].update({
                                                        keys[-3]: f
                                                        # keys[-3]: 'anatomical_to_standard'
                                                    })
                                        elif keys[-2] != 'warp_list':
                                            ses['resource_pool'][
                                                strat_key].update(
                                                {keys[-2]: f})
                                        elif keys[-2] == 'warp_list':
                                            if 'ses-' + ses['unique_id'] in tmp:
                                                ses['resource_pool'][
                                                    strat_key].update(
                                                    {keys[-2]: f})

                for key in subject_specific_dict:
                    ses_list = [
                        subj for subj in sublist if key in subj['anat']
                    ]
                    for ses in ses_list:
                        for reg_strat in strat_list:
                            try:
                                ss_strat_list = list(ses['resource_pool'])
                                for strat_key in ss_strat_list:
                                    try:
                                        ses['resource_pool'][
                                            strat_key].update({
                                            'registration_method':
                                                reg_strat[
                                                    'registration_method']
                                        })
                                    except KeyError:
                                        pass
                            except KeyError:
                                pass

                yaml.dump(sublist,
                          open(os.path.join(c.workingDirectory,
                                            'data_config_longitudinal.yml'),
                               'w'),
                          default_flow_style=False)

                print('\n\n' + 'Longitudinal pipeline completed.' + '\n\n')

            # skip main preprocessing
            if 1 not in c.runAnatomical and 1 not in c.runFunctional:
                import sys
                sys.exit()
        # END LONGITUDINAL TEMPLATE PIPELINE

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                run_workflow(sub, c, True, pipeline_timing_info,
                             p_name, plugin, plugin_args, test_config)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=run_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we're allocating more processes than there are subjects,
        # run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print(p.pid, file=pid)
        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print(p.pid, file=pid)
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print('found dead job ', job)
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()
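# A hedged illustration of the session entries written to
# data_config_longitudinal.yml by the longitudinal branch above: each session
# gains a 'resource_pool' mapping a strategy name to derivative paths, plus
# the registration method copied from the matching strategy. All names and
# paths below are placeholders inferred from the code, not real outputs.
example_session = {
    'subject_id': 'sub-0001',
    'unique_id': 'ses-1',
    'anat': '/data/sub-0001/ses-1/anat/sub-0001_ses-1_T1w.nii.gz',
    'resource_pool': {
        'analysis_longitudinal': {
            'anatomical_brain':
                '/output/example/anatomical_brain/brain.nii.gz',
            'registration_method': 'ANTS'
        }
    }
}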
def main(sub_idx):
    import glob
    import os
    import shutil

    import yaml

    # aws_utils, fetch_creds, Configuration, Process and make_workflow are
    # expected to be imported/defined at module level

    # Init variables
    bucket_name = 'fcp-indi'
    bucket_prefix = 'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_rerun'
    config_file = '/home/ubuntu/abide_run/settings/pipeline_config_abide_rerun.yml'
    creds_path = '/home/ubuntu/secure-creds/aws-keys/fcp-indi-keys2.csv'
    local_prefix = '/mnt/eigen_run'
    sublist_file = '/home/ubuntu/abide_run/eig-subs1.yml'

    # Pull in bucket, config, and subject
    sublist = yaml.safe_load(open(sublist_file, 'r'))
    subject = sublist[sub_idx]
    sub_id = subject.split('_')[-1]
    bucket = fetch_creds.return_bucket(creds_path, bucket_name)
    c = Configuration(yaml.safe_load(open(config_file, 'r')))

    # Test to see if they're already uploaded
    to_do = True
    if to_do:
        # Collect functional_mni list from S3 bucket
        filt_global = 'pipeline_abide_rerun__freq-filter/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/_bandpass_freqs_0.01.0.1/bandpassed_demeaned_filtered_antswarp.nii.gz' % sub_id
        filt_noglobal = filt_global.replace('global1', 'global0')
        nofilt_global = 'pipeline_abide_rerun/%s_session_1/functional_mni/_scan_rest_1_rest/_csf_threshold_0.96/_gm_threshold_0.7/_wm_threshold_0.96/_compcor_ncomponents_5_selector_pc10.linear1.wm0.global1.motion1.quadratic1.gm0.compcor1.csf0/residual_antswarp.nii.gz' % sub_id
        nofilt_noglobal = nofilt_global.replace('global1', 'global0')
        s3_functional_mni_list = [
            filt_global, filt_noglobal, nofilt_global, nofilt_noglobal
        ]
        s3_functional_mni_list = [
            os.path.join(bucket_prefix, s) for s in s3_functional_mni_list
        ]

        # Download contents to local inputs directory
        try:
            aws_utils.s3_download(bucket, s3_functional_mni_list,
                                  local_prefix=os.path.join(
                                      local_prefix, 'centrality_inputs'),
                                  bucket_prefix=bucket_prefix)
        except Exception as e:
            print('Unable to find eigenvector centrality inputs for '
                  'subject %s, skipping...' % sub_id)
            print('Error: %s' % e)
            return

        # Build strat dict (dictionary of strategies and local input paths)
        strat_dict = {
            'filt_global': os.path.join(local_prefix, 'centrality_inputs',
                                        filt_global),
            'filt_noglobal': os.path.join(local_prefix, 'centrality_inputs',
                                          filt_noglobal),
            'nofilt_noglobal': os.path.join(local_prefix, 'centrality_inputs',
                                            nofilt_noglobal),
            'nofilt_global': os.path.join(local_prefix, 'centrality_inputs',
                                          nofilt_global)
        }

        # Create list of processes
        proc_list = [
            Process(target=make_workflow,
                    args=(in_name, strat, sub_id, c, local_prefix))
            for strat, in_name in strat_dict.items()
        ]

        # Iterate through processes and fire off
        for p in proc_list:
            p.start()

        for p in proc_list:
            if p.is_alive():
                p.join()

        # Gather outputs
        wfs = glob.glob(os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))
        local_list = []
        for wf in wfs:
            for root, dirs, files in os.walk(wf):
                if files:
                    local_list.extend([os.path.join(root, f) for f in files])

        s3_list = [
            loc.replace(
                local_prefix,
                'data/Projects/ABIDE_Initiative/Outputs/cpac/raw_outputs_eigen'
            ) for loc in local_list
        ]
        aws_utils.s3_upload(bucket, local_list, s3_list)

        # And delete working directories
        try:
            for input_file in strat_dict.values():
                print('removing input file %s...' % input_file)
                os.remove(input_file)
        except Exception as e:
            print('Unable to remove input files')
            print('Error: %s' % e)

        work_dirs = glob.glob(
            os.path.join(local_prefix, 'eigen_wf_%s_*' % sub_id))
        for work_dir in work_dirs:
            print('removing %s...' % work_dir)
            shutil.rmtree(work_dir)
    else:
        print('subject %s already processed and uploaded, skipping...'
              % sub_id)
def AddConfig(self, event):
    '''
    Gets called when you click 'Load' for a pipeline configuration in the
    GUI: opens a file dialog, validates the chosen YAML file, and adds the
    pipeline to the listbox.
    '''
    current_dir = os.getcwd()

    dlg = wx.FileDialog(self,
                        message="Choose the CPAC Configuration file",
                        defaultDir=current_dir,
                        defaultFile="",
                        wildcard="YAML files (*.yaml, *.yml)|*.yaml;*.yml",
                        style=wx.OPEN | wx.CHANGE_DIR)

    # User clicks "OK"
    if dlg.ShowModal() == wx.ID_OK:
        # Load config file into memory and verify it's not a subject list
        path = dlg.GetPath()
        os.chdir(current_dir)

        # Check for path existence
        if os.path.exists(path):
            path = os.path.realpath(path)
            try:
                f_cfg = yaml.safe_load(open(path, 'r'))
            except Exception as e:
                print('\n\nUnable to load the specified file: %s' % path)
                print("The YAML file may not be formatted properly.")
                print('Error:\n%s\n\n' % e)
                raise Exception
            if type(f_cfg) == dict:
                if 'pipelineName' not in f_cfg and 'pipeline_dir' not in f_cfg:
                    err_msg = 'File is not a pipeline configuration ' \
                              'file. It might be a data configuration file.'
                    raise Exception(err_msg)
            else:
                err_msg = 'File is not a pipeline configuration ' \
                          'file. It might be a subject list file.'
                raise Exception(err_msg)
        # Otherwise, report error
        else:
            err_msg = 'File %s does not exist. Check and try again.' % path
            raise Exception(err_msg)

        # If config file is ok, proceed to load
        if self.check_config(path) > 0:
            while True:
                try:
                    c = Configuration(f_cfg)
                except Exception as e:
                    if "object has no attribute" in str(e):
                        err = "%s\n\nIs this attribute linked (using " \
                              "'${}') in any of your configuration " \
                              "parameters? (Standard template paths, " \
                              "for example). If this is a pipeline " \
                              "configuration file from an older version " \
                              "of CPAC, this parameter may be obsolete. " \
                              "Double-check your selections.\n\n" % e
                        print(err)
                    else:
                        print('\n\nERROR: Configuration file could not '
                              'be loaded properly - the file might be '
                              'access-protected or you might have '
                              'chosen the wrong file.\n')
                        print('Error name: main_window_0001\n')
                        print('Exception: %s\n\n' % e)

                # Valid pipeline name
                pipeline_name = None
                try:
                    pipeline_name = c.pipelineName
                except AttributeError:
                    pass
                try:
                    pipeline_name = c.pipeline_dir
                    pipeline_name = pipeline_name.split('/')[-1].replace(
                        'pipeline_', '')
                    pipeline_name = 'group_config_{0}'.format(pipeline_name)
                except AttributeError:
                    pass

                if pipeline_name is not None:
                    if self.pipeline_map.get(pipeline_name) is None:
                        # this runs if you click 'Load' on the main
                        # CPAC window, enter a path, and the pipeline
                        # name attribute of the pipeline config file
                        # you are loading does NOT already exist in
                        # the listbox, i.e., the proper condition
                        self.pipeline_map[str(pipeline_name)] = path
                        self.listbox.Append(str(pipeline_name))
                        dlg.Destroy()
                        break
                    else:
                        # this runs if you click 'Load' on the main
                        # CPAC window, enter a path, and the pipeline
                        # name attribute of the pipeline config file
                        # you are loading DOES already exist in
                        # the listbox, which is a conflict
                        dlg3 = wx.MessageDialog(
                            self,
                            'The \'Pipeline Name\' attribute of the '
                            'configuration file you are loading already '
                            'exists in one of the configuration files '
                            'listed under \'Pipelines\'.\n\nPlease change '
                            'the pipeline name attribute (not the filename) '
                            'from within the pipeline editor (under the '
                            '\'Output Settings\' tab in \'Environment '
                            'Setup\'), or load a new configuration file.'
                            '\n\nPipeline configuration with conflicting '
                            'name:\n%s' % pipeline_name,
                            'Conflicting Pipeline Names',
                            wx.OK | wx.ICON_ERROR)
                        dlg3.ShowModal()
                        dlg3.Destroy()
                        break
                # Pipeline name is None
                else:
                    dlg4 = wx.MessageDialog(
                        self,
                        'Warning: Pipeline name is blank.\n\nPlease edit '
                        'the pipeline_config.yml file in a text editor and '
                        'restore the pipelineName field.',
                        'Warning',
                        wx.OK | wx.ICON_ERROR)
                    dlg4.ShowModal()
                    dlg4.Destroy()
                    dlg.Destroy()
                    break
def create_yaml_from_template(d, template=DEFAULT_PIPELINE_FILE,
                              include_all=False):
    """Save dictionary to a YAML file, keeping the structure
    (such as first level comments and ordering) from the template

    It may not be fully robust to YAML structures, but it works
    for C-PAC config files!

    Parameters
    ----------
    d : dict

    template : str
        path to template

    include_all : bool
        include every key, even those that are unchanged

    Examples
    --------
    >>> import yaml
    >>> from CPAC.utils.configuration import Configuration
    >>> Configuration(yaml.safe_load(create_yaml_from_template({}))).dict(
    ...     ) == Configuration({}).dict()
    True
    """
    def _count_indent(line):
        '''Helper method to determine indentation level

        Parameters
        ----------
        line : str

        Returns
        -------
        number_of_indents : int

        Examples
        --------
        >>> _count_indent('No indent')
        0
        >>> _count_indent('    Four spaces')
        2
        '''
        return (len(line) - len(line.lstrip())) // 2

    def _create_import_dict(diff):
        '''Method to return a dict of only changes given a nested dict
        of (dict1_value, dict2_value) tuples

        Parameters
        ----------
        diff : dict
            output of `dct_diff`

        Returns
        -------
        dict
            dict of only changed values

        Examples
        --------
        >>> _create_import_dict({'anatomical_preproc': {
        ...     'brain_extraction': {'extraction': {
        ...         'run': ([True], False),
        ...         'using': (['3dSkullStrip'], ['niworkflows-ants'])}}}})
        {'anatomical_preproc': {'brain_extraction': {'extraction': {'run': False, 'using': ['niworkflows-ants']}}}}
        '''  # noqa
        if isinstance(diff, tuple) and len(diff) == 2:
            return diff[1]
        if isinstance(diff, dict):
            i = {}
            for k in diff:
                try:
                    j = _create_import_dict(diff[k])
                    if j != {}:
                        i[k] = j
                except KeyError:
                    continue
            return i
        return diff

    def _format_key(key, level):
        '''Helper method to format YAML keys

        Parameters
        ----------
        key : str

        level : int

        Returns
        -------
        yaml : str

        Examples
        --------
        >>> _format_key('base', 0)
        '\nbase: '
        >>> _format_key('indented', 2)
        '\n    indented: '
        '''
        return f'\n{" " * level * 2}{key}: '

    def _format_list_items(l, line_level):  # noqa: E741
        '''Helper method to handle lists in the YAML

        Parameters
        ----------
        l : list

        line_level : int

        Returns
        -------
        yaml : str

        Examples
        --------
        >>> _format_list_items([1, 2, {'nested': 3}], 0)
        '  - 1\n  - 2\n  - nested: 3'
        >>> _format_list_items([1, 2, {'nested': [3, {'deep': [4]}]}], 1)
        '    - 1\n    - 2\n    - nested:\n      - 3\n      - deep:\n        - 4'
        '''  # noqa
        # keep short, simple lists in square brackets
        if all([any([isinstance(item, item_type) for item_type in {
            str, bool, int, float
        }]) for item in l]):
            if len(str(l)) < 50:
                return str(l).replace("'", '').replace('"', '')
        # list long or complex lists on lines with indented '-' lead-ins
        indent = " " * (2 * line_level + 2)
        return '\n' + '\n'.join([
            f'{indent}{li}' for li in yaml.dump(
                yaml_bool(l)
            ).replace("'On'", 'On').replace("'Off'", 'Off').split('\n')
        ]).rstrip()

    # set starting values
    output = ''
    comment = ''
    space_match = r'^\s+.*'
    level = 0
    nest = []
    list_item = False
    list_level = 0
    line_level = 0
    template_name = template
    if isinstance(d, Configuration):
        d = d.dict()
    try:
        template = load_preconfig(template)
    except OptionError:
        if 'default' in template.lower():
            template = DEFAULT_PIPELINE_FILE
        assert os.path.exists(template) or os.path.islink(template), \
            f'{template_name} is not a defined preconfig or a valid path.'
    template_included = False

    # load default values
    d_default = Configuration(yaml.safe_load(open(template, 'r'))).dict()

    if (
        template == DEFAULT_PIPELINE_FILE or
        not dct_diff(yaml.safe_load(open(DEFAULT_PIPELINE_FILE, 'r')),
                     d_default)
    ):
        template_name = 'default'

    # update values
    if include_all:
        d_default.update(d)
        d = _create_import_dict(dct_diff({}, d_default))
    else:
        d = _create_import_dict(dct_diff(d_default, d))

    # generate YAML from template with updated values
    template_dict = yaml.safe_load(open(template, 'r'))
    with open(template, 'r') as f:
        for line in f:
            # persist comments and frontmatter
            if line.startswith('%') or line.startswith('---') or re.match(
                r'^\s*#.*$', line
            ):
                list_item = False
                line = line.strip('\n')
                comment += f'\n{line}'
            elif len(line.strip()):
                if re.match(space_match, line):
                    line_level = _count_indent(line)
                else:
                    line_level = 0

                # handle lists as a unit
                if list_item:
                    if line_level < list_level - 1:
                        list_item = False
                        level = list_level
                        list_level = 0
                elif line.lstrip().startswith('-'):
                    list_item = True
                    list_level = line_level - 1
                else:
                    # extract dict key
                    key_group = re.match(
                        r'^\s*(([a-z0-9A-Z_]+://){0,1}'
                        r'[a-z0-9A-Z_/][\sa-z0-9A-Z_/\.-]+)\s*:', line)
                    if key_group:
                        if not template_included:
                            # prepend comment from template
                            if len(comment.strip()):
                                comment = re.sub(
                                    r'(?<=# based on )(.* pipeline)',
                                    f'{template_name} pipeline',
                                    comment
                                )
                                output += comment
                                output += f'\nFROM: {template_name}\n'
                                comment = ''
                            template_included = True
                        key = key_group.group(1).strip()

                        # calculate key depth
                        if line_level == level:
                            if level > 0:
                                nest = nest[:-1] + [key]
                            else:
                                nest = [key]
                        elif line_level == level + 1:
                            nest += [key]
                        elif line_level < level:
                            nest = nest[:line_level] + [key]

                        # only include updated and new values
                        try:
                            # get updated value for key
                            value = lookup_nested_value(d, nest)
                            orig_value = lookup_nested_value(d_default, nest)

                            # Use 'On' and 'Off' for bools
                            if (isinstance(orig_value, bool) or (
                                isinstance(orig_value, str) and
                                orig_value in {'On', 'Off'}
                            ) or (isinstance(orig_value, list) and all([(
                                isinstance(orig_item, bool) or (
                                    isinstance(orig_item, str) and
                                    orig_item in {'On', 'Off'}
                                )
                            ) for orig_item in orig_value]))):
                                value = yaml_bool(value)

                            # prepend comment from template
                            if len(comment.strip()):
                                output += comment
                            else:
                                output += '\n'

                            # write YAML
                            output += _format_key(key, line_level)
                            if isinstance(value, list):
                                output += _format_list_items(
                                    value, line_level)
                            elif isinstance(value, dict):
                                for k in value.keys():
                                    try:
                                        lookup_nested_value(template_dict,
                                                            nest + [k])
                                    # include keys not in template
                                    except KeyError:
                                        output += _format_key(
                                            k, line_level + 1)
                                        output += (_format_list_items(
                                            value[k], line_level + 1)
                                            if isinstance(value[k], list)
                                            else yaml_bool(value[k]))
                            else:
                                output += str(value)
                        except KeyError:
                            # clear comment for excluded key
                            comment = '\n'

                        # reset variables for loop
                        comment = '\n'
                        level = line_level
            elif len(comment) > 1 and comment[-2] != '\n':
                comment += '\n'
    return output.lstrip('\n')
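# A hedged usage sketch of create_yaml_from_template() as defined above,
# following its own docstring: serialize only the values that differ from the
# default template. The override key and output filename are illustrative,
# not prescriptive.
if __name__ == '__main__':
    minimal_diff = {'pipeline_setup': {'pipeline_name': 'my_pipeline'}}
    rendered = create_yaml_from_template(minimal_diff)
    with open('my_pipeline.yml', 'w') as f:
        f.write(rendered)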