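# Illustrative sketch only: one entry of the participant list (data config)
# that run() consumes below. The top-level field names mirror how the code
# reads each entry (sub['subject_id'], sub['unique_id'], sub['anat'],
# sub['func'] / sub['rest']); the paths and the nested 'func' layout shown
# here are hypothetical.
#
#   - subject_id: '0025428'
#     unique_id: 'ses-1'
#     anat: /data/sub-0025428/ses-1/anat/sub-0025428_ses-1_T1w.nii.gz
#     func:
#       rest_run-1:
#         scan: /data/sub-0025428/ses-1/func/sub-0025428_ses-1_task-rest_bold.nii.gz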
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None, debug=False,
        test_config=False):

    # Import packages
    import glob
    import os
    import pickle
    import subprocess
    import time
    import warnings

    from multiprocessing import Process
    from time import strftime

    import yaml

    from CPAC.pipeline.cpac_pipeline import run_workflow

    # Configuration, track_run, run_condor_jobs, run_cpac_on_cluster and
    # anat_longitudinal_wf are assumed to be imported at module level
    # elsewhere in this file.

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources",
                                             "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        # Treat the input as a BIDS directory and build the participant list
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # Take a date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    config_file = os.path.realpath(config_file)
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.safe_load(open(config_file, 'r')))
    except IOError:
        print("config file %s doesn't exist" % config_file)
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (e.problem, e.problem_mark.line)
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        "    {1}"
                        "\n\n".format(config_file, error_detail))
    except Exception as e:
        raise Exception("Error parsing config file: {0}\n\n"
                        "Error details:\n"
                        "    {1}"
                        "\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less than 70 characters. "
                      "Long paths might not work in your operating system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            sublist = yaml.safe_load(open(subject_list_file, 'r'))
    except:
        print("Subject list is not in proper YAML format. Please check "
              "your file")
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print("\n\n" + "ERROR: Subject list file not in proper format - "
              "check if you loaded the correct file?" + "\n" +
              "Error name: cpac_runner_0001" + "\n\n")
        raise Exception
    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant' if not test_config else 'test',
                      participants=len(sublist))
        except:
            print("Usage tracking failed for this run.")

    # If we're running on a cluster, execute the job scheduler
    if c.runOnGrid:
        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)

    # Run on one computer
    else:
        # Create the working directory
        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        '''
        if not os.path.exists(c.logDirectory):
            try:
                os.makedirs(c.logDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the log " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.logDirectory
                raise Exception(err)
        '''

        # BEGIN LONGITUDINAL TEMPLATE PIPELINE
        if hasattr(c, 'run_longitudinal') and \
                ('anat' in c.run_longitudinal or 'func' in c.run_longitudinal):

            # Group the participant list into one list of sessions per subject
            subject_id_dict = {}
            for sub in sublist:
                if sub['subject_id'] in subject_id_dict:
                    subject_id_dict[sub['subject_id']].append(sub)
                else:
                    subject_id_dict[sub['subject_id']] = [sub]

            # subject_id_dict has the subject_id as keys and a list of
            # sessions for each participant as value
            valid_longitudinal_data = False
            for subject_id, sub_list in subject_id_dict.items():
                if len(sub_list) > 1:
                    valid_longitudinal_data = True
                    if 'func' in c.run_longitudinal:
                        raise Exception(
                            "\n\n[!] Error: The functional longitudinal "
                            "pipeline is still in development and will be "
                            "available in the next release. Please only run "
                            "the anatomical longitudinal pipeline for "
                            "now.\n\n")
                    if 'anat' in c.run_longitudinal:
                        strat_list = anat_longitudinal_wf(subject_id,
                                                          sub_list, c)
                elif len(sub_list) == 1:
                    warnings.warn(
                        "\n\nThere is only one anatomical session for "
                        "sub-%s. Longitudinal preprocessing will be skipped "
                        "for this subject.\n\n" % subject_id)

            # TODO
            # if 'func' in c.run_longitudinal:
            #     strat_list = func_preproc_longitudinal_wf(subject_id,
            #                                               sub_list, c)
            #     func_longitudinal_template_wf(subject_id, strat_list, c)
            if valid_longitudinal_data:
                rsc_file_list = []
                for dirpath, dirnames, filenames in os.walk(c.outputDirectory):
                    for f in filenames:
                        # TODO is there a better way to check output folder name?
                        if f != '.DS_Store' and \
                                'pipeline_analysis_longitudinal' in dirpath:
                            rsc_file_list.append(os.path.join(dirpath, f))

                subject_specific_dict = {
                    subj: [] for subj in subject_id_dict.keys()
                }
                session_specific_dict = {
                    os.path.join(session['subject_id'], session['unique_id']): []
                    for session in sublist
                }
                for rsc_path in rsc_file_list:
                    key = [s for s in session_specific_dict.keys()
                           if s in rsc_path]
                    if key:
                        session_specific_dict[key[0]].append(rsc_path)
                    else:
                        subj = [s for s in subject_specific_dict.keys()
                                if s in rsc_path]
                        if subj:
                            subject_specific_dict[subj[0]].append(rsc_path)

                # update individual-specific outputs:
                # anatomical_brain, anatomical_brain_mask and anatomical_reorient
                for key in session_specific_dict.keys():
                    for f in session_specific_dict[key]:
                        sub, ses = key.split('/')
                        ses_list = [subj for subj in sublist
                                    if sub in subj['subject_id']
                                    and ses in subj['unique_id']]
                        if len(ses_list) > 1:
                            raise Exception("There are several files "
                                            "containing " + f)
                        if len(ses_list) == 1:
                            ses = ses_list[0]
                            subj_id = ses['subject_id']
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(c.pipelineName + '_')[-1]: {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(
                                        c.pipelineName + '_')[-1]
                                    if ses['resource_pool'].get(strat_key) is None:
                                        ses['resource_pool'].update({
                                            strat_key: {keys[-2]: f}
                                        })
                                    else:
                                        ses['resource_pool'][strat_key].update({
                                            keys[-2]: f
                                        })
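                # Illustrative sketch only: after the loop above, each session
                # entry in `sublist` may carry a nested 'resource_pool' of the
                # form
                #     {<strategy key>: {<resource name>: <output file path>}}
                # e.g. (hypothetical values):
                #     {'anat_1': {'anatomical_brain': '/out/.../brain.nii.gz'}}
                # The loop below fills the same structure from the
                # subject-level (longitudinal template) outputs.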
                for key in subject_specific_dict:
                    for f in subject_specific_dict[key]:
                        ses_list = [subj for subj in sublist
                                    if key in subj['anat']]
                        for ses in ses_list:
                            tmp = f.split(c.outputDirectory)[-1]
                            keys = tmp.split(os.sep)
                            if keys[0] == '':
                                keys = keys[1:]
                            if len(keys) > 1:
                                if ses.get('resource_pool') is None:
                                    ses['resource_pool'] = {
                                        keys[0].split(c.pipelineName + '_')[-1]: {
                                            keys[-2]: f
                                        }
                                    }
                                else:
                                    strat_key = keys[0].split(
                                        c.pipelineName + '_')[-1]
                                    if ses['resource_pool'].get(strat_key) is None:
                                        ses['resource_pool'].update({
                                            strat_key: {keys[-2]: f}
                                        })
                                    else:
                                        if keys[-2] == 'anatomical_brain' or \
                                                keys[-2] == 'anatomical_brain_mask' or \
                                                keys[-2] == 'anatomical_skull_leaf':
                                            pass
                                        elif 'apply_warp_anat_longitudinal_to_standard' in keys[-2] or \
                                                'fsl_apply_xfm_longitudinal' in keys[-2]:
                                            # TODO update!!!
                                            # it assumes session id == last key
                                            # (ordered by session count instead of
                                            # session id) + 1
                                            # might cause problem if session id is
                                            # not continuous
                                            def replace_index(target1, target2,
                                                              file_path):
                                                index1 = file_path.index(target1) + len(target1)
                                                index2 = file_path.index(target2) + len(target2)
                                                file_str_list = list(file_path)
                                                file_str_list[index1] = "*"
                                                file_str_list[index2] = "*"
                                                file_path_updated = "".join(file_str_list)
                                                file_list = glob.glob(file_path_updated)
                                                file_list.sort()
                                                return file_list

                                            if ses['unique_id'] == str(int(keys[-2][-1]) + 1):
                                                if keys[-3] == 'seg_probability_maps':
                                                    f_list = replace_index(
                                                        'seg_probability_maps_',
                                                        'segment_prob_', f)
                                                    ses['resource_pool'][strat_key].update(
                                                        {keys[-3]: f_list})
                                                elif keys[-3] == 'seg_partial_volume_files':
                                                    f_list = replace_index(
                                                        'seg_partial_volume_files_',
                                                        'segment_pve_', f)
                                                    ses['resource_pool'][strat_key].update(
                                                        {keys[-3]: f_list})
                                                else:
                                                    ses['resource_pool'][strat_key].update({
                                                        keys[-3]: f
                                                        # keys[-3]: 'anatomical_to_standard'
                                                    })
                                        elif keys[-2] != 'warp_list':
                                            ses['resource_pool'][strat_key].update(
                                                {keys[-2]: f})
                                        elif keys[-2] == 'warp_list':
                                            if 'ses-' + ses['unique_id'] in tmp:
                                                ses['resource_pool'][strat_key].update(
                                                    {keys[-2]: f})

                for key in subject_specific_dict:
                    ses_list = [subj for subj in sublist if key in subj['anat']]
                    for ses in ses_list:
                        for reg_strat in strat_list:
                            try:
                                ss_strat_list = list(ses['resource_pool'])
                                for strat_key in ss_strat_list:
                                    try:
                                        ses['resource_pool'][strat_key].update({
                                            'registration_method':
                                                reg_strat['registration_method']
                                        })
                                    except KeyError:
                                        pass
                            except KeyError:
                                pass

                yaml.dump(sublist,
                          open(os.path.join(c.workingDirectory,
                                            'data_config_longitudinal.yml'),
                               'w'),
                          default_flow_style=False)

                print('\n\n' + 'Longitudinal pipeline completed.' + '\n\n')

                # skip main preprocessing
                if 1 not in c.runAnatomical and 1 not in c.runFunctional:
                    import sys
                    sys.exit()
        # END LONGITUDINAL TEMPLATE PIPELINE

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                run_workflow(sub, c, True, pipeline_timing_info, p_name,
                             plugin, plugin_args, test_config)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=run_workflow,
                    args=(sub, c, True, pipeline_timing_info, p_name,
                          plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we're allocating more processes than are subjects, run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print(p.pid, file=pid)

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print(p.pid, file=pid)
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print('found dead job ', job)
                            loc = job_queue.index(job)
                            del job_queue[loc]

                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()

                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1

                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()
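

# Minimal usage sketch, assuming this module is importable as shown; the data
# and config paths below are hypothetical placeholders, not defaults shipped
# with CPAC.
if __name__ == '__main__':
    run('/data/bids_dataset',                        # BIDS dir or data config YAML
        config_file='/configs/pipeline_config.yml',  # pipeline configuration
        num_subs_at_once=2,                          # run two participants in parallel
        tracking=False)                              # disable usage tracking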