def load_config_yml(config_file, individual=False):

    # loads a configuration YAML file
    #
    # input
    #   config_file: full filepath to YAML (.yml) file
    #
    # output
    #   config: Configuration object

    import os
    import yaml
    from CPAC.utils import Configuration

    try:
        config_path = os.path.realpath(config_file)

        with open(config_path, "r") as f:
            config_dict = yaml.load(f)

        config = Configuration(config_dict)

    except Exception as e:
        err = "\n\n[!] CPAC says: Could not load or read the configuration " \
              "YAML file:\n%s\nDetails: %s\n\n" % (config_file, e)
        raise Exception(err)

    if individual:
        config.logDirectory = os.path.abspath(config.logDirectory)
        config.workingDirectory = os.path.abspath(config.workingDirectory)
        config.outputDirectory = os.path.abspath(config.outputDirectory)
        config.crashLogDirectory = os.path.abspath(config.crashLogDirectory)

    return config
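
# --- Illustrative sketch (not part of CPAC) --------------------------------
# A minimal, self-contained example of the same "load the YAML, then resolve
# the directory settings to absolute paths" pattern used above, with a plain
# dict standing in for CPAC's Configuration object. The helper name and the
# example path are hypothetical; yaml.safe_load is used here only for the
# sketch, whereas the function above calls yaml.load as written.
def _load_config_sketch(config_file):
    import os
    import yaml

    with open(os.path.realpath(config_file), "r") as f:
        cfg = yaml.safe_load(f)  # plain dict of pipeline settings

    # resolve the directory keys the runner relies on, if present
    for key in ("logDirectory", "workingDirectory",
                "outputDirectory", "crashLogDirectory"):
        if key in cfg:
            cfg[key] = os.path.abspath(cfg[key])
    return cfg

# Example usage (hypothetical path):
#   cfg = _load_config_sketch("/path/to/pipeline_config.yml")
#   print(cfg["outputDirectory"])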
def run(config_file, subject_list_file, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None, debug=False):
    '''
    Run individual-level C-PAC pipelines for every entry in the subject
    list, either through a cluster scheduler or locally as a pool of
    worker processes.
    '''

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    # (strftime, yaml, Configuration, Process, validate, track_run,
    #  run_condor_jobs and run_cpac_on_cluster are expected to be imported
    #  at module level in the surrounding file)

    # Init variables
    config_file = os.path.realpath(config_file)
    subject_list_file = os.path.realpath(subject_list_file)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except Exception as e:
        raise Exception("Error reading config file - {0}\n\nError details:"
                        "\n{1}\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    validate(c)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        with open(subject_list_file, 'r') as sf:
            sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        track_run(level='participant', participants=len(sublist))

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, True, pipeline_timing_info,
                              p_name, plugin, plugin_args)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [Process(target=prep_workflow,
                             args=(sub, c, True, pipeline_timing_info,
                                   p_name, plugin, plugin_args))
                     for sub in sublist]

        # If we're allocating more processes than are subjects, run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >>pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc: idc+c.numParticipantsAtOnce]:
                        p.start()
                        print >>pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print 'found dead job ', job
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()
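
# --- Illustrative sketch (not part of CPAC) --------------------------------
# A self-contained, simplified version of the job-queue logic in the
# non-grid branch above: start at most `limit` worker processes, poll for
# finished ones, and launch the next pending process as slots free up. The
# worker function, sleep intervals and parameter names are stand-ins for
# prep_workflow and numParticipantsAtOnce, not CPAC code.
def _sketch_work(i):
    # stand-in for prep_workflow; sleeps briefly to simulate a pipeline run
    import time
    time.sleep(0.5)

def _run_incrementally_sketch(n_jobs=5, limit=2):
    import time
    from multiprocessing import Process

    processes = [Process(target=_sketch_work, args=(i,))
                 for i in range(n_jobs)]
    job_queue = []
    idx = 0

    # launch the first batch of at most `limit` processes
    for p in processes[:limit]:
        p.start()
        job_queue.append(p)
        idx += 1

    # keep refilling the queue until every process has started and finished
    while idx < len(processes) or job_queue:
        for job in list(job_queue):
            if not job.is_alive():
                job.join()
                job_queue.remove(job)
                if idx < len(processes):
                    processes[idx].start()
                    job_queue.append(processes[idx])
                    idx += 1
        time.sleep(0.1)  # avoid spinning at 100% CPU

# Example usage (run under an `if __name__ == "__main__":` guard):
#   _run_incrementally_sketch(n_jobs=6, limit=3)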
def run(config_file, subject_list_file, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None):
    '''
    Run individual-level C-PAC pipelines for every entry in the subject
    list, building the strategies list needed by prep_workflow for symlink
    creation, either through a cluster scheduler or locally as a pool of
    worker processes.
    '''

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    # Init variables
    config_file = os.path.realpath(config_file)
    subject_list_file = os.path.realpath(subject_list_file)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except Exception as e:
        raise Exception("Error reading config file - {0}\n\nError details:"
                        "\n{1}\n\n".format(config_file, e))

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    validate(c)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        with open(subject_list_file, 'r') as sf:
            sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # NOTE: strategies list is only needed in cpac_pipeline prep_workflow for
    #       creating symlinks
    strategies = sorted(build_strategies(c))

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        track_run(level='participant', participants=len(sublist))

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Create strategies file
        strategies_file = os.path.join(cluster_files_dir, 'strategies.pkl')
        with open(strategies_file, 'w') as f:
            pickle.dump(strategies, f)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, strategies_file,
                            subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                strategies_file, cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, strategies, 1, pipeline_timing_info,
                              p_name, plugin, plugin_args)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, strategies, 1, pipeline_timing_info,
                          p_name, plugin, plugin_args))
            for sub in sublist
        ]

        # If we're allocating more processes than are subjects, run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >> pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc:idc + c.numParticipantsAtOnce]:
                        p.start()
                        print >> pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print 'found dead job ', job
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()
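
# --- Illustrative sketch (not part of CPAC) --------------------------------
# A small, self-contained example of the strategies round-trip used in the
# cluster branch above: the runner pickles the sorted strategies list into
# cluster_files_dir so that submitted jobs can read it back. The helper name,
# directory and list contents are hypothetical; binary file modes are used
# here for portability, whereas the code above opens the file in text mode.
def _strategies_roundtrip_sketch(cluster_files_dir, strategies):
    import os
    import pickle

    strategies_file = os.path.join(cluster_files_dir, 'strategies.pkl')
    with open(strategies_file, 'wb') as f:
        pickle.dump(sorted(strategies), f)

    with open(strategies_file, 'rb') as f:   # what a cluster job would do
        return pickle.load(f)

# Example usage (hypothetical values):
#   _strategies_roundtrip_sketch('/tmp/cluster_files', ['strat_1', 'strat_0'])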
def run(subject_list_file, config_file=None, p_name=None, plugin=None,
        plugin_args=None, tracking=True, num_subs_at_once=None,
        debug=False, test_config=False):
    '''
    Run individual-level C-PAC pipelines for every entry in the subject
    list (a C-PAC YAML subject list or a BIDS data directory), falling back
    to the packaged pipeline_config_template.yml when no config is given.
    '''

    # Import packages
    import commands
    import os
    import pickle
    import time

    from CPAC.pipeline.cpac_pipeline import prep_workflow

    print('Run called with config file {0}'.format(config_file))

    if not config_file:
        import pkg_resources as p
        config_file = \
            p.resource_filename("CPAC",
                                os.path.join("resources",
                                             "configs",
                                             "pipeline_config_template.yml"))

    # Init variables
    sublist = None
    config_file = os.path.realpath(config_file)
    if '.yaml' in subject_list_file or '.yml' in subject_list_file:
        subject_list_file = os.path.realpath(subject_list_file)
    else:
        from CPAC.utils.bids_utils import collect_bids_files_configs, \
            bids_gen_cpac_sublist
        (file_paths, config) = collect_bids_files_configs(subject_list_file,
                                                          None)
        sublist = bids_gen_cpac_sublist(subject_list_file, file_paths,
                                        config, None)
        if not sublist:
            import sys
            print("Did not find data in {0}".format(subject_list_file))
            sys.exit(1)

    # take date+time stamp for run identification purposes
    unique_pipeline_id = strftime("%Y%m%d%H%M%S")
    pipeline_start_stamp = strftime("%Y-%m-%d_%H:%M:%S")

    # Load in pipeline config file
    try:
        if not os.path.exists(config_file):
            raise IOError
        else:
            c = Configuration(yaml.load(open(config_file, 'r')))
    except IOError:
        print "config file %s doesn't exist" % config_file
        raise
    except yaml.parser.ParserError as e:
        error_detail = "\"%s\" at line %d" % (
            e.problem,
            e.problem_mark.line
        )
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, error_detail)
        )
    except Exception as e:
        raise Exception(
            "Error parsing config file: {0}\n\n"
            "Error details:\n"
            "    {1}"
            "\n\n".format(config_file, e)
        )

    c.logDirectory = os.path.abspath(c.logDirectory)
    c.workingDirectory = os.path.abspath(c.workingDirectory)
    if 's3://' not in c.outputDirectory:
        c.outputDirectory = os.path.abspath(c.outputDirectory)
    c.crashLogDirectory = os.path.abspath(c.crashLogDirectory)

    if debug:
        c.write_debugging_outputs = "[1]"

    if num_subs_at_once:
        if not str(num_subs_at_once).isdigit():
            raise Exception('[!] Value entered for --num_cores not a digit.')
        c.numParticipantsAtOnce = int(num_subs_at_once)

    # Do some validation
    if not c.workingDirectory:
        raise Exception('Working directory not specified')

    if len(c.workingDirectory) > 70:
        warnings.warn("We recommend that the working directory full path "
                      "should have less than 70 characters. "
                      "Long paths might not work in your operating system.")
        warnings.warn("Current working directory: %s" % c.workingDirectory)

    # Get the pipeline name
    p_name = p_name or c.pipelineName

    # Load in subject list
    try:
        if not sublist:
            with open(subject_list_file, 'r') as sf:
                sublist = yaml.load(sf)
    except:
        print "Subject list is not in proper YAML format. Please check " \
              "your file"
        raise Exception

    # Populate subject scan map
    sub_scan_map = {}
    try:
        for sub in sublist:
            if sub['unique_id']:
                s = sub['subject_id'] + "_" + sub["unique_id"]
            else:
                s = sub['subject_id']

            scan_ids = ['scan_anat']

            if 'func' in sub:
                for id in sub['func']:
                    scan_ids.append('scan_' + str(id))

            if 'rest' in sub:
                for id in sub['rest']:
                    scan_ids.append('scan_' + str(id))

            sub_scan_map[s] = scan_ids
    except:
        print "\n\n" + "ERROR: Subject list file not in proper format - " \
              "check if you loaded the correct file?" + "\n" + \
              "Error name: cpac_runner_0001" + "\n\n"
        raise Exception

    pipeline_timing_info = []
    pipeline_timing_info.append(unique_pipeline_id)
    pipeline_timing_info.append(pipeline_start_stamp)
    pipeline_timing_info.append(len(sublist))

    if tracking:
        try:
            track_run(level='participant', participants=len(sublist))
        except:
            pass

    # If we're running on cluster, execute job scheduler
    if c.runOnGrid:

        # Create cluster log dir
        cluster_files_dir = os.path.join(c.logDirectory, 'cluster_files')
        if not os.path.exists(cluster_files_dir):
            os.makedirs(cluster_files_dir)

        # Check if it's a condor job, and run that
        if 'condor' in c.resourceManager.lower():
            run_condor_jobs(c, config_file, subject_list_file, p_name)
        # All other schedulers are supported
        else:
            run_cpac_on_cluster(config_file, subject_list_file,
                                cluster_files_dir)

    # Run on one computer
    else:

        if not os.path.exists(c.workingDirectory):
            try:
                os.makedirs(c.workingDirectory)
            except:
                err = "\n\n[!] CPAC says: Could not create the working " \
                      "directory: %s\n\nMake sure you have permissions " \
                      "to write to this directory.\n\n" % c.workingDirectory
                raise Exception(err)

        # If it only allows one, run it linearly
        if c.numParticipantsAtOnce == 1:
            for sub in sublist:
                prep_workflow(sub, c, True, pipeline_timing_info,
                              p_name, plugin, plugin_args, test_config)
            return

        pid = open(os.path.join(c.workingDirectory, 'pid.txt'), 'w')

        # Init job queue
        job_queue = []

        # Allocate processes
        processes = [
            Process(target=prep_workflow,
                    args=(sub, c, True, pipeline_timing_info,
                          p_name, plugin, plugin_args, test_config))
            for sub in sublist
        ]

        # If we're allocating more processes than are subjects, run them all
        if len(sublist) <= c.numParticipantsAtOnce:
            for p in processes:
                p.start()
                print >>pid, p.pid

        # Otherwise manage resources to run processes incrementally
        else:
            idx = 0
            while idx < len(sublist):
                # If the job queue is empty and we haven't started indexing
                if len(job_queue) == 0 and idx == 0:
                    # Init subject process index
                    idc = idx
                    # Launch processes (one for each subject)
                    for p in processes[idc: idc+c.numParticipantsAtOnce]:
                        p.start()
                        print >>pid, p.pid
                        job_queue.append(p)
                        idx += 1
                # Otherwise, jobs are running - check them
                else:
                    # Check every job in the queue's status
                    for job in job_queue:
                        # If the job is not alive
                        if not job.is_alive():
                            # Find job and delete it from queue
                            print 'found dead job ', job
                            loc = job_queue.index(job)
                            del job_queue[loc]
                            # ...and start the next available process
                            # (subject)
                            processes[idx].start()
                            # Append this to job queue and increment index
                            job_queue.append(processes[idx])
                            idx += 1
                    # Add sleep so while loop isn't consuming 100% of CPU
                    time.sleep(2)

        # Close PID txt file to indicate finish
        pid.close()