def run(self, parallel=None, n_jobs=None):
    """Run the analysis: load data, run estimation, output results."""
    if parallel is None:
        result = self.execute()
    elif parallel == 'local':
        cfg_parallel = pyhrf.cfg['parallel-local']
        try:
            from joblib import Parallel, delayed
        except ImportError:
            raise Exception('Cannot import joblib. It is required to '
                            'enable parallel processing on a local machine.')
        # Map the logging level onto joblib's integer verbosity scale:
        effective_level = logger.getEffectiveLevel()
        if effective_level == logging.DEBUG:
            parallel_verb = 11
        elif effective_level == logging.INFO:
            parallel_verb = 2
        else:
            parallel_verb = 0
        if n_jobs is None:
            if cfg_parallel['nb_procs']:
                n_jobs = cfg_parallel['nb_procs']
            else:
                n_jobs = available_cpu_count()
        p = Parallel(n_jobs=n_jobs, verbose=parallel_verb)
        result = p(delayed(exec_t)(t)
                   for t in self.split(output_dir=None))
        # Join list of lists:
        result = list(itertools.chain.from_iterable(result))
    elif parallel == 'LAN':
        from pyhrf import grid
        cfg_parallel = pyhrf.cfg['parallel-LAN']
        remoteUser = cfg_parallel['user']

        # 1. Some checks on the input/output directory.
        remoteDir = cfg_parallel['remote_path']
        # At the end, results will be retrieved directly from remoteDir,
        # which therefore has to be readable.
        if remoteDir is None or not op.exists(remoteDir):
            raise Exception('Remote directory is not readable (%s). '
                            'Consider mounting it with sshfs.' % remoteDir)

        # Check whether remoteDir is writeable, so that we do not need to
        # upload data via ssh:
        remote_writeable = False
        if os.access(remoteDir, os.W_OK):
            remote_writeable = True
            tmpDir = remoteDir
        else:
            logger.info('Remote dir is not writeable -> using a tmp '
                        'dir to store split data, then uploading it.')
            tmpDir = pyhrf.get_tmp_path()

        # 2. Split ROI data.
        logger.info('Path to store sub treatments: %s', tmpDir)
        treatments_dump_files = []
        self.split(dump_sub_results=True, output_dir=tmpDir,
                   make_sub_outputs=False,
                   output_file_list=treatments_dump_files)

        # 3. Copy data to the remote directory.
        if not remote_writeable:
            host = cfg_parallel['remote_host']
            logger.info('Uploading data to %s ...', remoteDir)
            remote_input_files = remote_copy(treatments_dump_files,
                                             host, remoteUser, remoteDir)

        # 4. Create the job list.
        tasks_list = []
        for f in treatments_dump_files:
            f = op.join(remoteDir, op.basename(f))
            nice = cfg_parallel['niceness']
            tasks_list.append('nice -n %d %s -v%d -t "%s"'
                              % (nice, 'pyhrf_jde_estim',
                                 logger.getEffectiveLevel(), f))

        mode = 'dispatch'
        tasks = grid.read_tasks(';'.join(tasks_list), mode)
        timeslot = grid.read_timeslot('allday')
        hosts = grid.read_hosts(cfg_parallel['hosts'])
        if self.output_dir is not None:
            brokenfile = op.join(self.output_dir, 'pyhrf-broken_cmd.batch')
            logfile = op.join(self.output_dir, 'pyhrf-parallel.log')
            logger.info('Log file for process dispatching: %s', logfile)
        else:
            brokenfile = None
            logfile = None

        # 5. Launch the jobs.
        logger.info('Dispatching processes ...')
        try:
            grid.run_grid(mode, hosts, 'rsa', tasks, timeslot,
                          brokenfile, logfile, user=remoteUser)
            grid.kill_threads()
        except KeyboardInterrupt:
            grid.quit(None, None)

        # Retry the commands that failed on the first pass:
        if brokenfile is not None and \
                len(open(brokenfile).readlines()) > 0:
            logger.info('There are some broken commands, trying again ...')
            try:
                tasks = grid.read_tasks(brokenfile, mode)
                grid.run_grid(mode, hosts, 'rsa', tasks, timeslot,
                              brokenfile, logfile, user=remoteUser)
                grid.kill_threads()
            except KeyboardInterrupt:
                grid.quit(None, None)

        # 5.1 Grab everything back?
        # "scp %s@%s:%s %s" % (remoteUser, host,
        #                      op.join(remoteDir, 'result*'),
        #                      op.abspath(op.dirname(options.cfgFile)))
        # TODO: test if everything went fine.

        # 6. Merge all results and create outputs.
        result = []
        # TODO: scp if remoteDir is not readable.
        nb_treatments = len(treatments_dump_files)
        remote_result_files = [op.join(remoteDir, 'result_%04d.pck' % i)
                               for i in range(nb_treatments)]
        logger.info('remote_result_files: %s', str(remote_result_files))
        nres = len(filter(op.exists, remote_result_files))
        if nres == nb_treatments:
            logger.info('Grabbing results ...')
            for fnresult in remote_result_files:
                fresult = open(fnresult)
                result.append(cPickle.load(fresult)[0])
                fresult.close()
        else:
            logger.error('Found only %d result files (expected %d)',
                         nres, nb_treatments)
            logger.error('Something went wrong, check the log files')
        if not remote_writeable:
            logger.info('Cleaning tmp dir (%s)...', tmpDir)
            shutil.rmtree(tmpDir)
            logger.info('Cleaning up remote dir (%s) through ssh ...',
                        remoteDir)
            cmd = 'ssh %s@%s rm -f "%s" "%s"' \
                % (remoteUser, host,
                   ' '.join(remote_result_files),
                   ' '.join(remote_input_files))
            logger.info(cmd)
            os.system(cmd)
        else:
            if 0:  # Disabled: in-place cleanup of the writeable remote dir.
                logger.info('Cleaning up remote dir (%s)...', remoteDir)
                for f in os.listdir(remoteDir):
                    os.remove(op.join(remoteDir, f))
    elif parallel == 'cluster':
        from pyhrf.parallel import run_soma_workflow
        cfg = pyhrf.cfg['parallel-cluster']
        # Create a date-stamped tmp remote path:
        date_now = time.strftime('%c').replace(' ', '_').replace(':', '_')
        remote_path = op.join(cfg['remote_path'], date_now)
        logger.info('Create tmp remote dir: %s', remote_path)
        remote_mkdir(cfg['server'], cfg['user'], remote_path)
        t_name = 'default_treatment'
        tmp_dir = pyhrf.get_tmp_path()
        label_for_cluster = self.analyser.get_label()
        if self.output_dir is None:
            out_dir = pyhrf.get_tmp_path()
        else:
            out_dir = self.output_dir
        result = run_soma_workflow({t_name: self}, 'pyhrf_jde_estim',
                                   {t_name: tmp_dir}, cfg['server_id'],
                                   cfg['server'], cfg['user'],
                                   {t_name: remote_path},
                                   {t_name: op.abspath(out_dir)},
                                   label_for_cluster, wait_ending=True)
    else:
        raise Exception('Parallel mode "%s" not available' % parallel)

    logger.info('Retrieved %d results', len(result))
    return self.output(result, (self.result_dump_file is not None),
                       self.make_outputs)
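For orientation, here is a minimal usage sketch of this method. The import location and the default construction are assumptions (consistent with the `FMRITreatment` type named in the `prepare_treatment_jobs` docstring below), not a verbatim pyhrf recipe:

# Minimal usage sketch (hypothetical paths; assumes FMRITreatment, the
# class owning run(), is importable from pyhrf.core and accepts an
# output_dir argument).
from pyhrf.core import FMRITreatment

treatment = FMRITreatment(output_dir='./results')
# Serial run:
outputs = treatment.run()
# Local multiprocessing via joblib, with 4 workers:
outputs = treatment.run(parallel='local', n_jobs=4)
# Distributed runs; both rely on the 'parallel-LAN' / 'parallel-cluster'
# sections of the pyhrf configuration file:
outputs = treatment.run(parallel='LAN')
outputs = treatment.run(parallel='cluster')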
def prepare_treatment_jobs(treatment, tmp_local_dir, local_result_path,
                           local_user, local_host, remote_host,
                           remote_user, remote_path, label_for_cluster):
    """Prepare soma-workflow jobs to perform one treatment (i.e., one
    subject).

    Parameters
    ----------
    treatment : FMRITreatment
        the treatment defining the analysis
    tmp_local_dir : str
        a path where to store the temporary config file before sending it
        to the remote host
    local_result_path : str
        path where to store the final result
    local_user : str
        the user on the local host who enables SSH connection from the
        remote cluster
    local_host : str
        local host (used to send back the result)
    remote_host : str
        remote machine where the treatment will be run
    remote_user : str
        user login on the remote machine
    remote_path : str
        path on the remote machine where to store ROI data and analysis
        results
    label_for_cluster : str
        label prefix to name jobs in soma-workflow

    Returns
    -------
    a tuple (job_split, jobs, dependencies, mainGroup)
        job_split (Job)
            job handling the splitting of input data into ROI data
        jobs (list of Job)
            all jobs except the splitting job -> ROI analyses, result
            merge, scp of the result back to the local host, data cleaning
        dependencies (list of job pairs)
            define the pipeline structure
        mainGroup (Group)
            top-level object gathering all jobs for this treatment
    """
    # roiFiles contains the list of files that will be produced by job_split:
    roiFiles, roiIds = treatment.dump_roi_datasets(dry=True)
    logger.info('Get list of split data files ... %d files', len(roiFiles))

    datafiles = treatment.get_data_files()
    # Make all paths relative in the treatment config file so that data
    # files can be found on the cluster file system:
    treatment.replace_data_dir('./')
    remote_cfg_file = op.join(tmp_local_dir, './detectestim_remote.xml')
    treatment.set_init_param('make_outputs', False)
    logger.info('Save remote treatment to %s', remote_cfg_file)
    save_treatment(treatment, remote_cfg_file)

    logger.info('Upload input data')
    # All data which are the inputs of the workflow:
    data_to_upload = datafiles + [remote_cfg_file]
    remote_input_files = remote_copy(data_to_upload, remote_host,
                                     remote_user, remote_path)
    logger.info('Remove tmp remote cfg file')
    os.remove(remote_cfg_file)

    logger.info('Prepare jobs ...')
    logger.info('Job split ...')
    verbose_level = logger.getEffectiveLevel()
    cmd = ["pyhrf_split_roidata", "-c", basename(remote_cfg_file),
           "-v %d" % verbose_level, "-d", "./"]
    logger.info('-> %s', cmd)
    job_split = Job(cmd, working_directory=remote_path, name="roi_split")

    logger.info('Jobs JDE ...')
    jobs_jde = [Job(["pyhrf_jde_estim", "-c", basename(remote_cfg_file),
                     "-r", basename(roiFile), "-v %d" % verbose_level],
                    working_directory=remote_path,
                    name="jde_r%04d" % roiId)
                for roiFile, roiId in zip(roiFiles, roiIds)]
    logger.info('First jde job -> %s', jobs_jde[0].command)
    # Files produced by all JDE jobs, which will then be used as inputs of
    # the merge job:
    resultFiles = ["result_%04d.pck" % iroi for iroi in roiIds]

    logger.info('Job pack result ...')
    # Output of the merge job, which has to be transferred back to local:
    remote_resultFile = './result.pck'
    logger.info('Remote result file: %s', remote_resultFile)
    cmd = ["pyhrf_pack_results", '-v1', '-o', remote_resultFile] + resultFiles
    logger.info('cmd pack result: %s', cmd)
    job_merge = Job(cmd, working_directory=remote_path, name="merge_results")

    # Retrieve the result file:
    # local_host = "132.166.200.5"  # HACK
    # cmd = ["pyhrf_shell_cmd", "scp", "-C", remote_resultFile,
    #        "%s@%s:\"%s\"" % (local_user, local_host, local_result_path)]
    cmd = ["scp", "-C", remote_resultFile,
           "%s@%s:\"%s\"" % (local_user, local_host, local_result_path)]
    logger.info('cmd scp result: %s', cmd)
    job_scp_result = Job(cmd, working_directory=remote_path,
                         name="scp_result")

    # Clean everything: all input files, split ROI data, the result for
    # each ROI and the merged result:
    cmd = ["rm", "-f", remote_resultFile] + \
        map(basename, roiFiles) + resultFiles + remote_input_files
    logger.info('cmd clean: %s', cmd)
    job_clean = Job(cmd, working_directory=remote_path, name="clean_files")

    logger.info('Setup of work flow ...')

    # Build the job list, the dependencies and the root group:
    clean = True
    if clean:
        nodes = [job_merge, job_scp_result, job_clean] + jobs_jde
    else:
        nodes = [job_merge, job_scp_result] + jobs_jde

    dependencies = []
    for jj in jobs_jde:
        dependencies.append((job_split, jj))
        dependencies.append((jj, job_merge))
    dependencies.append((job_merge, job_scp_result))
    if clean:
        dependencies.append((job_scp_result, job_clean))

    jjGroup = Group(elements=jobs_jde, name=label_for_cluster + '-roi_jobs')
    if clean:
        elements = [job_split, jjGroup, job_merge, job_scp_result,
                    job_clean]
    else:
        elements = [job_split, jjGroup, job_merge, job_scp_result]
    mainGroup = Group(name=label_for_cluster, elements=elements)

    return job_split, nodes, dependencies, mainGroup