def prepare_treatment_jobs(treatment, tmp_local_dir, local_result_path,
                           local_user, local_host, remote_host,
                           remote_user, remote_path, label_for_cluster):
    """
    Prepare somaworkflow jobs to perform one treatment (ie one subject).

    Pipeline built here: split input data into per-ROI files -> one JDE
    estimation job per ROI -> merge all ROI results into a single file ->
    scp the merged result back to the local host -> clean remote files.

    Args:
        treatment (FMRITreatment): the treatment defining the analysis
        tmp_local_dir (str): a path where to store the temporary config file
                             before sending it to the remote host
        local_result_path (str): path where to store the final result
        local_user (str): the user on the local host who enables SHH
                          connection from the remote cluster
        local_host (str): local host (used to send back the result)
        remote_host (str): remote machine where the treatment will be run
        remote_user (str): user login on the remote machine.
        remote_path (str): path on the remote machine where to store ROI data
                           and analysis results
        label_for_cluster (str): label prefix to name job in somaworkflow

    Returns:
        a tuple (job_split, jobs, dependencies, mainGroup)
        job_split (Job): job handling splitting of input data into ROI data
        jobs (list of Job): all jobs except the splitting jobs
                            -> roi analyses, result merge,
                               scp of result back to local host, data cleaning
        dependencies (list of Job pairs): define the pipeline structure
        mainGroup (Group): top-level object gathering all jobs for this
                           treatment.
    """
    # roiFiles contains the list of files that will be produced by job_split
    # (dry=True: only compute the file names, don't actually write them).
    roiFiles, roiIds = treatment.dump_roi_datasets(dry=True)
    pyhrf.verbose(1, 'Get list of splitted data files ... %d files' \
                  %len(roiFiles))
    datafiles = treatment.get_data_files()

    # Make all path be relative in the treatment config file
    # so that data file can be found on the cluster file system
    treatment.replace_data_dir('./')
    remote_cfg_file = op.join(tmp_local_dir, './detectestim_remote.xml')
    treatment.set_init_param('make_outputs', False)
    pyhrf.verbose(1, 'Save remote treatment to %s' %remote_cfg_file)
    save_treatment(treatment, remote_cfg_file)

    pyhrf.verbose(1, 'Upload input data')
    # All data which are the inputs of the workflow:
    data_to_upload = datafiles + [remote_cfg_file]
    remote_input_files = remote_copy(data_to_upload, remote_host,
                                     remote_user, remote_path)
    #print 'remote_input_files:'
    #print remote_input_files

    # The config file has been uploaded; the local temporary copy is no
    # longer needed.
    pyhrf.verbose(1, 'Remove tmp remote cfg file')
    os.remove(remote_cfg_file)

    pyhrf.verbose(1, 'Prepare jobs ...')
    pyhrf.verbose(1, 'Job split ...')
    verbosity = pyhrf.verbose.verbosity
    # Job that splits the input data into one file per ROI, run in
    # remote_path on the cluster.
    cmd = ["pyhrf_split_roidata", "-c", basename(remote_cfg_file),
           "-v %d" %verbosity, "-d", "./"]
    pyhrf.verbose(2, '-> %s' %cmd)
    job_split = Job(cmd, working_directory=remote_path, name="roi_split")

    # One JDE estimation job per ROI file produced by job_split.
    pyhrf.verbose(1, 'Jobs JDE ...')
    jobs_jde = [Job(["pyhrf_jde_estim", "-c", basename(remote_cfg_file),
                     "-r", basename(roiFile), "-v %d" %verbosity],
                    working_directory=remote_path,
                    name="jde_r%04d" %roiId)
                for roiFile, roiId in zip(roiFiles, roiIds)]
    pyhrf.verbose(2, 'First jde job -> %s' %jobs_jde[0].command)

    # Files produced by all JDE jobs, which will be then used as input of the
    # merge job:
    resultFiles = ["result_%04d.pck" %iroi for iroi in roiIds]

    pyhrf.verbose(1, 'Job pack result ...')
    # Output of the merge job, which has to transfered back to local:
    remote_resultFile = './result.pck'
    pyhrf.verbose(1, 'Remote result file: %s' %remote_resultFile)
    cmd = ["pyhrf_pack_results", '-v1', '-o', remote_resultFile] + resultFiles
    pyhrf.verbose(3, 'cmd pack result: %s' %cmd)
    job_merge = Job(cmd, working_directory=remote_path, name="merge_results")

    # Retrieve result file: scp the merged result back to the local host
    # (requires that local_user@local_host accepts SSH from the cluster).
    #local_host = "132.166.200.5" #HACK
    #cmd = ["pyhrf_shell_cmd", "scp","-C",remote_resultFile, "%s@%s:\"%s\"" \
    #       %(local_user,local_host,local_result_path)]
    cmd = ["scp", "-C", remote_resultFile,
           "%s@%s:\"%s\"" %(local_user, local_host, local_result_path)]
    pyhrf.verbose(2, 'cmd scp result: %s' %cmd)
    job_scp_result = Job(cmd, working_directory=remote_path,
                         name="scp_result")

    # Clean everything:
    # -> all input files, splitted roi data, result for each data,
    #    merged result:
    #cmd = ["pyhrf_shell_cmd", "rm","-f", remote_resultFile] + \
    #    map(basename, roiFiles) + resultFiles + remote_input_files
    #pyhrf.verbose(3, 'cmd clean: %s' %cmd)
    cmd = ["rm", "-f", remote_resultFile] + \
        map(basename, roiFiles) + resultFiles + remote_input_files
    pyhrf.verbose(3, 'cmd clean: %s' %cmd)
    job_clean = Job(cmd, working_directory=remote_path, name="clean_files")

    pyhrf.verbose(1, 'Setup of work flow ...')

    # Build the Job lists, dependencies and group.
    # NOTE: 'clean' is hard-wired to True here; the else branches below are
    # kept for easy toggling during debugging.
    clean = True
    if clean:
        nodes = [job_merge, job_scp_result, job_clean] + jobs_jde
    else:
        nodes = [job_merge, job_scp_result] + jobs_jde

    # Pipeline structure: split -> every JDE job -> merge -> scp [-> clean]
    dependencies = []
    for jj in jobs_jde:
        dependencies.append((job_split, jj))
        dependencies.append((jj, job_merge))
    dependencies.append((job_merge, job_scp_result))
    if clean:
        dependencies.append((job_scp_result, job_clean))

    # Group the per-ROI jobs together, then gather everything under one
    # top-level group named after this treatment.
    jjGroup = Group(elements=jobs_jde, name=label_for_cluster + '-roi_jobs')
    if clean:
        elements = [job_split, jjGroup, job_merge,
                    job_scp_result, job_clean]
    else:
        elements = [job_split, jjGroup, job_merge,
                    job_scp_result]
    mainGroup = Group(name=label_for_cluster, elements=elements)

    return job_split, nodes, dependencies, mainGroup
def run(self, parallel=None, n_jobs=None): """ Run the the analysis: load data, run estimation, output results """ if parallel is None: result = self.execute() elif parallel == 'local': cfg_parallel = pyhrf.cfg['parallel-local'] try: from joblib import Parallel, delayed except ImportError: print 'Can not import joblib. It is required to enable '\ 'parallel processing on a local machine.' sys.exit(1) parallel_verb = pyhrf.verbose.verbosity if pyhrf.verbose.verbosity == 6: parallel_verb = 10 if n_jobs is None: n_jobs = cfg_parallel['nb_procs'] p = Parallel(n_jobs=n_jobs, verbose=parallel_verb) result = p(delayed(exec_t)(t) for t in self.split()) # join list of lists: result = list(itertools.chain.from_iterable(result)) elif parallel == 'LAN': from pyhrf import grid cfg_parallel = pyhrf.cfg['parallel-LAN'] remoteUser = cfg_parallel['user'] #1. Some checks on input/output directory remoteDir = cfg_parallel['remote_path'] # At the end, results will be retrieved direclty from remoteDir, # which has to be readable if remoteDir is None or not op.exists(remoteDir): raise Exception('Remote directory is not readable (%s)' \ %remoteDir) # Try if remoteDir is writeable, so that we don't need to upload # data via ssh remote_writeable = False if os.access(remoteDir, os.W_OK): remote_writeable = True tmpDir = remoteDir else: pyhrf.verbose(1, 'Remote dir is not writeable -> using tmp ' \ 'dir to store splitted data & then upload.') #2. split roi data pyhrf.verbose(1, 'Path to store sub treatments: %s' %tmpDir) treatments_dump_files = [] self.split(dump_sub_results=True, output_dir=tmpDir, make_sub_outputs=False, output_file_list=treatments_dump_files) #3. copy data to remote directory if not remote_writeable: host = cfg_parallel['remote_host'] pyhrf.verbose(1, 'Uploading data to %s ...' %(remoteDir)) remote_input_files = pio.remote_copy(treatments_dump_files, host, remoteUser, remoteDir) #4. 
create job list tasks_list = [] for f in treatments_dump_files: f = op.join(remoteDir,op.basename(f)) nice = cfg_parallel['niceness'] tasks_list.append('nice -n %d %s -v%d -t "%s"' \ %(nice,'pyhrf_jde_estim', pyhrf.verbose.verbosity,f)) mode = 'dispatch' tasks = grid.read_tasks(';'.join(tasks_list), mode) timeslot = grid.read_timeslot('allday') hosts = grid.read_hosts(cfg_parallel['hosts']) brokenfile = op.join(tmpDir, 'pyhrf-broken_cmd.batch') logfile = op.join(self.output_dir, 'pyhrf-parallel.log') pyhrf.verbose(1, 'Log file for process dispatching: %s' \ %logfile) #3. launch them pyhrf.verbose(1, 'Dispatching processes ...') try: grid.run_grid(mode, hosts, 'rsa', tasks, timeslot, brokenfile, logfile, user=remoteUser) grid.kill_threads() except KeyboardInterrupt: grid.quit(None, None) if len(open(brokenfile).readlines()) > 0: pyhrf.verbose(1, 'There are some broken commands, '\ 'trying again ...') try: tasks = grid.read_tasks(brokenfile, mode) grid.run_grid(mode, hosts, 'rsa', tasks, timeslot, brokenfile, logfile, user=remoteUser) grid.kill_threads() except KeyboardInterrupt: grid.quit(None, None) #3.1 grab everything back ?? #try: # "scp %s@%s:%s %s" %(remoteUser,host, # op.join(remoteDir,'result*'), # op.abspath(op.dirname(options.cfgFile)))) #TODO : test if everything went fine #4. 
merge all results and create outputs result = [] #if op.exists(remoteDir): TODO :scp if remoteDir not readable nb_treatments = len(treatments_dump_files) remote_result_files = [op.join(remoteDir, 'result_%04d.pck' %i) \ for i in range(nb_treatments)] pyhrf.verbose(1,'remote_result_files: %s', str(remote_result_files)) nres = len(filter(op.exists,remote_result_files)) if nres == nb_treatments: pyhrf.verbose(1, 'Grabbing results ...') for fnresult in remote_result_files: fresult = open(fnresult) result.append(cPickle.load(fresult)) fresult.close() else: print 'Found only %d result files (expected %d)' \ %(nres, nb_treatments) print 'Something went wrong, check the log files' if not remote_writeable: pyhrf.verbose(1, 'Cleaning tmp dir (%s)...' %tmpDir) shutil.rmtree(tmpDir) pyhrf.verbose(1, 'Cleaning up remote dir (%s) through ssh ...' \ %remoteDir) cmd = 'ssh %s@%s rm -f "%s" "%s" "%s"' \ %(remoteUser, host, ' '.join(remote_result_files), ' '.join(remote_input_files)) pyhrf.verbose(2, cmd) os.system(cmd) else: pyhrf.verbose(1, 'Cleaning up remote dir (%s)...' 
%remoteDir) for f in os.listdir(remoteDir): os.remove(op.join(remoteDir,f)) elif parallel == 'cluster': from pyhrf.parallel import run_soma_workflow cfg = pyhrf.cfg['parallel-cluster'] #create tmp remote path: date_now = time.strftime('%c').replace(' ','_').replace(':','_') remote_path = op.join(cfg['remote_path'], date_now) pyhrf.verbose(1,'Create tmp remote dir: %s' %remote_path) pio.remote_mkdir(cfg['server'], cfg['user'], remote_path) #if self.result_dump_file t_name = 'default_treatment' tmp_dir = pyhrf.get_tmp_path() label_for_cluster = self.analyser.get_label() if self.output_dir is None: out_dir = pyhrf.get_tmp_path() else: out_dir = self.output_dir result = run_soma_workflow({t_name:self}, 'pyhrf_jde_estim', {t_name:tmp_dir}, cfg['server_id'], cfg['server'], cfg['user'], {t_name:remote_path}, {t_name:op.abspath(out_dir)}, label_for_cluster, wait_ending=True) else: raise Exception('Parallel mode "%s" not available' %parallel) pyhrf.verbose(1, 'Retrieved %d results' %len(result)) return self.output(result, (self.result_dump_file is not None), self.make_outputs)