def _mail_unless_quiet(quiet, project_name, sample_id, engine, level, info_text):
    """Send an analysis notification e-mail unless notifications are muted."""
    if not quiet:
        mail_analysis(project_name=project_name, sample_name=sample_id,
                      engine_name=engine, level=level, info_text=info_text)


def update_charon_with_local_jobs_status(config=None, config_file_path=None):
    """Check the status of all locally-tracked jobs and update Charon accordingly.

    Every SampleAnalysis row in the local tracking database is inspected:

    * exit code 0      -> Charon status "ANALYZED", local entry deleted,
                          coverage for the sample's seqruns is updated
    * exit code > 0    -> Charon status "FAILED", local entry deleted
    * anything else    -> the job is either still running or died without
                          writing an exit code; SLURM / the process table
                          is consulted to tell the two apart

    :param dict config: The parsed configuration file (optional); only the
                        'quiet' key is read here, to suppress e-mails
    :param str config_file_path: The path to the configuration file
                                 (optional; not used directly in this body)
    """
    # A missing config means "not quiet" rather than an AttributeError.
    quiet = bool(config.get('quiet')) if config else False

    LOG.info("Updating Charon with the status of all locally-tracked jobs...")
    with get_db_session() as session:
        charon_session = CharonSession()
        for sample_entry in session.query(SampleAnalysis).all():
            # Local names
            workflow = sample_entry.workflow
            project_name = sample_entry.project_name
            project_id = sample_entry.project_id
            project_base_path = sample_entry.project_base_path
            sample_id = sample_entry.sample_id
            engine = sample_entry.engine
            # Only one of these will have a value
            slurm_job_id = sample_entry.slurm_job_id
            process_id = sample_entry.process_id
            piper_exit_code = get_exit_code(workflow_name=workflow,
                                            project_base_path=project_base_path,
                                            project_name=project_name,
                                            project_id=project_id,
                                            sample_id=sample_id)
            label = "project/sample {}/{}".format(project_name, sample_id)

            try:
                project_obj = create_project_obj_from_analysis_log(project_name,
                                                                   project_id,
                                                                   project_base_path,
                                                                   sample_id,
                                                                   workflow)
            except IOError as e:
                # The analysis log file is missing; skip this entry
                error_text = ('Could not find analysis log file! Cannot update '
                              'Charon for sample run {}/{}: {}'.format(project_id,
                                                                       sample_id,
                                                                       e))
                LOG.error(error_text)
                _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                   "ERROR", error_text)
                continue

            try:
                # NB: compare against 0 explicitly. 0 is falsy, so a guard of
                # the form "code and code == 0" can never be true.
                if piper_exit_code == 0:
                    # 0 -> Job finished successfully
                    set_status = "ANALYZED"
                    info_text = ('Workflow "{}" for {} finished succesfully. '
                                 'Recording status {} in Charon'.format(workflow,
                                                                        label,
                                                                        set_status))
                    LOG.info(info_text)
                    _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                       "INFO", info_text)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 analysis_status=set_status)
                    recurse_status = "DONE"
                    recurse_status_for_sample(project_obj, recurse_status)
                    # Job is only deleted if the Charon status update succeeds
                    session.delete(sample_entry)
                    # Parse seqrun output results / update Charon
                    # This is a semi-optional step -- failure here will send an
                    # email but not more than once. The record is still removed
                    # from the local jobs database, so this will have to be done
                    # manually if you want it done at all.
                    piper_qc_dir = os.path.join(project_base_path, "ANALYSIS",
                                                project_id, "piper_ngi",
                                                "02_preliminary_alignment_qc")
                    update_coverage_for_sample_seqruns(project_id, sample_id,
                                                       piper_qc_dir)
                elif piper_exit_code is not None and piper_exit_code > 0:
                    # 1 -> Job failed
                    set_status = "FAILED"
                    error_text = ('Workflow "{}" for {} failed. Recording status '
                                  '{} in Charon.'.format(workflow, label,
                                                         set_status))
                    LOG.error(error_text)
                    _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                       "ERROR", error_text)
                    charon_session.sample_update(projectid=project_id,
                                                 sampleid=sample_id,
                                                 analysis_status=set_status)
                    recurse_status_for_sample(project_obj, set_status)
                    # Job is only deleted if the Charon update succeeds
                    session.delete(sample_entry)
                else:
                    # None -> Job still running OR exit code was never written
                    # (failure)
                    job_failed = False
                    if slurm_job_id:
                        try:
                            slurm_exit_code = get_slurm_job_status(slurm_job_id)
                        except ValueError:
                            # A lookup error is treated like a failed job
                            slurm_exit_code = 1
                        # "None" indicates the job is still running
                        if slurm_exit_code is not None:
                            job_failed = True
                    elif not psutil.pid_exists(process_id):
                        # Job did not write an exit code and is also not running
                        job_failed = True

                    if job_failed:
                        set_status = "FAILED"
                        error_text = ('No exit code found but job not running for '
                                      '{}: setting status to {} in Charon'.format(
                                          label, set_status))
                        LOG.error(error_text)
                        _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                           "ERROR", error_text)
                        charon_session.sample_update(projectid=project_id,
                                                     sampleid=sample_id,
                                                     analysis_status=set_status)
                        recurse_status_for_sample(project_obj, set_status)
                        # Job is only deleted if the Charon update succeeds
                        LOG.debug("Deleting local entry {}".format(sample_entry))
                        session.delete(sample_entry)
                    else:
                        # Job still running
                        charon_status = charon_session.sample_get(
                            projectid=project_id,
                            sampleid=sample_id)['analysis_status']
                        if not charon_status == "UNDER_ANALYSIS":
                            set_status = "UNDER_ANALYSIS"
                            LOG.warn('Tracking inconsistency for {}: Charon status is "{}" but '
                                     'local process tracking database indicates it is running. '
                                     'Setting value in Charon to {}.'.format(label,
                                                                             charon_status,
                                                                             set_status))
                            charon_session.sample_update(projectid=project_id,
                                                         sampleid=sample_id,
                                                         analysis_status=set_status)
                            recurse_status_for_sample(project_obj, "RUNNING")
            except CharonError as e:
                error_text = ('Unable to update Charon status for "{}": {}'.format(label, e))
                LOG.error(error_text)
                _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                   "ERROR", error_text)
            except OSError as e:
                error_text = ('Permissions error when trying to update Charon '
                              'status for "{}": {}'.format(label, e))
                LOG.error(error_text)
                _mail_unless_quiet(quiet, project_name, sample_id, engine,
                                   "ERROR", error_text)
        session.commit()
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, config=None, config_file_path=None):
    """Analyze data at the sample level.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to analyzed
    :param str exec_mode: "sbatch" or "local" (case-insensitive)
    :param bool restart_finished_jobs: forwarded to the pre-existing-run check
                                       and to the sbatch submission
    :param bool restart_running_jobs: forwarded to the pre-existing-run check
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises ValueError: If exec_mode is an unsupported value
    :raises RuntimeError: If the pre-existing-run check refuses the sample
    """
    # Validate the cheap argument first so a bad call fails before any run
    # state is inspected, and normalise the case once so the later comparisons
    # agree with this check.
    exec_mode_lc = exec_mode.lower()
    if exec_mode_lc not in ("sbatch", "local"):
        raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                         'value was "{}"'.format(exec_mode))
    exec_mode = exec_mode_lc

    try:
        check_for_preexisting_sample_runs(project, sample, restart_running_jobs,
                                          restart_finished_jobs)
    except RuntimeError as e:
        # may want to process anyway.
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))

    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(sample, project))
    for workflow_subtask in workflows.get_subtasks_for_level(level="sample"):
        if is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                            project_id=project.project_id,
                                            sample_id=sample.name):
            # Already tracked as running locally; nothing to launch
            continue
        try:
            log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                 project_base_path=project.base_path,
                                                 project_name=project.dirname,
                                                 project_id=project.project_id,
                                                 sample_id=sample.name)
            rotate_file(log_file_path)
            exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                        project_base_path=project.base_path,
                                                        project_name=project.dirname,
                                                        project_id=project.project_id,
                                                        sample_id=sample.name)
            setup_xml_cl, setup_xml_path = build_setup_xml(project=project,
                                                           sample=sample,
                                                           local_scratch_mode=(exec_mode == "sbatch"),
                                                           config=config)
            piper_cl = build_piper_cl(project=project,
                                      workflow_name=workflow_subtask,
                                      setup_xml_path=setup_xml_path,
                                      exit_code_path=exit_code_path,
                                      config=config,
                                      exec_mode=exec_mode)
            remove_previous_sample_analyses(project)

            if exec_mode == "sbatch":
                process_id = None
                slurm_job_id = sbatch_piper_sample([setup_xml_cl, piper_cl],
                                                   workflow_subtask,
                                                   project, sample,
                                                   restart_finished_jobs=restart_finished_jobs)
                # Time delay to let sbatch get its act together
                # (takes a few seconds to be visible with sacct)
                for _ in range(10):
                    try:
                        get_slurm_job_status(slurm_job_id)
                        break
                    except ValueError:
                        time.sleep(2)
                else:
                    # Never became visible; logged, but still recorded below
                    LOG.error('sbatch file for sample {}/{} did not '
                              'queue properly! Job ID {} cannot be '
                              'found.'.format(project, sample, slurm_job_id))
            else:
                ## FIXME Now this is broken again
                raise NotImplementedError("Sorry dude it's a no-go")
                # Unreachable until local execution is repaired; kept so the
                # intended launch sequence is not lost.
                slurm_job_id = None
                launch_piper_job(setup_xml_cl, project)
                process_handle = launch_piper_job(piper_cl, project)
                process_id = process_handle.pid

            try:
                record_process_sample(project=project,
                                      sample=sample,
                                      analysis_module_name="piper_ngi",
                                      slurm_job_id=slurm_job_id,
                                      process_id=process_id,
                                      workflow_subtask=workflow_subtask)
            except RuntimeError:
                LOG.error('Could not record process for project/sample '
                          '{}/{}, workflow {}'.format(project, sample,
                                                      workflow_subtask))
                ## Question: should we just kill the run in this case or let it go?
                continue
        except (NotImplementedError, RuntimeError, ValueError) as e:
            # A failure in one subtask is logged; remaining subtasks still run
            error_msg = ('Processing project "{}" / sample "{}" failed: '
                         '{}'.format(project, sample, repr(e)))
            LOG.error(error_msg)
# NOTE(review): this file defines analyze() twice with the same logic; this
# later definition is the one bound at import time. Consider removing one.
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, config=None, config_file_path=None):
    """Analyze data at the sample level.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to analyzed
    :param str exec_mode: "sbatch" or "local" (case-insensitive)
    :param bool restart_finished_jobs: forwarded to the pre-existing-run check
                                       and to the sbatch submission
    :param bool restart_running_jobs: forwarded to the pre-existing-run check
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises ValueError: If exec_mode is an unsupported value
    :raises RuntimeError: If the pre-existing-run check refuses the sample
    """
    # Reject an unsupported mode up front, before any run state is inspected.
    # The lower-cased value is used from here on so that the comparisons
    # further down match this case-insensitive check.
    requested_mode = exec_mode
    exec_mode = exec_mode.lower()
    if exec_mode not in ("sbatch", "local"):
        raise ValueError(
            '"exec_mode" param must be one of "sbatch" or "local" '
            'value was "{}"'.format(requested_mode))

    try:
        check_for_preexisting_sample_runs(project, sample,
                                          restart_running_jobs,
                                          restart_finished_jobs)
    except RuntimeError as e:
        # may want to process anyway.
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))

    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(
        sample, project))
    for workflow_subtask in workflows.get_subtasks_for_level(level="sample"):
        already_running = is_sample_analysis_running_local(
            workflow_subtask=workflow_subtask,
            project_id=project.project_id,
            sample_id=sample.name)
        if already_running:
            continue
        try:
            log_file_path = create_log_file_path(
                workflow_subtask=workflow_subtask,
                project_base_path=project.base_path,
                project_name=project.dirname,
                project_id=project.project_id,
                sample_id=sample.name)
            rotate_file(log_file_path)
            exit_code_path = create_exit_code_file_path(
                workflow_subtask=workflow_subtask,
                project_base_path=project.base_path,
                project_name=project.dirname,
                project_id=project.project_id,
                sample_id=sample.name)
            setup_xml_cl, setup_xml_path = build_setup_xml(
                project=project,
                sample=sample,
                local_scratch_mode=(exec_mode == "sbatch"),
                config=config)
            piper_cl = build_piper_cl(project=project,
                                      workflow_name=workflow_subtask,
                                      setup_xml_path=setup_xml_path,
                                      exit_code_path=exit_code_path,
                                      config=config,
                                      exec_mode=exec_mode)
            remove_previous_sample_analyses(project)

            if exec_mode == "sbatch":
                process_id = None
                slurm_job_id = sbatch_piper_sample(
                    [setup_xml_cl, piper_cl],
                    workflow_subtask,
                    project,
                    sample,
                    restart_finished_jobs=restart_finished_jobs)
                # Time delay to let sbatch get its act together
                # (takes a few seconds to be visible with sacct)
                for _ in range(10):
                    try:
                        get_slurm_job_status(slurm_job_id)
                        break
                    except ValueError:
                        time.sleep(2)
                else:
                    # Job never showed up; report it but keep going so the
                    # submission is still recorded below
                    LOG.error('sbatch file for sample {}/{} did not '
                              'queue properly! Job ID {} cannot be '
                              'found.'.format(project, sample, slurm_job_id))
            else:
                ## FIXME Now this is broken again
                raise NotImplementedError("Sorry dude it's a no-go")
                # Unreachable while local execution is disabled; retained to
                # document the intended launch sequence.
                slurm_job_id = None
                launch_piper_job(setup_xml_cl, project)
                process_handle = launch_piper_job(piper_cl, project)
                process_id = process_handle.pid

            try:
                record_process_sample(project=project,
                                      sample=sample,
                                      analysis_module_name="piper_ngi",
                                      slurm_job_id=slurm_job_id,
                                      process_id=process_id,
                                      workflow_subtask=workflow_subtask)
            except RuntimeError:
                LOG.error('Could not record process for project/sample '
                          '{}/{}, workflow {}'.format(
                              project, sample, workflow_subtask))
                ## Question: should we just kill the run in this case or let it go?
                continue
        except (NotImplementedError, RuntimeError, ValueError) as e:
            # Log and move on to the next subtask rather than aborting
            error_msg = ('Processing project "{}" / sample "{}" failed: '
                         '{}'.format(project, sample, repr(e)))
            LOG.error(error_msg)