def analyze_seqrun(project, sample, libprep, seqrun, config=None, config_file_path=None):
    """Analyze data at the sequencing run (individual fastq) level.

    Loads the required environment modules, then for every seqrun-level
    workflow subtask that is not already running locally: rotates the log,
    builds the setup XML and piper command line, launches the piper job as a
    detached process, and records the process in the local tracking database.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to be analyzed
    :param NGILibraryPrep libprep: the library prep to be analyzed
    :param NGISeqrun seqrun: the sequencing run to be analyzed
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    for workflow_subtask in get_subtasks_for_level(level="seqrun"):
        # Skip subtasks that the local jobs database says are already running.
        if not is_seqrun_analysis_running_local(workflow_subtask=workflow_subtask,
                                                project_id=project.project_id,
                                                sample_id=sample.name,
                                                libprep_id=libprep.name,
                                                seqrun_id=seqrun.name):
            try:
                ## Temporarily logging to a file until we get ELK set up
                log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                     project_base_path=project.base_path,
                                                     project_name=project.name,
                                                     sample_id=sample.name,
                                                     libprep_id=libprep.name,
                                                     seqrun_id=seqrun.name)
                rotate_log(log_file_path)
                # Store the exit code of detached processes
                exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                            project_base_path=project.base_path,
                                                            project_name=project.name,
                                                            sample_id=sample.name,
                                                            libprep_id=libprep.name,
                                                            seqrun_id=seqrun.name)
                build_setup_xml(project, config, sample, libprep.name, seqrun.name)
                command_line = build_piper_cl(project, workflow_subtask, exit_code_path, config)
                p_handle = launch_piper_job(command_line, project, log_file_path)
                try:
                    # Track the detached process so later runs can detect it.
                    record_process_seqrun(project=project, sample=sample,
                                          libprep=libprep, seqrun=seqrun,
                                          workflow_subtask=workflow_subtask,
                                          analysis_module_name="piper_ngi",
                                          analysis_dir=project.analysis_dir,
                                          pid=p_handle.pid)
                except CharonError as e:
                    ## This is a problem. If the job isn't recorded, we won't
                    ## ever know that it has been run and its results will be ignored.
                    ## I think? Or no I guess if it's relaunched then the results
                    ## will be there. But we will have multiple processes running.
                    ## FIXME fix this
                    LOG.error("<Could not record ...>")
                    continue
            except (NotImplementedError, RuntimeError) as e:
                error_msg = ('Processing project "{}" / sample "{}" / libprep "{}" / '
                             'seqrun "{}" failed: {}'.format(project, sample,
                                                             libprep, seqrun,
                                                             e.__repr__()))
                LOG.error(error_msg)
def analyze_sample(project, sample, config=None, config_file_path=None):
    """Analyze data at the sample level.

    Queries Charon for the sample's total autosomal coverage and, if it
    exceeds the threshold checked below, launches each sample-level workflow
    subtask as a detached piper job and records the process.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to be analyzed
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    """
    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    charon_session = CharonSession()
    # Determine if we can begin sample-level processing yet.
    # Condition is that the total autosomal coverage exceeds 28.4X, per the
    # comparison below. NOTE(review): an earlier comment stated 28.9X while
    # the code checks > 28.4 -- confirm which threshold is intended.
    # If these conditions become more complex we can create a function for this
    sample_total_autosomal_coverage = charon_session.sample_get(
        project.project_id, sample.name).get('total_autosomal_coverage')
    if sample_total_autosomal_coverage > 28.4:
        LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(sample, project))
        for workflow_subtask in get_subtasks_for_level(level="sample"):
            # Skip subtasks the local jobs database says are already running.
            if not is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                                    project_id=project.project_id,
                                                    sample_id=sample.name):
                try:
                    ## Temporarily logging to a file until we get ELK set up
                    log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                         project_base_path=project.base_path,
                                                         project_name=project.name,
                                                         sample_id=sample.name)
                    rotate_log(log_file_path)
                    # Store the exit code of detached processes
                    exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                                project_base_path=project.base_path,
                                                                project_name=project.name,
                                                                sample_id=sample.name)
                    build_setup_xml(project, config, sample)
                    command_line = build_piper_cl(project, workflow_subtask, exit_code_path, config)
                    p_handle = launch_piper_job(command_line, project, log_file_path)
                    try:
                        # Track the detached process so later runs can detect it.
                        record_process_sample(project=project, sample=sample,
                                              workflow_subtask=workflow_subtask,
                                              analysis_module_name="piper_ngi",
                                              analysis_dir=project.analysis_dir,
                                              pid=p_handle.pid)
                    except RuntimeError as e:
                        LOG.error(e)
                        continue
                except (NotImplementedError, RuntimeError) as e:
                    error_msg = ('Processing project "{}" / sample "{}" failed: '
                                 '{}'.format(project, sample, e.__repr__()))
                    LOG.error(error_msg)
    else:
        LOG.info('Sample "{}" in project "{}" is not yet ready for '
                 'processing.'.format(sample, project))
def test_create_log_file_path(self):
    """create_log_file_path should compose the piper_ngi log path from the fixture IDs."""
    expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask.log'.format(
        self.tmp_dir)
    actual_path = utils.create_log_file_path(self.workflow_subtask,
                                             self.project_base_path,
                                             self.project_name,
                                             self.project_id,
                                             sample_id=self.sample_id,
                                             libprep_id=self.libprep_id,
                                             seqrun_id=self.seqrun_id)
    self.assertEqual(actual_path, expected_path)
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, config=None, config_file_path=None):
    """Analyze data at the sample level.

    Validates preconditions, builds the setup XML and piper command line for
    each sample-level workflow subtask, submits the job via sbatch (local
    execution is not implemented), and records the process.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to be analyzed
    :param str exec_mode: "sbatch" or "local"
    :param bool restart_finished_jobs: Restart jobs that are already done
    :param bool restart_running_jobs: Restart currently-running jobs
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises RuntimeError: If preexisting sample runs abort processing
    :raises ValueError: If exec_mode is an unsupported value
    """
    try:
        check_for_preexisting_sample_runs(project, sample, restart_running_jobs,
                                          restart_finished_jobs)
    except RuntimeError as e:
        # may want to process anyway.
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))
    if exec_mode.lower() not in ("sbatch", "local"):
        # BUGFIX: the two literals were previously written as ('...')('...'),
        # which *calls* a str object and raises TypeError instead of the
        # intended ValueError. Implicit string-literal concatenation fixes it.
        raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                         'value was "{}"'.format(exec_mode))
    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(sample, project))
    for workflow_subtask in workflows.get_subtasks_for_level(level="sample"):
        # Skip subtasks the local jobs database says are already running.
        if not is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                                project_id=project.project_id,
                                                sample_id=sample.name):
            try:
                log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                     project_base_path=project.base_path,
                                                     project_name=project.dirname,
                                                     project_id=project.project_id,
                                                     sample_id=sample.name)
                rotate_file(log_file_path)
                # Detached processes report their status via this exit-code file.
                exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                            project_base_path=project.base_path,
                                                            project_name=project.dirname,
                                                            project_id=project.project_id,
                                                            sample_id=sample.name)
                setup_xml_cl, setup_xml_path = build_setup_xml(
                    project=project,
                    sample=sample,
                    local_scratch_mode=(exec_mode == "sbatch"),
                    config=config)
                piper_cl = build_piper_cl(project=project,
                                          workflow_name=workflow_subtask,
                                          setup_xml_path=setup_xml_path,
                                          exit_code_path=exit_code_path,
                                          config=config,
                                          exec_mode=exec_mode)
                remove_previous_sample_analyses(project)
                if exec_mode == "sbatch":
                    process_id = None
                    slurm_job_id = sbatch_piper_sample([setup_xml_cl, piper_cl],
                                                       workflow_subtask,
                                                       project, sample,
                                                       restart_finished_jobs=restart_finished_jobs)
                    # Time delay to let sbatch get its act together
                    # (takes a few seconds to be visible with sacct)
                    for x in xrange(10):
                        try:
                            get_slurm_job_status(slurm_job_id)
                            break
                        except ValueError:
                            time.sleep(2)
                    else:
                        LOG.error('sbatch file for sample {}/{} did not '
                                  'queue properly! Job ID {} cannot be '
                                  'found.'.format(project, sample, slurm_job_id))
                else:
                    ## FIXME Now this is broken again
                    raise NotImplementedError("Sorry dude it's a no-go")
                    # NOTE(review): the statements below are unreachable after
                    # the raise; kept pending a decision on local execution.
                    slurm_job_id = None
                    launch_piper_job(setup_xml_cl, project)
                    process_handle = launch_piper_job(piper_cl, project)
                    process_id = process_handle.pid
                try:
                    record_process_sample(project=project,
                                          sample=sample,
                                          analysis_module_name="piper_ngi",
                                          slurm_job_id=slurm_job_id,
                                          process_id=process_id,
                                          workflow_subtask=workflow_subtask)
                except RuntimeError as e:
                    LOG.error('Could not record process for project/sample '
                              '{}/{}, workflow {}'.format(project, sample,
                                                          workflow_subtask))
                    ## Question: should we just kill the run in this case or let it go?
                    continue
            except (NotImplementedError, RuntimeError, ValueError) as e:
                error_msg = ('Processing project "{}" / sample "{}" failed: '
                             '{}'.format(project, sample, e.__repr__()))
                LOG.error(error_msg)
def analyze(analysis_object, level='sample', config=None, config_file_path=None):
    """Analyze data at the sample level.

    Iterates over each sample in the analysis object's project: checks Charon
    analysis status, validates preconditions, optionally cleans out previous
    results, builds the setup XML and piper command line per workflow subtask,
    submits via sbatch (local execution raises NotImplementedError), and
    records the launched process.

    :param NGIAnalysis analysis_object: holds all the parameters for the analysis
    :param str level: analysis level, "sample" or "genotype"
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises ValueError: If exec_mode is an unsupported value
    :raises RuntimeError: If preexisting sample runs abort processing
    """
    charon_session = CharonSession()
    for sample in analysis_object.project:
        try:
            charon_reported_status = charon_session.sample_get(
                analysis_object.project.project_id,
                sample).get('analysis_status')
            # Check Charon to ensure this hasn't already been processed
            do_analyze = handle_sample_status(analysis_object, sample,
                                              charon_reported_status)
            if not do_analyze:
                continue
        except CharonError as e:
            LOG.error(e)
            continue
        # Map the analysis level to the Charon status field it gates on.
        if level == "sample":
            status_field = "alignment_status"
        elif level == "genotype":
            status_field = "genotype_status"
        else:
            LOG.warn('Unknown workflow level: "{}"'.format(level))
            status_field = "alignment_status"  # Or should we abort?
        try:
            check_for_preexisting_sample_runs(
                analysis_object.project, sample,
                analysis_object.restart_running_jobs,
                analysis_object.restart_finished_jobs,
                status_field)
        except RuntimeError as e:
            raise RuntimeError(
                'Aborting processing of project/sample "{}/{}": '
                '{}'.format(analysis_object.project, sample, e))
        if analysis_object.exec_mode.lower() not in ("sbatch", "local"):
            raise ValueError(
                '"exec_mode" param must be one of "sbatch" or "local" '
                'value was "{}"'.format(analysis_object.exec_mode))
        if analysis_object.exec_mode == "local":
            modules_to_load = analysis_object.config.get("piper", {}).get(
                "load_modules", [])
            load_modules(modules_to_load)
        for workflow_subtask in workflows.get_subtasks_for_level(level=level):
            if level == "genotype":
                genotype_status = None  # Some records in Charon lack this field, I'm guessing
                try:
                    charon_session = CharonSession()
                    genotype_status = charon_session.sample_get(
                        projectid=analysis_object.project.project_id,
                        sampleid=sample.name).get("genotype_status")
                except CharonError as e:
                    LOG.error(
                        'Couldn\'t determine genotyping status for project/'
                        'sample "{}/{}"; skipping analysis.'.format(
                            analysis_object.project, sample))
                    continue
                # Skip samples already genotyped unless a restart is forced.
                if find_previous_genotype_analyses(
                        analysis_object.project, sample) or genotype_status == "DONE":
                    if not analysis_object.restart_finished_jobs:
                        LOG.info(
                            'Project/sample "{}/{}" has completed genotype '
                            'analysis previously; skipping (use flag to force '
                            'analysis)'.format(analysis_object.project, sample))
                        continue
            if analysis_object.restart_running_jobs:
                # Kill currently-running jobs if they exist
                kill_running_sample_analysis(
                    workflow_subtask=workflow_subtask,
                    project_id=analysis_object.project.project_id,
                    sample_id=sample.name)
            # This checks the local jobs database
            if not is_sample_analysis_running_local(
                    workflow_subtask=workflow_subtask,
                    project_id=analysis_object.project.project_id,
                    sample_id=sample.name):
                LOG.info('Launching "{}" analysis for sample "{}" in project '
                         '"{}"'.format(workflow_subtask, sample,
                                       analysis_object.project))
                try:
                    log_file_path = create_log_file_path(
                        workflow_subtask=workflow_subtask,
                        project_base_path=analysis_object.project.base_path,
                        project_name=analysis_object.project.dirname,
                        project_id=analysis_object.project.project_id,
                        sample_id=sample.name)
                    rotate_file(log_file_path)
                    # Detached processes report status via this exit-code file.
                    exit_code_path = create_exit_code_file_path(
                        workflow_subtask=workflow_subtask,
                        project_base_path=analysis_object.project.base_path,
                        project_name=analysis_object.project.dirname,
                        project_id=analysis_object.project.project_id,
                        sample_id=sample.name)
                    # Clear out stale results unless told to keep them;
                    # default_files_to_copy is reassigned below in either case.
                    if level == "sample":
                        if not analysis_object.keep_existing_data:
                            remove_previous_sample_analyses(
                                analysis_object.project, sample)
                            default_files_to_copy = None
                    elif level == "genotype":
                        if not analysis_object.keep_existing_data:
                            remove_previous_genotype_analyses(
                                analysis_object.project)
                            default_files_to_copy = None
                    # Update the project to keep only valid fastq files for setup.xml creation
                    if level == "genotype":
                        updated_project, default_files_to_copy = \
                            collect_files_for_sample_analysis(analysis_object.project,
                                                              sample,
                                                              restart_finished_jobs=True,
                                                              status_field="genotype_status")
                    else:
                        updated_project, default_files_to_copy = \
                            collect_files_for_sample_analysis(analysis_object.project,
                                                              sample,
                                                              analysis_object.restart_finished_jobs,
                                                              status_field="alignment_status")
                    setup_xml_cl, setup_xml_path = build_setup_xml(
                        project=updated_project,
                        sample=sample,
                        workflow=workflow_subtask,
                        local_scratch_mode=(
                            analysis_object.exec_mode == "sbatch"),
                        config=analysis_object.config)
                    piper_cl = build_piper_cl(
                        project=analysis_object.project,
                        workflow_name=workflow_subtask,
                        setup_xml_path=setup_xml_path,
                        exit_code_path=exit_code_path,
                        config=analysis_object.config,
                        exec_mode=analysis_object.exec_mode,
                        generate_bqsr_bam=analysis_object.generate_bqsr_bam)
                    if analysis_object.exec_mode == "sbatch":
                        process_id = None
                        slurm_job_id = sbatch_piper_sample(
                            [setup_xml_cl, piper_cl],
                            workflow_subtask,
                            analysis_object.project, sample,
                            restart_finished_jobs=analysis_object.
                            restart_finished_jobs,
                            files_to_copy=default_files_to_copy)
                        for x in xrange(10):
                            # Time delay to let sbatch get its act together
                            # (takes a few seconds to be visible with sacct)
                            try:
                                get_slurm_job_status(slurm_job_id)
                                break
                            except ValueError:
                                time.sleep(2)
                        else:
                            LOG.error('sbatch file for sample {}/{} did not '
                                      'queue properly! Job ID {} cannot be '
                                      'found.'.format(analysis_object.project,
                                                      sample, slurm_job_id))
                    else:  # "local"
                        raise NotImplementedError(
                            'Local execution not currently implemented. '
                            'I\'m sure Denis can help you with this.')
                        #slurm_job_id = None
                        #launch_piper_job(setup_xml_cl, project)
                        #process_handle = launch_piper_job(piper_cl, project)
                        #process_id = process_handle.pid
                    try:
                        # Track the launched job so later runs can detect it.
                        record_process_sample(
                            project=analysis_object.project,
                            sample=sample,
                            analysis_module_name="piper_ngi",
                            slurm_job_id=slurm_job_id,
                            process_id=process_id,
                            workflow_subtask=workflow_subtask)
                    except RuntimeError as e:
                        LOG.error(e)
                        ## Question: should we just kill the run in this case or let it go?
                        continue
                except (NotImplementedError, RuntimeError, ValueError) as e:
                    error_msg = (
                        'Processing project "{}" / sample "{}" / workflow "{}" '
                        'failed: {}'.format(analysis_object.project, sample,
                                            workflow_subtask, e))
                    LOG.error(error_msg)
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, keep_existing_data=False, level="sample",
            genotype_file=None, config=None, config_file_path=None,
            generate_bqsr_bam=False):
    """Analyze data at the sample level.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to be analyzed
    :param str exec_mode: "sbatch" or "local" (local not implemented)
    :param bool restart_finished_jobs: Restart jobs that are already done (have a .done file)
    :param bool restart_running_jobs: Kill and restart currently-running jobs
    :param bool keep_existing_data: If False, remove previous analysis results before launching
    :param str level: The level on which to perform the analysis ("sample" or "genotype")
    :param str genotype_file: The path to the genotype file (only relevant for genotype analysis)
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)
    :param bool generate_bqsr_bam: Passed through to build_piper_cl

    :raises ValueError: If exec_mode is an unsupported value
    :raises RuntimeError: If preexisting sample runs abort processing
    """
    # Map the analysis level to the Charon status field it gates on.
    if level == "sample":
        status_field = "alignment_status"
    elif level == "genotype":
        status_field = "genotype_status"
    else:
        LOG.warn('Unknown workflow level: "{}"'.format(level))
        status_field = "alignment_status"  # Or should we abort?
    try:
        check_for_preexisting_sample_runs(project, sample, restart_running_jobs,
                                          restart_finished_jobs, status_field)
    except RuntimeError as e:
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))
    if exec_mode.lower() not in ("sbatch", "local"):
        raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                         'value was "{}"'.format(exec_mode))
    if exec_mode == "local":
        modules_to_load = config.get("piper", {}).get("load_modules", [])
        load_modules(modules_to_load)
    for workflow_subtask in workflows.get_subtasks_for_level(level=level):
        if level == "genotype":
            genotype_status = None  # Some records in Charon lack this field, I'm guessing
            try:
                charon_session = CharonSession()
                genotype_status = charon_session.sample_get(
                    projectid=project.project_id,
                    sampleid=sample.name).get("genotype_status")
            except CharonError as e:
                LOG.error('Couldn\'t determine genotyping status for project/'
                          'sample "{}/{}"; skipping analysis.'.format(project, sample))
                continue
            # Skip samples already genotyped unless a restart is forced.
            if find_previous_genotype_analyses(project, sample) or genotype_status == "DONE":
                if not restart_finished_jobs:
                    LOG.info('Project/sample "{}/{}" has completed genotype '
                             'analysis previously; skipping (use flag to force '
                             'analysis)'.format(project, sample))
                    continue
        if restart_running_jobs:
            # Kill currently-running jobs if they exist
            kill_running_sample_analysis(workflow_subtask=workflow_subtask,
                                         project_id=project.project_id,
                                         sample_id=sample.name)
        # This checks the local jobs database
        if not is_sample_analysis_running_local(workflow_subtask=workflow_subtask,
                                                project_id=project.project_id,
                                                sample_id=sample.name):
            LOG.info('Launching "{}" analysis for sample "{}" in project '
                     '"{}"'.format(workflow_subtask, sample, project))
            try:
                log_file_path = create_log_file_path(workflow_subtask=workflow_subtask,
                                                     project_base_path=project.base_path,
                                                     project_name=project.dirname,
                                                     project_id=project.project_id,
                                                     sample_id=sample.name)
                rotate_file(log_file_path)
                # Detached processes report status via this exit-code file.
                exit_code_path = create_exit_code_file_path(workflow_subtask=workflow_subtask,
                                                            project_base_path=project.base_path,
                                                            project_name=project.dirname,
                                                            project_id=project.project_id,
                                                            sample_id=sample.name)
                # Clear out stale results unless told to keep them;
                # default_files_to_copy is reassigned below in either case.
                if level == "sample":
                    if not keep_existing_data:
                        remove_previous_sample_analyses(project, sample)
                        default_files_to_copy = None
                elif level == "genotype":
                    if not keep_existing_data:
                        remove_previous_genotype_analyses(project)
                        default_files_to_copy = None
                # Update the project to keep only valid fastq files for setup.xml creation
                if level == "genotype":
                    updated_project, default_files_to_copy = \
                        collect_files_for_sample_analysis(project, sample,
                                                          restart_finished_jobs=True,
                                                          status_field="genotype_status")
                else:
                    updated_project, default_files_to_copy = \
                        collect_files_for_sample_analysis(project, sample,
                                                          restart_finished_jobs,
                                                          status_field="alignment_status")
                setup_xml_cl, setup_xml_path = build_setup_xml(project=updated_project,
                                                               sample=sample,
                                                               workflow=workflow_subtask,
                                                               local_scratch_mode=(exec_mode == "sbatch"),
                                                               config=config)
                piper_cl = build_piper_cl(project=project,
                                          workflow_name=workflow_subtask,
                                          setup_xml_path=setup_xml_path,
                                          exit_code_path=exit_code_path,
                                          config=config,
                                          exec_mode=exec_mode,
                                          generate_bqsr_bam=generate_bqsr_bam)
                if exec_mode == "sbatch":
                    process_id = None
                    slurm_job_id = sbatch_piper_sample([setup_xml_cl, piper_cl],
                                                       workflow_subtask, project, sample,
                                                       restart_finished_jobs=restart_finished_jobs,
                                                       files_to_copy=default_files_to_copy)
                    for x in xrange(10):
                        # Time delay to let sbatch get its act together
                        # (takes a few seconds to be visible with sacct)
                        try:
                            get_slurm_job_status(slurm_job_id)
                            break
                        except ValueError:
                            time.sleep(2)
                    else:
                        LOG.error('sbatch file for sample {}/{} did not '
                                  'queue properly! Job ID {} cannot be '
                                  'found.'.format(project, sample, slurm_job_id))
                else:  # "local"
                    raise NotImplementedError('Local execution not currently implemented. '
                                              'I\'m sure Denis can help you with this.')
                    #slurm_job_id = None
                    #launch_piper_job(setup_xml_cl, project)
                    #process_handle = launch_piper_job(piper_cl, project)
                    #process_id = process_handle.pid
                try:
                    # Track the launched job so later runs can detect it.
                    record_process_sample(project=project, sample=sample,
                                          analysis_module_name="piper_ngi",
                                          slurm_job_id=slurm_job_id,
                                          process_id=process_id,
                                          workflow_subtask=workflow_subtask)
                except RuntimeError as e:
                    LOG.error(e)
                    ## Question: should we just kill the run in this case or let it go?
                    continue
            except (NotImplementedError, RuntimeError, ValueError) as e:
                error_msg = ('Processing project "{}" / sample "{}" / workflow "{}" '
                             'failed: {}'.format(project, sample, workflow_subtask, e))
                LOG.error(error_msg)
def analyze(project, sample, exec_mode="sbatch", restart_finished_jobs=False,
            restart_running_jobs=False, config=None, config_file_path=None):
    """Analyze data at the sample level.

    Validates preconditions, builds the setup XML and piper command line for
    each sample-level workflow subtask, submits the job via sbatch (local
    execution is not implemented), and records the process.

    :param NGIProject project: the project to analyze
    :param NGISample sample: the sample to be analyzed
    :param str exec_mode: "sbatch" or "local"
    :param bool restart_finished_jobs: Restart jobs that are already done
    :param bool restart_running_jobs: Restart currently-running jobs
    :param dict config: The parsed configuration file (optional)
    :param str config_file_path: The path to the configuration file (optional)

    :raises RuntimeError: If preexisting sample runs abort processing
    :raises ValueError: If exec_mode is an unsupported value
    """
    try:
        check_for_preexisting_sample_runs(project, sample, restart_running_jobs,
                                          restart_finished_jobs)
    except RuntimeError as e:
        # may want to process anyway.
        raise RuntimeError('Aborting processing of project/sample "{}/{}": '
                           '{}'.format(project, sample, e))
    if exec_mode.lower() not in ("sbatch", "local"):
        # BUGFIX: the two literals were previously written as ('...')('...'),
        # which *calls* a str object and raises TypeError instead of the
        # intended ValueError. Implicit string-literal concatenation fixes it.
        raise ValueError('"exec_mode" param must be one of "sbatch" or "local" '
                         'value was "{}"'.format(exec_mode))
    modules_to_load = ["java/sun_jdk1.7.0_25", "R/2.15.0"]
    load_modules(modules_to_load)
    LOG.info('Sample "{}" in project "{}" is ready for processing.'.format(
        sample, project))
    for workflow_subtask in workflows.get_subtasks_for_level(level="sample"):
        # Skip subtasks the local jobs database says are already running.
        if not is_sample_analysis_running_local(
                workflow_subtask=workflow_subtask,
                project_id=project.project_id,
                sample_id=sample.name):
            try:
                log_file_path = create_log_file_path(
                    workflow_subtask=workflow_subtask,
                    project_base_path=project.base_path,
                    project_name=project.dirname,
                    project_id=project.project_id,
                    sample_id=sample.name)
                rotate_file(log_file_path)
                # Detached processes report their status via this exit-code file.
                exit_code_path = create_exit_code_file_path(
                    workflow_subtask=workflow_subtask,
                    project_base_path=project.base_path,
                    project_name=project.dirname,
                    project_id=project.project_id,
                    sample_id=sample.name)
                setup_xml_cl, setup_xml_path = build_setup_xml(
                    project=project,
                    sample=sample,
                    local_scratch_mode=(exec_mode == "sbatch"),
                    config=config)
                piper_cl = build_piper_cl(project=project,
                                          workflow_name=workflow_subtask,
                                          setup_xml_path=setup_xml_path,
                                          exit_code_path=exit_code_path,
                                          config=config,
                                          exec_mode=exec_mode)
                remove_previous_sample_analyses(project)
                if exec_mode == "sbatch":
                    process_id = None
                    slurm_job_id = sbatch_piper_sample(
                        [setup_xml_cl, piper_cl], workflow_subtask, project,
                        sample, restart_finished_jobs=restart_finished_jobs)
                    # Time delay to let sbatch get its act together
                    # (takes a few seconds to be visible with sacct)
                    for x in xrange(10):
                        try:
                            get_slurm_job_status(slurm_job_id)
                            break
                        except ValueError:
                            time.sleep(2)
                    else:
                        LOG.error('sbatch file for sample {}/{} did not '
                                  'queue properly! Job ID {} cannot be '
                                  'found.'.format(project, sample, slurm_job_id))
                else:
                    ## FIXME Now this is broken again
                    raise NotImplementedError("Sorry dude it's a no-go")
                    # NOTE(review): the statements below are unreachable after
                    # the raise; kept pending a decision on local execution.
                    slurm_job_id = None
                    launch_piper_job(setup_xml_cl, project)
                    process_handle = launch_piper_job(piper_cl, project)
                    process_id = process_handle.pid
                try:
                    record_process_sample(project=project, sample=sample,
                                          analysis_module_name="piper_ngi",
                                          slurm_job_id=slurm_job_id,
                                          process_id=process_id,
                                          workflow_subtask=workflow_subtask)
                except RuntimeError as e:
                    LOG.error('Could not record process for project/sample '
                              '{}/{}, workflow {}'.format(
                                  project, sample, workflow_subtask))
                    ## Question: should we just kill the run in this case or let it go?
                    continue
            except (NotImplementedError, RuntimeError, ValueError) as e:
                error_msg = ('Processing project "{}" / sample "{}" failed: '
                             '{}'.format(project, sample, e.__repr__()))
                LOG.error(error_msg)