def recreate_project_from_db(analysis_top_dir, project_name, project_id):
    project_dir = os.path.join(analysis_top_dir, "DATA", project_name)
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=analysis_top_dir)
    charon_session = CharonSession()
    try:
        samples_dict = charon_session.project_get_samples(project_id)["samples"]
    except CharonError as e:
        raise RuntimeError("Could not access samples for project {}: {}".format(project_id, e))
    for sample in samples_dict:
        sample_id = sample.get("sampleid")
        sample_dir = os.path.join(project_dir, sample_id)
        sample_obj = project_obj.add_sample(name=sample_id, dirname=sample_id)
        sample_obj.status = sample.get("status", "unknown")
        try:
            libpreps_dict = charon_session.sample_get_libpreps(project_id, sample_id)["libpreps"]
        except CharonError as e:
            raise RuntimeError("Could not access libpreps for project {} / "
                               "sample {}: {}".format(project_id, sample_id, e))
        for libprep in libpreps_dict:
            libprep_id = libprep.get("libprepid")
            libprep_obj = sample_obj.add_libprep(name=libprep_id, dirname=libprep_id)
            libprep_obj.status = libprep.get("status", "unknown")
            try:
                seqruns_dict = charon_session.libprep_get_seqruns(project_id, sample_id, libprep_id)["seqruns"]
            except CharonError as e:
                raise RuntimeError("Could not access seqruns for project {} / sample {} / "
                                   "libprep {}: {}".format(project_id, sample_id, libprep_id, e))
            for seqrun in seqruns_dict:
                # e.g. 140528_D00415_0049_BC423WACXX
                seqrun_id = seqrun.get("seqrunid")
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_id, dirname=seqrun_id)
                seqrun_obj.status = seqrun.get("status", "unknown")
    return project_obj
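
# Hedged usage sketch for recreate_project_from_db(): the path and identifiers below
# are hypothetical placeholders, and the call assumes a reachable Charon instance
# (CHARON_BASE_URL / CHARON_API_TOKEN configured in the environment).
example_project = recreate_project_from_db(analysis_top_dir="/path/to/analysis_top",  # hypothetical
                                           project_name="Y.Mom_14_01",                # hypothetical
                                           project_id="P123")                         # hypothetical
# The returned NGIProject mirrors the Charon hierarchy
# (project -> samples -> libpreps -> seqruns), each node carrying its Charon status.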
def test_create_charon_entries_from_project(self):
    # Create the NGIObjects
    project_obj = NGIProject(name=self.p_name,
                             dirname=self.p_name,
                             project_id=self.p_id,
                             base_path=self.p_bp)
    sample_obj = project_obj.add_sample(name=self.s_id, dirname=self.s_id)
    libprep_obj = sample_obj.add_libprep(name=self.l_id, dirname=self.l_id)
    seqrun_obj = libprep_obj.add_seqrun(name=self.sr_id, dirname=self.sr_id)
    try:
        # Create them in the db
        create_charon_entries_from_project(project_obj)
    finally:
        charon_session = CharonSession()
        charon_session.project_delete(project_obj.project_id)
def create_project_obj_from_analysis_log(project_name, project_id,
                                         project_base_path, sample_id, workflow):
    """Using the log of seqruns used for a sample analysis, recreate a project
    object with relevant sample, libprep, and seqrun objects.
    """
    analysis_log_filename = "{}-{}-{}.files".format(project_id, sample_id, workflow)
    analysis_log_path = os.path.join(project_base_path, "ANALYSIS", project_id,
                                     "piper_ngi", "logs", analysis_log_filename)
    with open(analysis_log_path, 'r') as f:
        # The log is a plain nested mapping, so SafeLoader is sufficient
        analysis_dict = yaml.load(f, Loader=yaml.SafeLoader)
    project_obj = NGIProject(name=project_name, dirname=project_id,
                             project_id=project_id, base_path=project_base_path)
    sample_obj = project_obj.add_sample(sample_id, sample_id)
    for libprep_name, seqrun_dict in analysis_dict[project_id][sample_id].items():
        libprep_obj = sample_obj.add_libprep(libprep_name, libprep_name)
        for seqrun_name in seqrun_dict.keys():
            libprep_obj.add_seqrun(seqrun_name, seqrun_name)
    return project_obj
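
# For reference, a sketch of the nested mapping the ".files" log above deserializes to.
# The names are hypothetical examples (libprep "A" and flowcell
# "140528_D00415_0049_BC423WACXX" reuse identifiers seen elsewhere in this code); the
# nesting is project id -> sample id -> libprep -> seqrun, matching the loop above.
example_analysis_dict = {
    "P123": {                                          # project id
        "P123_1001": {                                 # sample id
            "A": {                                     # libprep
                "140528_D00415_0049_BC423WACXX": {},   # seqrun
            },
        },
    },
}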
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False, config=None, config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict config: The parsed configuration file.
    :param dict projects_to_analyze: A dict (of Project objects, or empty)
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively

    :returns: The dict of NGIProject objects that need to be run through the analysis pipeline
    :rtype: dict

    :raises KeyError: If a required configuration key is not available.
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes
    pattern = "(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                         config["analysis"]["upps_root"])
    matches = re.match(pattern, fc_dir)
    if matches:
        flowcell_root = matches.group(1)
    else:
        LOG.error("Cannot guess which project the flowcell {} belongs to".format(fc_dir))
        raise RuntimeError("Cannot guess which project the flowcell "
                           "{} belongs to".format(fc_dir))
    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError as e:
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warn("No projects found in specified flowcell directory \"{}\"".format(fc_dir))
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warn('Could not retrieve project id from Charon (record missing?). '
                     'Using project name ("{}") as project id '
                     '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            if not project_dir == project_sl_dir and \
                    not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
                    not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id, base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # Get the Library Prep ID for each file
            pattern = re.compile(".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
            fastq_files = filter(pattern.match, sample.get('files', []))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqrun object.
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path:
                        raise ValueError()
                    lane_num = re.match(r'[\w-]+_L\d{2}(\d)_\w+', fq_file).groups()[0]
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but only one '
                                     'library prep is present in Charon ("{}"). Using '
                                     'this as the library prep.'.format(project_name,
                                                                        sample_name,
                                                                        fc_full_id,
                                                                        fq_file,
                                                                        libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but a fallback '
                                     'libprep value of "{}" was supplied -- using this '
                                     'value.'.format(project_name,
                                                     sample_name,
                                                     fc_full_id,
                                                     fq_file,
                                                     fallback_libprep))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file)
                                           for fastq_file in seqrun_obj.fastq_files]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir, seqrun_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
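
# Runnable sketch of the lane parsing used above; the fastq name is a made-up
# CASAVA-style example, not real data.
import re

example_fq_file = "P123_1001_CGATGT_L003_R1_001.fastq.gz"  # hypothetical file name
example_lane_num = re.match(r'[\w-]+_L\d{2}(\d)_\w+', example_fq_file).groups()[0]
assert example_lane_num == "3"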
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False):
    """This function finds all data files relating to a sample and follows a
    preset decision path to decide which of them to include in a sample-level
    analysis. This can include fastq files, bam files, and alignment-qc-level
    files.
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available.
    # For each file, validate it.
    # This function goes into Charon and finds all valid libpreps and seqruns,
    # i.e. libpreps for which 'qc' != "FAILED"
    # and seqruns for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                                         sample_id=sample_obj.name,
                                                         include_failed_libpreps=False,
                                                         include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error("Notify user or whatever. I don't know.")

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample).
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error("TODO raise an error or something")
    fastq_files_to_analyze = []
    # Create a new NGIProject object (the old one could still be in use elsewhere)
    # Fix this later I've been coding for too long
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Access the filesystem to determine which alignment (bam) files are available.
    # If there are any, add them to the list of files to include in the new analysis.
    # Include alignment qc files.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS", project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir, "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(sample_name=sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir, sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir, sample_analysis_file_pattern))
    return (proj_obj, aln_files_to_copy, qc_files_to_copy)
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False, config=None, config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict config: The parsed configuration file.
    :param dict projects_to_analyze: A dict (of Project objects, or empty)
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively

    :returns: The dict of NGIProject objects that need to be run through the analysis pipeline
    :rtype: dict

    :raises KeyError: If a required configuration key is not available.
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects:
        restrict_to_projects = []
    if not restrict_to_samples:
        restrict_to_samples = []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes
    # Check the flowcell path to establish which group owns it
    pattern = ".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                     config["analysis"]["upps_root"])
    matches = re.match(pattern, fc_dir)
    if matches:
        flowcell_uppnexid = matches.group(1)
    else:
        LOG.error("Cannot guess which project (sthlm/uppsala) the flowcell {} "
                  "belongs to".format(fc_dir))
        raise RuntimeError("Cannot guess which project (sthlm/uppsala) the "
                           "flowcell {} belongs to".format(fc_dir))
    analysis_top_dir = os.path.abspath(os.path.join(config["analysis"]["base_root"],
                                                    flowcell_uppnexid,
                                                    config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError as e:
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning("No projects found in specified flowcell directory \"{}\"".format(fc_dir))
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        # Parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(samplesheet_path) \
            if samplesheet_path else None
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning('Could not retrieve project id from Charon (record missing?). '
                        'Using project name ("{}") as project id '
                        '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            if not project_dir == project_sl_dir and \
                    not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
                    not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id, base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # Get the Library Prep ID for each file
            pattern = re.compile(".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
            fastq_files = list(filter(pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqrun object.
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use the assignment from the SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(fq_file,
                                                                              samplesheet_sample_numbers,
                                                                              project_id)
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug('Unable to determine library prep from sample sheet file; '
                              'try to determine from Charon')
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but only one '
                                        'library prep is present in Charon ("{}"). Using '
                                        'this as the library prep.'.format(project_name,
                                                                           sample_name,
                                                                           fc_full_id,
                                                                           fq_file,
                                                                           libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but a fallback '
                                        'libprep value of "{}" was supplied -- using this '
                                        'value.'.format(project_name,
                                                        sample_name,
                                                        fc_full_id,
                                                        fq_file,
                                                        libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file)
                                           for fastq_file in seqrun_obj.fastq_files]
                        seqrun_dst_dir = os.path.join(project_obj.base_path, "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
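
# Runnable sketch of the fastq extension filter used above, with made-up file names.
import re

example_fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
example_candidates = ["P123_1001_CGATGT_L003_R1_001.fastq.gz",  # hypothetical names
                      "P123_1001_CGATGT_L003_R1_001.fq.bz2",
                      "Undetermined_S0_L001_R1_001.fastq",
                      "SampleSheet.csv"]
assert list(filter(example_fastq_pattern.match, example_candidates)) == example_candidates[:3]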
class TestPiperUtils(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.tmp_dir = tempfile.mkdtemp()
        self.workflow_subtask = 'subtask'
        self.project_base_path = self.tmp_dir
        self.project_name = 'S.One_20_12'
        self.project_id = 'P123'
        self.sample_id = 'P123_1001'
        self.libprep_id = 'A'
        self.seqrun_id = 'seqrun'
        self.project_obj = NGIProject(self.project_name,
                                      self.project_id,
                                      self.project_id,
                                      self.project_base_path)
        self.sample_obj = self.project_obj.add_sample(self.sample_id, self.sample_id)

    @classmethod
    def tearDownClass(self):
        shutil.rmtree(self.tmp_dir)

    def test_find_previous_genotype_analyses(self):
        project_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                   'piper_ngi', '01_genotype_concordance')
        os.makedirs(project_dir)
        sample_file = os.path.join(project_dir, 'P123_1001.gtc')
        open(sample_file, 'w').close()
        previous_analysis_not_done = utils.find_previous_genotype_analyses(self.project_obj,
                                                                           self.sample_obj)
        self.assertFalse(previous_analysis_not_done)

        sample_done_file = os.path.join(project_dir, '.P123_1001.gtc.done')
        open(sample_done_file, 'w').close()
        previous_analysis_done = utils.find_previous_genotype_analyses(self.project_obj,
                                                                       self.sample_obj)
        self.assertTrue(previous_analysis_done)
        shutil.rmtree(project_dir)  # Remove dir or it will interfere with other tests

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.os.remove')
    def test_remove_previous_genotype_analyses(self, mock_remove):
        project_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                   'piper_ngi', '02_genotype_concordance')
        os.makedirs(project_dir)
        sample_file = os.path.join(project_dir, 'P123-1001.gtc')
        open(sample_file, 'w').close()
        utils.remove_previous_genotype_analyses(self.project_obj)
        mock_remove.assert_called_once_with(sample_file)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.find_previous_sample_analyses')
    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.os.remove')
    def test_remove_previous_sample_analyses(self, mock_remove, mock_find):
        file_to_remove = os.path.join(self.tmp_dir, 'a_file')
        open(file_to_remove, 'w').close()
        mock_find.return_value = [file_to_remove]
        utils.remove_previous_sample_analyses(self.project_obj)
        mock_remove.assert_called_once_with(file_to_remove)

    def test_find_previous_sample_analyses(self):
        project_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                   'piper_ngi', '01_files')
        os.makedirs(project_dir)
        sample_file = os.path.join(project_dir, 'P123_1001.out')
        open(sample_file, 'w').close()
        got_sample_files = utils.find_previous_sample_analyses(self.project_obj)
        self.assertEqual(got_sample_files, [sample_file])

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.datetime')
    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.shutil.move')
    def test_rotate_previous_analysis(self, mock_move, mock_datetime):
        mock_datetime.datetime.now().strftime.return_value = '2020-11-13_09:30:12:640314'
        analysis_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                    'piper_ngi', '03_raw_alignments')
        os.makedirs(analysis_dir)
        sample_file = os.path.join(analysis_dir, 'P123-1001.bam')
        open(sample_file, 'w').close()
        utils.rotate_previous_analysis(self.project_obj)
        rotated_file = ('{}/ANALYSIS/P123/piper_ngi/previous_analyses/'
                        '2020-11-13_09:30:12:640314/03_raw_alignments'.format(self.tmp_dir))
        mock_move.assert_called_once_with(sample_file, rotated_file)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_get_finished_seqruns_for_sample(self, mock_charon):
        mock_charon().sample_get_libpreps.return_value = {'libpreps': [{'qc': 'PASS', 'libprepid': 'A'}]}
        mock_charon().libprep_get_seqruns.return_value = {'seqruns': [{'seqrunid': 'B'}]}
        mock_charon().seqrun_get.return_value = {'alignment_status': 'DONE'}
        got_libpreps = utils.get_finished_seqruns_for_sample(self.project_id, self.sample_id)
        expected_libpreps = {'A': ['B']}
        self.assertEqual(got_libpreps, expected_libpreps)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_get_valid_seqruns_for_sample(self, mock_charon):
        mock_charon().sample_get_libpreps.return_value = {'libpreps': [{'qc': 'PASS', 'libprepid': 'A'}]}
        mock_charon().libprep_get_seqruns.return_value = {'seqruns': [{'seqrunid': 'B'}]}
        got_libpreps = utils.get_valid_seqruns_for_sample(self.project_id, self.sample_id)
        expected_libpreps = {'A': ['B']}
        self.assertEqual(got_libpreps, expected_libpreps)

    def test_record_analysis_details(self):
        job_identifier = 'job_id'
        utils.record_analysis_details(self.project_obj, job_identifier)
        output_file_path = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                        'piper_ngi', 'logs', 'job_id.files')
        with open(output_file_path, 'r') as f:
            got_content = yaml.load(f, Loader=yaml.FullLoader)
        expected_content = {'P123': {'P123_1001': {}}}
        self.assertEqual(got_content, expected_content)

    def test_create_project_obj_from_analysis_log(self):
        log_path = os.path.join(self.project_base_path, 'ANALYSIS',
                                self.project_id, 'piper_ngi', 'logs')
        os.makedirs(log_path)
        log_file = os.path.join(log_path, 'P123-P123_1001-workflow.files')
        log_content = ['{P123: {P123_1001: {}}}']
        with open(log_file, 'w') as f:
            f.write('\n'.join(log_content))
        got_project_obj = utils.create_project_obj_from_analysis_log(self.project_name,
                                                                     self.project_id,
                                                                     self.project_base_path,
                                                                     self.sample_id,
                                                                     'workflow')
        self.assertEqual(got_project_obj, self.project_obj)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_check_for_preexisting_sample_runs(self, mock_charon):
        mock_charon().sample_get_libpreps.return_value = {'libpreps': [{'libprepid': 'A'}]}
        mock_charon().libprep_get_seqruns.return_value = {'seqruns': [{'seqrunid': 'B'}]}
        mock_charon().seqrun_get.return_value = {'alignment_status': 'RUNNING'}
        restart_running_jobs = False
        restart_finished_jobs = False
        with self.assertRaises(RuntimeError):
            utils.check_for_preexisting_sample_runs(self.project_obj,
                                                    self.sample_obj,
                                                    restart_running_jobs,
                                                    restart_finished_jobs)

    def test_create_sbatch_header(self):
        got_header = utils.create_sbatch_header('slurm_project_id',
                                                'slurm_queue',
                                                17,
                                                'slurm_time',
                                                'job_name',
                                                'slurm_out_log',
                                                'slurm_err_log')
        expected_header = """#!/bin/bash -l

#SBATCH -A slurm_project_id
#SBATCH -p slurm_queue
#SBATCH -n 16
#SBATCH -t slurm_time
#SBATCH -J job_name
#SBATCH -o slurm_out_log
#SBATCH -e slurm_err_log
"""
        self.assertEqual(got_header, expected_header)

    def test_add_exit_code_recording(self):
        cl = ['echo', 'Hello!']
        exit_code_path = '/some/path'
        got_cl = utils.add_exit_code_recording(cl, exit_code_path)
        expected_cl = 'echo Hello!; echo $? > /some/path'
        self.assertEqual(got_cl, expected_cl)

    def test_create_log_file_path(self):
        got_path = utils.create_log_file_path(self.workflow_subtask,
                                              self.project_base_path,
                                              self.project_name,
                                              self.project_id,
                                              sample_id=self.sample_id,
                                              libprep_id=self.libprep_id,
                                              seqrun_id=self.seqrun_id)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask.log'.format(self.tmp_dir)
        self.assertEqual(got_path, expected_path)

    def test_create_exit_code_file_path(self):
        got_path = utils.create_exit_code_file_path(self.workflow_subtask,
                                                    self.project_base_path,
                                                    self.project_name,
                                                    self.project_id,
                                                    sample_id=self.sample_id,
                                                    libprep_id=self.libprep_id,
                                                    seqrun_id=self.seqrun_id)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask.exit'.format(self.tmp_dir)
        self.assertEqual(got_path, expected_path)

    def test__create_generic_output_file_path(self):
        got_path = utils._create_generic_output_file_path(self.workflow_subtask,
                                                          self.project_base_path,
                                                          self.project_name,
                                                          self.project_id,
                                                          sample_id=self.sample_id,
                                                          libprep_id=self.libprep_id,
                                                          seqrun_id=self.seqrun_id)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask'.format(self.tmp_dir)
        self.assertEqual(got_path, expected_path)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """This function finds all data files relating to a sample and follows a
    preset decision path to decide which of them to include in a sample-level
    analysis. This can include fastq files, bam files, and alignment-qc-level
    files. Doesn't modify existing project or sample objects; returns new copies.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: A new NGIProject object and a list of previous analysis files to copy
    :rtype: NGIProject, list

    :raises ValueError: If there are no valid libpreps, seqruns, or fastq files
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available.
    # For each file, validate it.
    # This function goes into Charon and finds all valid libpreps and seqruns,
    # i.e. libpreps for which 'qc' != "FAILED"
    # and seqruns for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = \
        get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                     sample_id=sample_obj.name,
                                     include_failed_libpreps=False,
                                     include_done_seqruns=restart_finished_jobs,
                                     status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample).
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use elsewhere)
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, sample_obj)
    return (proj_obj, files_to_copy)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """This function finds all data files relating to a sample and follows a
    preset decision path to decide which of them to include in a sample-level
    analysis. This can include fastq files, bam files, and alignment-qc-level
    files. Doesn't modify existing project or sample objects; returns new copies.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: A new NGIProject object and a list of previous analysis files to copy
    :rtype: NGIProject, list

    :raises ValueError: If there are no valid libpreps, seqruns, or fastq files
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available.
    # For each file, validate it.
    # This function goes into Charon and finds all valid libpreps and seqruns,
    # i.e. libpreps for which 'qc' != "FAILED"
    # and seqruns for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = \
        get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                     sample_id=sample_obj.name,
                                     include_failed_libpreps=False,
                                     include_done_seqruns=restart_finished_jobs,
                                     status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample).
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use elsewhere)
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, sample_obj)
    return (proj_obj, files_to_copy)
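
# Runnable sketch of the path decomposition in the loop above, using a hypothetical
# fastq path under the DATA tree (project/sample/libprep/seqrun/file).
import os

example_fastq_path = ("DATA/P123/P123_1001/A/"
                      "140528_D00415_0049_BC423WACXX/P123_1001_CGATGT_L003_R1_001.fastq.gz")
example_base, example_fastq = os.path.split(example_fastq_path)
example_base, example_seqrun = os.path.split(example_base)
example_base, example_libprep = os.path.split(example_base)
assert (example_libprep, example_seqrun) == ("A", "140528_D00415_0049_BC423WACXX")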
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None,
                                     force_create_project=False,
                                     config=None, config_file_path=None):
    """Recreates the full project/sample/libprep/seqrun set of NGIObjects using
    the directory tree structure."""
    from ngi_pipeline.database.classes import CharonError
    from ngi_pipeline.database.communicate import get_project_id_from_name

    if not restrict_to_samples:
        restrict_to_samples = []
    if not restrict_to_libpreps:
        restrict_to_libpreps = []
    if not restrict_to_seqruns:
        restrict_to_seqruns = []

    project_dir = locate_project(project_dir)
    if os.path.islink(os.path.abspath(project_dir)):
        real_project_dir = os.path.realpath(project_dir)
        syml_project_dir = os.path.abspath(project_dir)
    else:
        real_project_dir = os.path.abspath(project_dir)
        search_dir = os.path.join(os.path.dirname(project_dir), "*")
        sym_files = filter(os.path.islink, glob.glob(search_dir))
        for sym_file in sym_files:
            if os.path.realpath(sym_file) == os.path.realpath(real_project_dir):
                syml_project_dir = os.path.abspath(sym_file)
                break
        else:
            syml_project_dir = None
    project_base_path, project_id = os.path.split(real_project_dir)
    if syml_project_dir:
        project_base_path, project_name = os.path.split(syml_project_dir)
    else:
        # Project name is the same as project id (Uppsala perhaps)
        project_name = project_id
    if os.path.split(project_base_path)[1] == "DATA":
        project_base_path = os.path.split(project_base_path)[0]

    LOG.info('Setting up project "{}"'.format(project_id))
    project_obj = NGIProject(name=project_name,
                             dirname=project_id,
                             project_id=project_id,
                             base_path=project_base_path)
    samples_pattern = os.path.join(real_project_dir, "*")
    samples = filter(os.path.isdir, glob.glob(samples_pattern))
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples '
                      '"{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps_pattern = os.path.join(sample_dir, "*")
        libpreps = filter(os.path.isdir, glob.glob(libpreps_pattern))
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps '
                          '"{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name, dirname=libprep_name)

            seqruns_pattern = os.path.join(libprep_dir, "*_*_*_*")
            seqruns = filter(os.path.isdir, glob.glob(seqruns_pattern))
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns '
                              '"{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name, dirname=seqrun_name)
                for fq_file in fastq_files_under_dir(seqrun_dir, realpath=False):
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None,
                                     force_create_project=False,
                                     config=None, config_file_path=None):
    """Recreates the full project/sample/libprep/seqrun set of NGIObjects using
    the directory tree structure."""
    from ngi_pipeline.database.classes import CharonError
    from ngi_pipeline.database.communicate import get_project_id_from_name

    if not restrict_to_samples:
        restrict_to_samples = []
    if not restrict_to_libpreps:
        restrict_to_libpreps = []
    if not restrict_to_seqruns:
        restrict_to_seqruns = []

    if os.path.islink(os.path.abspath(project_dir)):
        real_project_dir = os.path.realpath(project_dir)
        syml_project_dir = os.path.abspath(project_dir)
    else:
        real_project_dir = os.path.abspath(project_dir)
        search_dir = os.path.join(os.path.dirname(project_dir), "*")
        sym_files = filter(os.path.islink, glob.glob(search_dir))
        for sym_file in sym_files:
            if os.path.realpath(sym_file) == os.path.realpath(real_project_dir):
                syml_project_dir = os.path.abspath(sym_file)
                break
        else:
            syml_project_dir = None
    project_id = os.path.split(real_project_dir)[1]
    if syml_project_dir:
        project_name = os.path.split(syml_project_dir)[1]
    else:
        # Project name is the same as project id (Uppsala perhaps)
        project_name = project_id

    LOG.info('Setting up project "{}"'.format(project_id))
    project_obj = NGIProject(name=project_name,
                             dirname=project_id,
                             project_id=project_id,
                             base_path=config["analysis"]["top_dir"])
    samples_pattern = os.path.join(real_project_dir, "*")
    samples = filter(os.path.isdir, glob.glob(samples_pattern))
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples '
                      '"{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps_pattern = os.path.join(sample_dir, "*")
        libpreps = filter(os.path.isdir, glob.glob(libpreps_pattern))
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps '
                          '"{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name, dirname=libprep_name)

            seqruns_pattern = os.path.join(libprep_dir, "*_*_*_*")
            seqruns = filter(os.path.isdir, glob.glob(seqruns_pattern))
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns '
                              '"{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name, dirname=seqrun_name)
                for fq_file in fastq_files_under_dir(seqrun_dir):
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None):
    """Recreates the full project/sample/libprep/seqrun set of NGIObjects using
    the directory tree structure."""
    if not restrict_to_samples:
        restrict_to_samples = []
    if not restrict_to_libpreps:
        restrict_to_libpreps = []
    if not restrict_to_seqruns:
        restrict_to_seqruns = []

    base_path, project_name = os.path.split(project_dir)
    if not project_name:
        base_path, project_name = os.path.split(base_path)
    LOG.info('Setting up project "{}"'.format(project_name))
    try:
        # This requires Charon access -- maps e.g. "Y.Mom_14_01" to "P123"
        project_id = get_project_id_from_name(project_name)
    # Should handle requests.exceptions.Timeout in Charon classes
    except (CharonError, ValueError, Timeout) as e:
        error_msg = ('Cannot proceed with project "{}" due to '
                     'Charon-related error: {}'.format(project_name, e))
        raise CharonError(error_msg)
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=base_path)
    samples_pattern = os.path.join(project_dir, "*")
    samples = filter(os.path.isdir, glob.glob(samples_pattern))
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples '
                      '"{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps_pattern = os.path.join(sample_dir, "*")
        libpreps = filter(os.path.isdir, glob.glob(libpreps_pattern))
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps '
                          '"{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name, dirname=libprep_name)

            seqruns_pattern = os.path.join(libprep_dir, "*_*_*_*")
            seqruns = filter(os.path.isdir, glob.glob(seqruns_pattern))
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns '
                              '"{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name, dirname=seqrun_name)
                pattern = re.compile(".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
                all_files = glob.glob(os.path.join(seqrun_dir, "*"))
                fastq_files = filter(os.path.isfile, filter(pattern.match, all_files))
                for fq_file in fastq_files:
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
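
# The seqrun glob above ("*_*_*_*") keys off the four underscore-separated fields of a
# flowcell ID. glob applies fnmatch-style matching, so fnmatch can demonstrate the
# pattern without touching the filesystem (the IDs below are examples, not real runs).
import fnmatch

assert fnmatch.fnmatch("140528_D00415_0049_BC423WACXX", "*_*_*_*")
assert not fnmatch.fnmatch("A", "*_*_*_*")  # libprep-style dirnames are not picked up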
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False):
    """This function finds all data files relating to a sample and follows a
    preset decision path to decide which of them to include in a sample-level
    analysis. This can include fastq files, bam files, and alignment-qc-level
    files.
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available.
    # For each file, validate it.
    # This function goes into Charon and finds all valid libpreps and seqruns,
    # i.e. libpreps for which 'qc' != "FAILED"
    # and seqruns for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                                         sample_id=sample_obj.name,
                                                         include_failed_libpreps=False,
                                                         include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error("Notify user or whatever. I don't know.")

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample).
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error("TODO raise an error or something")
    fastq_files_to_analyze = []
    # Create a new NGIProject object (the old one could still be in use elsewhere)
    # Fix this later I've been coding for too long
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Access the filesystem to determine which alignment (bam) files are available.
    # If there are any, add them to the list of files to include in the new analysis.
    # Include alignment qc files.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS", project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir, "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(sample_name=sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir, sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir, sample_analysis_file_pattern))
    return (proj_obj, aln_files_to_copy, qc_files_to_copy)
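
# The alignment/qc globs above match file names that repeat the sample name twice.
# A runnable sketch via fnmatch (glob applies the same matching rules); the candidate
# names are hypothetical stand-ins for Piper output files.
import fnmatch

example_sample = "P123_1001"
example_aln_pattern = "{sample_name}.*.{sample_name}.*".format(sample_name=example_sample)
example_files = ["P123_1001.clean.dedup.P123_1001.bam",  # hypothetical output names
                 "P123_1002.clean.dedup.P123_1002.bam"]
assert [f for f in example_files if fnmatch.fnmatch(f, example_aln_pattern)] == example_files[:1]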
class TestCharonFunctions(unittest.TestCase):

    def setUp(self):
        # Details
        self.project_id = 'P100001'
        self.project_name = 'S.One_20_02'
        self.project_path = '/some/path'
        self.sample_id = 'P100001_101'
        self.libprep_id = 'A'
        self.seqrun_id = '201030_A00187_0332_AHFCFLDSXX'
        # Objects
        self.project_obj = NGIProject(name=self.project_name,
                                      dirname=self.project_name,
                                      project_id=self.project_id,
                                      base_path=self.project_path)
        self.sample_obj = self.project_obj.add_sample(name=self.sample_id,
                                                      dirname=self.sample_id)
        self.libprep_obj = self.sample_obj.add_libprep(name=self.libprep_id,
                                                       dirname=self.libprep_id)
        self.seqrun_obj = self.libprep_obj.add_seqrun(name=self.seqrun_id,
                                                      dirname=self.seqrun_id)

    @mock.patch.dict(os.environ, {'CHARON_BASE_URL': 'charon-url',
                                  'CHARON_API_TOKEN': 'token'})
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_create')
    def test_create_charon_entries_from_project(self, mock_seqrun, mock_libprep,
                                                mock_sample, mock_proj):
        create_charon_entries_from_project(self.project_obj)
        mock_proj.assert_called_once_with(best_practice_analysis='whole_genome_reseq',
                                          name='S.One_20_02',
                                          projectid='P100001',
                                          sequencing_facility='NGI-S',
                                          status='OPEN')
        mock_sample.assert_called_once_with(analysis_status='TO_ANALYZE',
                                            projectid='P100001',
                                            sampleid='P100001_101')
        mock_libprep.assert_called_once_with(libprepid='A',
                                             projectid='P100001',
                                             qc='PASSED',
                                             sampleid='P100001_101')
        mock_seqrun.assert_called_once_with(alignment_status='NOT_RUNNING',
                                            libprepid='A',
                                            mean_autosomal_coverage=0,
                                            projectid='P100001',
                                            sampleid='P100001_101',
                                            seqrunid='201030_A00187_0332_AHFCFLDSXX',
                                            total_reads=0)

    @mock.patch.dict(os.environ, {'CHARON_BASE_URL': 'charon-url',
                                  'CHARON_API_TOKEN': 'token'})
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_update')
    def test_create_charon_entries_from_project_update(self,
                                                       mock_seqrun_ud, mock_seqrun_cr,
                                                       mock_libprep_ud, mock_libprep_cr,
                                                       mock_sample_ud, mock_sample_cr,
                                                       mock_project_ud, mock_project_cr):
        # Not the neatest of tests but gets the job done...
        mock_project_cr.side_effect = CharonError('Error', status_code=400)
        mock_sample_cr.side_effect = CharonError('Error', status_code=400)
        mock_libprep_cr.side_effect = CharonError('Error', status_code=400)
        mock_seqrun_cr.side_effect = CharonError('Error', status_code=400)
        create_charon_entries_from_project(self.project_obj, force_overwrite=True)
        mock_project_ud.assert_called_once_with(best_practice_analysis='whole_genome_reseq',
                                                name='S.One_20_02',
                                                projectid='P100001',
                                                sequencing_facility='NGI-S',
                                                status='OPEN')
        mock_sample_ud.assert_called_once_with(analysis_status='TO_ANALYZE',
                                               projectid='P100001',
                                               sampleid='P100001_101',
                                               status='STALE')
        mock_libprep_ud.assert_called_once_with(libprepid='A',
                                                projectid='P100001',
                                                qc='PASSED',
                                                sampleid='P100001_101')
        mock_seqrun_ud.assert_called_once_with(alignment_status='NOT_RUNNING',
                                               libprepid='A',
                                               mean_autosomal_coverage=0,
                                               projectid='P100001',
                                               sampleid='P100001_101',
                                               seqrunid='201030_A00187_0332_AHFCFLDSXX',
                                               total_reads=0)