def test_get_valid_seqruns_for_sample(self, mock_charon):
    """A passing libprep 'A' with one seqrun 'B' is reported as {'A': ['B']}.

    NOTE(review): mock_charon is presumably injected by a mock.patch
    decorator that is outside this view -- confirm.
    """
    # Canned Charon responses: one libprep that passed QC, owning one seqrun.
    libpreps_response = {'libpreps': [{'qc': 'PASS', 'libprepid': 'A'}]}
    seqruns_response = {'seqruns': [{'seqrunid': 'B'}]}
    mock_charon().sample_get_libpreps.return_value = libpreps_response
    mock_charon().libprep_get_seqruns.return_value = seqruns_response

    actual = utils.get_valid_seqruns_for_sample(self.project_id,
                                                self.sample_id)

    wanted = {'A': ['B']}
    self.assertEqual(actual, wanted)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False):
    """Collect the files to include in a sample-level analysis.

    Three things are gathered:

    * the fastq files under the sample's DATA directory whose libprep and
      seqrun (taken from the two directory levels above each file) are
      listed by get_valid_seqruns_for_sample;
    * existing alignment files for the sample under
      ANALYSIS/<project>/01_raw_alignments;
    * existing alignment-qc files for the sample under
      ANALYSIS/<project>/02_preliminary_alignment_qc.

    The accepted fastq files are attached to a freshly built NGIProject
    rather than to the objects passed in.

    Missing libpreps/seqruns or missing fastq files are only logged as
    errors; the function still returns (with an empty project hierarchy).

    :param NGIProject project_obj: The project to process
    :param NGISample sample_obj: The sample to process
    :param bool restart_finished_jobs: Forwarded to
        get_valid_seqruns_for_sample as include_done_seqruns (default False)

    :returns: The new project object, the alignment files found and the
        alignment-qc files found
    :rtype: tuple (NGIProject, list, list)
    """
    ### FASTQ
    # Ask Charon which libpreps and seqruns are valid for this sample, i.e.
    # libpreps for which 'qc' != "FAILED" and (unless restart_finished_jobs
    # is set) seqruns for which 'alignment_status' != "DONE".
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error('No valid libpreps/seqruns found for project/sample '
                  '"{}/{}"'.format(project_obj.project_id, sample_obj.name))

    # Find every fastq file on disk for this sample; they are validated
    # against the libpreps/seqruns compiled above.
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error('No fastq files found under "{}" for project/sample '
                  '"{}/{}"'.format(sample_data_directory,
                                   project_obj.project_id, sample_obj.name))

    # Create a new NGIProject object (the old one could still be in use
    # elsewhere) and recreate the sample/libprep/seqrun hierarchy beneath it.
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    new_sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            # Handles trailing slash
            base_path, fastq = os.path.split(base_path)
        # Expected layout: .../<libprep>/<seqrun>/<fastq file>
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        # An unknown libprep yields an empty list, so a single lookup skips
        # both invalid libpreps and invalid seqruns.
        if fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        libprep_obj = new_sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
        seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                            dirname=fs_seqrun_name)
        seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Look on the filesystem for alignment files and alignment-qc files that
    # already exist for this sample so they can be included in the analysis.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS",
                                        project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir,
                                     "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(
        sample_name=new_sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir,
                                               sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir,
                                              sample_analysis_file_pattern))
    return (proj_obj, aln_files_to_copy, qc_files_to_copy)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Collect the files to include in a sample-level analysis.

    Finds the fastq files under the sample's DATA directory, keeps those
    whose libprep and seqrun (taken from the two directory levels above each
    file) are listed by get_valid_seqruns_for_sample, and attaches them to a
    freshly built NGIProject. Any previous analysis results for the sample
    are then looked up with find_previous_sample_analyses.

    Doesn't add to the project or sample objects passed in; new ones are
    created and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: The new NGIProject object and whatever
        find_previous_sample_analyses returns for it (the existing files to
        copy over)
    :rtype: tuple (NGIProject, result of find_previous_sample_analyses)

    :raises ValueError: If no valid libpreps/seqruns are reported, or if no
        fastq files at all are found under the sample's data directory
    """
    ### FASTQ
    # Ask Charon which libpreps and seqruns are valid for this sample, i.e.
    # libpreps for which 'qc' != "FAILED" and (unless restart_finished_jobs
    # is set) seqruns for which the chosen status field != "DONE".
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Find every fastq file on disk for this sample; they are validated
    # against the libpreps/seqruns compiled above.
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use
    # elsewhere) and recreate the sample/libprep/seqrun hierarchy beneath it.
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    new_sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            # Handles trailing slash
            base_path, fastq = os.path.split(base_path)
        # Expected layout: .../<libprep>/<seqrun>/<fastq file>
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        # An unknown libprep yields an empty list, so a single lookup skips
        # both invalid libpreps and invalid seqruns.
        if fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        libprep_obj = new_sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
        seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                            dirname=fs_seqrun_name)
        seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, new_sample_obj)
    return (proj_obj, files_to_copy)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Decide which files belong in a sample-level analysis.

    The fastq files found under the sample's DATA directory are filtered
    against the libpreps/seqruns that get_valid_seqruns_for_sample reports
    as valid; the libprep and seqrun of a file are read from the two
    directory levels above it. Accepted files are attached to a newly
    created NGIProject. Previous analysis results for the sample are then
    looked up with find_previous_sample_analyses.

    Doesn't add to the project or sample objects passed in; new ones are
    created and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: The new NGIProject object and whatever
        find_previous_sample_analyses returns for it (the existing files to
        copy over)
    :rtype: tuple (NGIProject, result of find_previous_sample_analyses)

    :raises ValueError: If no valid libpreps/seqruns are reported, or if no
        fastq files at all are found under the sample's data directory
    """
    ### FASTQ
    # Query Charon for the valid libpreps and seqruns of this sample, i.e.
    # libpreps for which 'qc' != "FAILED" and (unless restart_finished_jobs
    # is set) seqruns for which the chosen status field != "DONE".
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Locate the fastq files available on disk; they are checked against the
    # valid libpreps/seqruns in the loop below.
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Build a new NGIProject (the old one could still be in use elsewhere)
    # and recreate the sample/libprep/seqrun hierarchy beneath it.
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    new_sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            # Handles trailing slash
            base_path, fastq = os.path.split(base_path)
        # Expected layout: .../<libprep>/<seqrun>/<fastq file>
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        # A libprep missing from the mapping yields an empty list, so this
        # one test rejects both invalid libpreps and invalid seqruns.
        if fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        libprep_obj = new_sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
        seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                            dirname=fs_seqrun_name)
        seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, new_sample_obj)
    return (proj_obj, files_to_copy)
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False):
    """Decide which files belong in a sample-level analysis.

    Three things are gathered:

    * the fastq files under the sample's DATA directory whose libprep and
      seqrun (read from the two directory levels above each file) are
      listed by get_valid_seqruns_for_sample;
    * existing alignment files for the sample under
      ANALYSIS/<project>/01_raw_alignments;
    * existing alignment-qc files for the sample under
      ANALYSIS/<project>/02_preliminary_alignment_qc.

    The accepted fastq files are attached to a newly created NGIProject
    rather than to the objects passed in.

    Missing libpreps/seqruns or missing fastq files are only logged as
    errors; the function still returns (with an empty project hierarchy).

    :param NGIProject project_obj: The project to process
    :param NGISample sample_obj: The sample to process
    :param bool restart_finished_jobs: Forwarded to
        get_valid_seqruns_for_sample as include_done_seqruns (default False)

    :returns: The new project object, the alignment files found and the
        alignment-qc files found
    :rtype: tuple (NGIProject, list, list)
    """
    ### FASTQ
    # Query Charon for the valid libpreps and seqruns of this sample, i.e.
    # libpreps for which 'qc' != "FAILED" and (unless restart_finished_jobs
    # is set) seqruns for which 'alignment_status' != "DONE".
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error('No valid libpreps/seqruns found for project/sample '
                  '"{}/{}"'.format(project_obj.project_id, sample_obj.name))

    # Locate the fastq files available on disk; they are checked against the
    # valid libpreps/seqruns in the loop below.
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error('No fastq files found under "{}" for project/sample '
                  '"{}/{}"'.format(sample_data_directory,
                                   project_obj.project_id, sample_obj.name))

    # Build a new NGIProject (the old one could still be in use elsewhere)
    # and recreate the sample/libprep/seqrun hierarchy beneath it.
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    new_sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            # Handles trailing slash
            base_path, fastq = os.path.split(base_path)
        # Expected layout: .../<libprep>/<seqrun>/<fastq file>
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        # A libprep missing from the mapping yields an empty list, so this
        # one test rejects both invalid libpreps and invalid seqruns.
        if fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        libprep_obj = new_sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
        seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                            dirname=fs_seqrun_name)
        seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Look on the filesystem for alignment files and alignment-qc files that
    # already exist for this sample so they can be included in the analysis.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS",
                                        project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir,
                                     "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(
        sample_name=new_sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir,
                                               sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir,
                                              sample_analysis_file_pattern))
    return (proj_obj, aln_files_to_copy, qc_files_to_copy)