def test_find_previous_sample_analyses(self):
    """Check that an existing sample output file is reported as a previous analysis.

    Creates an empty ``P123_1001.out`` file under
    ``<tmp_dir>/ANALYSIS/P123/piper_ngi/01_files`` and asserts that
    ``utils.find_previous_sample_analyses`` returns exactly that one path
    for ``self.project_obj``.
    """
    # NOTE(review): self.tmp_dir and self.project_obj are presumably prepared
    # by the test fixture (not visible here) -- confirm against setUp.
    analysis_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                'piper_ngi', '01_files')
    os.makedirs(analysis_dir)

    # An empty file is enough: only the path is compared below.
    expected_file = os.path.join(analysis_dir, 'P123_1001.out')
    with open(expected_file, 'w'):
        pass

    found_files = utils.find_previous_sample_analyses(self.project_obj)
    self.assertEqual(found_files, [expected_file])
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Gather the fastq files and previous analysis files for one sample.

    Looks up the valid libprep/seqrun combinations for the sample, scans the
    sample's DATA directory for fastq files, and keeps only the fastq files
    whose parent directories (``.../<libprep>/<seqrun>/<fastq>``) match one of
    those combinations. The input objects are not modified; a fresh
    NGIProject (with a fresh sample under it) is built and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Passed on as ``include_done_seqruns``
                                       when looking up valid seqruns
                                       (default False)
    :param str status_field: Which status field the seqrun lookup should
                             check (default "alignment_status")

    :returns: The newly built project object and the result of
              ``find_previous_sample_analyses`` for the new project/sample
    :rtype: tuple(NGIProject, list)

    :raises ValueError: If the seqrun lookup returns nothing, or if no fastq
                        files are found under the sample's data directory
    """
    ### FASTQ
    # Mapping of libprep name -> seqrun names that are eligible for analysis.
    # Failed libpreps are always excluded; finished seqruns are included only
    # when the caller asks to restart finished jobs.
    eligible_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not eligible_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Locate every fastq file below <base_path>/DATA/<project>/<sample>.
    sample_data_directory = os.path.join(project_obj.base_path,
                                         "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_paths = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_paths:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Build new objects instead of touching the ones passed in (the old
    # project object could still be in use elsewhere).
    new_project = NGIProject(project_obj.name,
                             project_obj.dirname,
                             project_obj.project_id,
                             project_obj.base_path)
    new_sample = new_project.add_sample(sample_obj.name, sample_obj.dirname)

    for fastq_path in fastq_paths:
        parent_dir, fastq_name = os.path.split(fastq_path)
        if not fastq_name:
            # A trailing slash leaves the last component empty; split again.
            parent_dir, fastq_name = os.path.split(parent_dir)
        parent_dir, seqrun_name = os.path.split(parent_dir)
        parent_dir, libprep_name = os.path.split(parent_dir)

        # Skip files whose libprep is unknown or whose seqrun is not listed
        # for that libprep.
        if seqrun_name not in eligible_seqruns.get(libprep_name, ()):
            continue

        libprep = new_sample.add_libprep(name=libprep_name,
                                         dirname=libprep_name)
        seqrun = libprep.add_seqrun(name=seqrun_name, dirname=seqrun_name)
        seqrun.add_fastq_files(fastq_name)

    ### EXISTING DATA
    # Whatever previous analysis output is still present gets carried over.
    previous_analysis_files = find_previous_sample_analyses(new_project,
                                                            new_sample)
    return new_project, previous_analysis_files
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Gather the fastq files and previous analysis files for one sample.

    Looks up the valid libprep/seqrun combinations for the sample, scans the
    sample's DATA directory for fastq files, and keeps only the fastq files
    whose parent directories (``.../<libprep>/<seqrun>/<fastq>``) match one of
    those combinations. The input objects are not modified; a fresh
    NGIProject (with a fresh sample under it) is built and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Passed on as ``include_done_seqruns``
                                       when looking up valid seqruns
                                       (default False)
    :param str status_field: Which status field the seqrun lookup should
                             check (default "alignment_status")

    :returns: The newly built project object and the result of
              ``find_previous_sample_analyses`` for the new project/sample
    :rtype: tuple(NGIProject, list)

    :raises ValueError: If the seqrun lookup returns nothing, or if no fastq
                        files are found under the sample's data directory
    """
    ### FASTQ
    # Mapping of libprep name -> seqrun names that are eligible for analysis.
    # Failed libpreps are always excluded; finished seqruns are included only
    # when the caller asks to restart finished jobs.
    eligible_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not eligible_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Locate every fastq file below <base_path>/DATA/<project>/<sample>.
    sample_data_directory = os.path.join(project_obj.base_path,
                                         "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_paths = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_paths:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Build new objects instead of touching the ones passed in (the old
    # project object could still be in use elsewhere).
    new_project = NGIProject(project_obj.name,
                             project_obj.dirname,
                             project_obj.project_id,
                             project_obj.base_path)
    new_sample = new_project.add_sample(sample_obj.name, sample_obj.dirname)

    for fastq_path in fastq_paths:
        parent_dir, fastq_name = os.path.split(fastq_path)
        if not fastq_name:
            # A trailing slash leaves the last component empty; split again.
            parent_dir, fastq_name = os.path.split(parent_dir)
        parent_dir, seqrun_name = os.path.split(parent_dir)
        parent_dir, libprep_name = os.path.split(parent_dir)

        # Skip files whose libprep is unknown or whose seqrun is not listed
        # for that libprep.
        if seqrun_name not in eligible_seqruns.get(libprep_name, ()):
            continue

        libprep = new_sample.add_libprep(name=libprep_name,
                                         dirname=libprep_name)
        seqrun = libprep.add_seqrun(name=seqrun_name, dirname=seqrun_name)
        seqrun.add_fastq_files(fastq_name)

    ### EXISTING DATA
    # Whatever previous analysis output is still present gets carried over.
    previous_analysis_files = find_previous_sample_analyses(new_project,
                                                            new_sample)
    return new_project, previous_analysis_files