def _make_seq_run():
    """
    Build a SequencingRun backed by the fastq files found under fake_ngs_data

    Three samples are created, each owning one forward/reverse file pair,
    and all three are placed in a single project.

    :return: SequencingRun object
    """
    # Path components shared by every fastq file used below
    base_calls = (path_to_module, "fake_ngs_data", "Data", "Intensities", "BaseCalls")
    fastq_pairs = (
        ("01-1111_S1_L001_R1_001.fastq.gz", "01-1111_S1_L001_R2_001.fastq.gz"),
        ("02-2222_S1_L001_R1_001.fastq.gz", "02-2222_S1_L001_R2_001.fastq.gz"),
        ("03-3333_S1_L001_R1_001.fastq.gz", "03-3333_S1_L001_R2_001.fastq.gz"),
    )

    # One SequenceFile per forward/reverse pair
    sequence_files = [
        model.SequenceFile([path.join(*base_calls, file_name) for file_name in pair])
        for pair in fastq_pairs
    ]

    # Every sample is created with the same name, description and number
    samples = []
    for sequence_file in sequence_files:
        sample = model.Sample("test_sample", "description", 1)
        sample.sequence_file = sequence_file
        samples.append(sample)

    project = model.Project("test_project", samples, "description")
    return model.SequencingRun({"layoutType": "PAIRED_END"}, [project], "miseq")
def build_sample_list_from_sample_sheet_no_verify(sample_sheet_file):
    """
    Parse the sample sheet into Sample objects and attach a SequenceFile to each

    The file names are used exactly as the sample sheet gives them and nothing
    checks that they exist. This is used when a pre-generated file list is
    used (e.g. cloud deployment)

    :param sample_sheet_file: sample sheet to read the samples from
    :return: list of Sample objects, each with sequence_file set
    """
    samples = _parse_samples(sample_sheet_file)

    for entry in samples:
        uploadable = entry.get_uploadable_dict()

        # The forward file is always present
        files = [uploadable['File_Forward']]
        # A non-empty reverse entry means this is a paired end read
        if len(uploadable['File_Reverse']) > 0:
            files.append(uploadable['File_Reverse'])

        # Build the sequence file object and attach a copy of it to the sample
        entry.sequence_file = deepcopy(model.SequenceFile(file_list=files))

    return samples
def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list):
    """
    Creates a list of Sample Objects, each with its fastq files attached

    For every sample the file list is searched with the Illumina naming
    pattern first, and then with a looser pattern that ignores the sample
    number. Matched names are joined onto run_data_directory.

    :param sample_sheet_file: Sample Sheet file
    :param run_data_directory: Data directory including run directory (e.g. my_run/Data/Intensities/BaseCalls)
    :param run_data_directory_file_list: The list of all files in the data directory
    :return: list of Sample objects
    :raises exceptions.SequenceFileError: when no file matches a sample, or when
        the matched files do not pass _validate_pf_list
    """
    sample_list = _parse_samples(sample_sheet_file)

    for sample in sample_list:
        properties_dict = _parse_out_sequence_file(sample)
        escaped_name = re.escape(sample.sample_name)

        file_patterns = [
            # this is the Illumina-defined pattern for naming fastq files, from:
            # http://blog.basespace.illumina.com/2014/08/18/fastq-upload-in-now-available-in-basespace/
            "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=escaped_name, sample_number=sample.sample_number),
            # Deprecated behaviour, only tried when the **correct** pattern above
            # finds nothing: the sample number is not checked at all.
            "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=escaped_name),
        ]

        pf_list = []
        for file_pattern in file_patterns:
            logging.info("Looking for files with pattern {}".format(file_pattern))
            regex = re.compile(file_pattern)
            # match() anchors at the start of the file name. With search() a
            # sample named "A" would also pick up the files of a sample "BA".
            pf_list = list(filter(regex.match, run_data_directory_file_list))
            if pf_list:
                break

        if not pf_list:
            # we **still** didn't find anything. It's pretty likely, then that
            # there aren't any fastq files in the directory that match what
            # the sample sheet says...
            raise exceptions.SequenceFileError(
                ("The uploader was unable to find an files with a file name that ends with "
                 ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
                 "This usually happens when the Illumina MiSeq Reporter tool "
                 "does not generate any FastQ data.").format(
                    sample.sample_name, run_data_directory))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError(
                ("The following file list {} found in the directory {} is invalid. "
                 "Please verify the folder containing the sequence files matches the SampleSheet file").format(
                    pf_list, run_data_directory))

        # Add the dir to each file to create the full path
        pf_list = [path.join(run_data_directory, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
def _parse_sample_list(sample_sheet_file):
    """
    Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata

    Sequence files are looked up in <sample sheet dir>/Data/Intensities/BaseCalls/<sample_project>,
    where <sample_project> is read from each sample.

    :param sample_sheet_file: sample sheet file; its directory is the root of the search
    :return: list of samples, each with sequence_file set
    :raises exceptions.SequenceFileError: when the project directory is missing, when no
        file matches a sample, or when the matched files do not pass _validate_pf_list
    """
    sample_list = _parse_samples(sample_sheet_file)

    sample_sheet_dir = path.dirname(sample_sheet_file)
    base_data_dir = path.join(sample_sheet_dir, "Data", "Intensities", "BaseCalls")
    # Get the list of project directories that contain sample files
    project_dir_list = next(walk(base_data_dir))[1]
    # project directory name -> file names inside it, so that each directory
    # is listed once no matter how many samples share it
    project_file_lists = {}

    for sample in sample_list:
        project_directory = sample.get('sample_project')
        if project_directory not in project_dir_list:
            # The project number in the sample sheet does not match a project folder in the run directory
            raise exceptions.SequenceFileError(
                "The uploader was unable to find the directory '{}' in '{}'. "
                "Please verify your SampleSheet Sample_Project matches the directory structure"
                .format(project_directory, base_data_dir))

        project_data_dir = path.join(base_data_dir, project_directory)
        # Create a file list of the data directory, only hit the os once
        if project_directory not in project_file_lists:
            project_file_lists[project_directory] = next(walk(project_data_dir))[2]
        data_dir_file_list = project_file_lists[project_directory]

        properties_dict = _parse_out_sequence_file(sample)
        file_pattern = "{sample_name}_S(\\S+)_R(\\d+)_(\\S*)\\.fastq.*$".format(
            sample_name=re.escape(sample.sample_name))
        logging.info("Looking for files with pattern {}".format(file_pattern))
        regex = re.compile(file_pattern)
        # match() anchors at the start of the file name. With search() a sample
        # named "A" would also pick up the files of a sample named "BA".
        pf_list = list(filter(regex.match, data_dir_file_list))

        if not pf_list:
            # we didn't find anything
            raise exceptions.SequenceFileError((
                "The uploader was unable to find an files with a file name that ends with "
                ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
            ).format(sample.sample_name, project_data_dir))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError((
                "The following file list {} found in the directory {} is invalid. "
                "Please verify the folder containing the sequence files matches the SampleSheet file"
            ).format(pf_list, project_data_dir))

        # Add the dir to each file to create the full path
        pf_list = [path.join(project_data_dir, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
def build_sample_list_from_sample_sheet_with_abs_path(sample_sheet_file):
    """
    Parse the sample sheet into Sample objects whose SequenceFile holds absolute file paths

    A file name that is already absolute is used as is. Any other name must be
    listed in the sample sheet's directory and is joined onto the absolute
    form of that directory.

    :param sample_sheet_file: sample sheet file; its directory is the data directory
    :return: list of Sample objects, each with sequence_file set
    """
    samples = _parse_samples(sample_sheet_file)

    # Names on the sample sheet that are not absolute are resolved against this directory
    data_dir = path.dirname(sample_sheet_file)
    sample_sheet_dir_file_list = common.get_file_list(data_dir)

    for entry in samples:
        sample_dict = entry.get_uploadable_dict()

        # The forward file is always handled; the reverse file only for paired end reads
        read_keys = ['File_Forward']
        if len(sample_dict['File_Reverse']) > 0:
            read_keys.append('File_Reverse')

        # Collect the full path of every file belonging to this sample
        file_list = []
        for key in read_keys:
            if not path.isabs(sample_dict[key]):
                if sample_dict[key] not in sample_sheet_dir_file_list:
                    raise exceptions.SampleSheetError((
                        "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                        .format(sample_dict[key], data_dir)),
                        sample_sheet_file)
                # Not absolute yet: build the absolute path from file name + data dir
                sample_dict[key] = path.join(path.abspath(data_dir), sample_dict[key])
            file_list.append(sample_dict[key])

        # Build the sequence file object and attach a copy of it to the sample
        entry.sequence_file = deepcopy(model.SequenceFile(file_list=file_list))

    return samples
def test_send_and_get_sequence_files(self):
    """
    Sends a pair of sequence files to a newly created project and sample,
    then checks that the file names read back match what was sent
    :return:
    """
    # create the project and remember the identifier from the response
    project = model.Project(name="test_project_2", description="test_project_description")
    project_identifier = self.test_api.send_project(project)['resource']['identifier']

    # create the sample inside that project
    sample_name = "test_sample"
    self.test_api.send_sample(model.Sample(sample_name, "test_sample_desc"), project_identifier)

    # send the sequence files as part of a new paired end sequencing run
    file_names = ['file_1.fastq.gz', 'file_2.fastq.gz']
    sequence_file = model.SequenceFile(
        [path.join(path_to_module, "fake_dir_data", file_name) for file_name in file_names])
    upload_id = self.test_api.create_seq_run({'layoutType': 'PAIRED_END'}, 'miseq')
    self.test_api.send_sequence_files(sequence_file, sample_name, project_identifier, upload_id)

    # verify sequence files match what we sent to IRIDA
    returned_sequence_files = self.test_api.get_sequence_files(project_identifier, sample_name)
    for position, file_name in enumerate(file_names):
        self.assertEqual(returned_sequence_files[position]['fileName'], file_name)
def parse_sample_list(sample_sheet_file, run_data_directory_file_list):
    """
    Creates a list of all sample data in the sample_sheet_file
    Verifies data is valid for uploading

    A file on the sample sheet is accepted when it is either one of the names
    in run_data_directory_file_list, or the full path made from the absolute
    sample sheet directory and one of those names. Names that are not already
    such a full path are joined onto the sample sheet directory.

    :param sample_sheet_file: sample sheet file; its directory is the data directory
    :param run_data_directory_file_list: list of all files
    :return: list of Sample objects, each with sequence_file set
    :raises exceptions.SampleSheetError: when a listed file is not among the known files,
        or when the sheet mixes paired end and single end samples
    """
    sample_list = _parse_samples(sample_sheet_file)

    data_dir = path.dirname(sample_sheet_file)
    abs_data_dir = path.abspath(data_dir)
    # Built once as sets so the per-sample membership tests do not rescan lists
    full_path_names = {
        path.join(abs_data_dir, file_name) for file_name in run_data_directory_file_list}
    accepted_names = full_path_names.union(run_data_directory_file_list)

    has_paired_end_read = False
    has_single_end_read = False

    logging.info(
        "Verifying data parsed from sample sheet {}".format(sample_sheet_file))

    for sample in sample_list:
        sample_dict = sample.get_uploadable_dict()

        paired_end_read = len(sample_dict['File_Reverse']) > 0
        # keep track if we have both paired and single end reads
        if paired_end_read:
            has_paired_end_read = True
        else:
            has_single_end_read = True

        # The reverse file only takes part when the sample is paired end
        file_keys = ['File_Forward']
        if paired_end_read:
            file_keys.append('File_Reverse')

        # Check if file names are in the files we found in the directory
        for key in file_keys:
            if sample_dict[key] not in accepted_names:
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                    "".format(sample_dict[key], data_dir)),
                    sample_sheet_file)

        # create file list of full paths
        file_list = []
        for key in file_keys:
            # Add the dir to the file to create the full path, unless it already is one
            if sample_dict[key] not in full_path_names:
                sample_dict[key] = path.join(data_dir, sample_dict[key])
            # Always append: a file given as a full path passed validation above
            # and must still end up in the SequenceFile.
            file_list.append(sample_dict[key])

        # Create sequence file object and attach to sample
        sq = model.SequenceFile(file_list=file_list)
        sample.sequence_file = deepcopy(sq)

    # Verify we don't have both single end and paired end reads
    if has_single_end_read and has_paired_end_read:
        raise exceptions.SampleSheetError(
            ("Your sample sheet is malformed. "
             "SampleSheet cannot have both paired end and single end runs. "
             "Make sure all samples are either paired or single."),
            sample_sheet_file)

    return sample_list