def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list):
    """
    Creates a list of Sample Objects

    Every sample parsed from the sample sheet gets a SequenceFile attached, built from
    the fastq files in the file list whose names match the sample.

    :param sample_sheet_file: Sample Sheet file
    :param run_data_directory: Data directory including run directory (e.g. my_run/Data/Intensities/BaseCalls)
    :param run_data_directory_file_list: The list of all files in the data directory
    :return: list of Sample objects
    :raises exceptions.SequenceFileError: if no fastq files match a sample,
        or if the matched file list fails validation
    """
    sample_list = _parse_samples(sample_sheet_file)

    def find_matching_files(file_pattern):
        # Shared lookup for the strict and the deprecated pattern.
        logging.info("Looking for files with pattern {}".format(file_pattern))
        regex = re.compile(file_pattern)
        # Anchor at the start of the file name: an unanchored search lets a sample
        # whose name is the tail of another sample's name (e.g. "1" and "11")
        # pick up the other sample's files.
        return [file_name for file_name in run_data_directory_file_list
                if regex.match(path.basename(file_name))]

    for sample in sample_list:
        properties_dict = _parse_out_sequence_file(sample)
        escaped_sample_name = re.escape(sample.sample_name)

        # this is the Illumina-defined pattern for naming fastq files, from:
        # http://blog.basespace.illumina.com/2014/08/18/fastq-upload-in-now-available-in-basespace/
        pf_list = find_matching_files(
            "{sample_name}_S{sample_number}_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                sample_name=escaped_sample_name, sample_number=sample.sample_number))

        if not pf_list:
            # OK. So we didn't find any files using the **correct** file name
            # definition according to Illumina. Let's try again with our deprecated
            # behaviour, where we didn't actually care about the sample number:
            pf_list = find_matching_files(
                "{sample_name}_S\\d+_L\\d{{3}}_R(\\d+)_\\S+\\.fastq.*$".format(
                    sample_name=escaped_sample_name))

            if not pf_list:
                # we **still** didn't find anything. It's pretty likely, then that
                # there aren't any fastq files in the directory that match what
                # the sample sheet says...
                raise exceptions.SequenceFileError(
                    ("The uploader was unable to find any files with a file name that ends with "
                     ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
                     "This usually happens when the Illumina MiSeq Reporter tool "
                     "does not generate any FastQ data.").format(
                        sample.sample_name, run_data_directory))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError(
                ("The following file list {} found in the directory {} is invalid. "
                 "Please verify the folder containing the sequence files matches the SampleSheet file").format(
                    pf_list, run_data_directory))

        # Add the dir to each file to create the full path
        pf_list = [path.join(run_data_directory, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list
def _parse_sample_list(sample_sheet_file):
    """
    Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata

    The fastq files for a sample are looked up in
    <sample sheet directory>/Data/Intensities/BaseCalls/<sample_project>
    and attached to the sample as a SequenceFile.

    :param sample_sheet_file: path to the sample sheet; the data directory is resolved relative to it
    :return: list of samples
    :raises exceptions.SequenceFileError: if a sample's project directory is missing,
        no fastq files match the sample name, or the matched file list fails validation
    """
    sample_list = _parse_samples(sample_sheet_file)

    sample_sheet_dir = path.dirname(sample_sheet_file)
    base_data_dir = path.join(sample_sheet_dir, "Data", "Intensities", "BaseCalls")
    # Get the list of project directories that contain sample files
    project_dir_list = next(walk(base_data_dir))[1]
    # File listings keyed by project data directory, so that each directory
    # is read from disk only once even when many samples share a project
    file_lists_by_dir = {}

    for sample in sample_list:
        project_directory = sample.get('sample_project')
        if project_directory not in project_dir_list:
            # The project number in the sample sheet does not match a project folder in the run directory
            raise exceptions.SequenceFileError(
                "The uploader was unable to find the directory '{}' in '{}'. "
                "Please verify your SampleSheet Sample_Project matches the directory structure"
                .format(project_directory, base_data_dir))

        project_data_dir = path.join(base_data_dir, project_directory)
        if project_data_dir not in file_lists_by_dir:
            file_lists_by_dir[project_data_dir] = next(walk(project_data_dir))[2]
        data_dir_file_list = file_lists_by_dir[project_data_dir]

        properties_dict = _parse_out_sequence_file(sample)
        file_pattern = "{sample_name}_S(\\S+)_R(\\d+)_(\\S*)\\.fastq.*$".format(
            sample_name=re.escape(sample.sample_name))
        logging.info("Looking for files with pattern {}".format(file_pattern))
        regex = re.compile(file_pattern)
        # The entries are bare file names, so anchor the pattern at the start:
        # an unanchored search lets a sample whose name is the tail of another
        # sample's name (e.g. "1" and "11") pick up the other sample's files.
        pf_list = [file_name for file_name in data_dir_file_list if regex.match(file_name)]

        if not pf_list:
            # we didn't find anything
            raise exceptions.SequenceFileError((
                "The uploader was unable to find any files with a file name that ends with "
                ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. "
            ).format(sample.sample_name, project_data_dir))

        # List of files may be invalid if directory searching in has been modified by user
        if not _validate_pf_list(pf_list):
            raise exceptions.SequenceFileError((
                "The following file list {} found in the directory {} is invalid. "
                "Please verify the folder containing the sequence files matches the SampleSheet file"
            ).format(pf_list, project_data_dir))

        # Add the dir to each file to create the full path
        pf_list = [path.join(project_data_dir, file_name) for file_name in pf_list]

        sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict)
        sample.sequence_file = deepcopy(sq)

    return sample_list