def verify_sample_sheet_file_names_in_file_list(sample_sheet_file, run_data_directory_file_list):
    """
    Check that every file referenced on the sample sheet appears in the given
    directory file list.

    Raises a SampleSheetError for the first referenced file that is missing.

    :param sample_sheet_file: path to the sample sheet to verify
    :param run_data_directory_file_list: file names found in the run directory
    :return: None
    """
    for sample in _parse_samples(sample_sheet_file):
        uploadable = sample.get_uploadable_dict()

        # File_Reverse is an empty string for single end runs, so only check
        # it when it is present; forward is always checked first
        files_to_check = [uploadable['File_Forward']]
        if len(uploadable['File_Reverse']) > 0:
            files_to_check.append(uploadable['File_Reverse'])

        for file_name in files_to_check:
            if file_name not in run_data_directory_file_list:
                raise exceptions.SampleSheetError((
                    "Your sample sheet is malformed. {} Does not match any file list in sample sheet file"
                    "".format(file_name)), sample_sheet_file)
def parse_metadata(sample_sheet_file):
    """
    Parse all lines under [Header], [Reads] and [BCLConvert_Settings] in .csv file

    Lines under [Reads] are collected into two lists: read cycle counts under
    key "readLengths" and index cycle counts under key "indexCycles"

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a dictionary containing the parsed key:pair values from .csv file

    raises SampleSheetError if no sections are found, or if the [Reads]
    section does not contain 1 or 2 read-cycle entries
    """
    metadata_dict = {"readLengths": [], "indexCycles": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[BCLConvert_Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[BCLConvert_Data]" in line:
            # metadata only lives above the data section, nothing more to read
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank lines / lines with an empty first cell
        if not line or not line[0]:
            continue

        if not section:
            logging.debug("Sample sheet is missing important sections: no sections were found")
            raise exceptions.SampleSheetError("Sample sheet is missing important sections: no sections were found.",
                                              sample_sheet_file)
        elif section == "reads":
            if line[0] == "Read1Cycles" or line[0] == "Read2Cycles":
                metadata_dict["readLengths"].append(line[1])
            elif line[0] == "Index1Cycles" or line[0] == "Index2Cycles":
                metadata_dict["indexCycles"].append(line[1])

    # currently sends just the larger readLengths
    if len(metadata_dict["readLengths"]) == 2:
        metadata_dict["layoutType"] = "PAIRED_END"
    elif len(metadata_dict["readLengths"]) == 1:
        metadata_dict["layoutType"] = "SINGLE_END"
    else:
        logging.debug("The sample sheet has invalid [Reads] sections: [Reads] section should have 2 or 4 entries.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "[Reads] section should have 2 or 4 entries.",
                                          sample_sheet_file)

    # compare cycle counts numerically: the values are strings, and a plain
    # max() would be lexicographic (e.g. "50" would beat "300")
    metadata_dict["readLengths"] = max(metadata_dict["readLengths"], key=int)
    # guard the empty case: a sheet without Index1Cycles/Index2Cycles entries
    # previously crashed with ValueError on max([]) here
    if metadata_dict["indexCycles"]:
        metadata_dict["indexCycles"] = max(metadata_dict["indexCycles"], key=int)

    return metadata_dict
def _resolve_sample_file_abs_path(file_name, data_dir, dir_file_list, sample_sheet_file):
    """
    Return file_name as an absolute path.

    Absolute paths are returned unchanged; bare file names found in
    dir_file_list are joined onto the absolute data directory; anything else
    raises a SampleSheetError.
    """
    if path.isabs(file_name):
        return file_name
    if file_name in dir_file_list:
        return path.join(path.abspath(data_dir), file_name)
    raise exceptions.SampleSheetError((
        "Your sample sheet is malformed. {} Does not match any file in the directory {}"
        "".format(file_name, data_dir)), sample_sheet_file)


def build_sample_list_from_sample_sheet_with_abs_path(sample_sheet_file):
    """
    Create a list of Sample objects, where each SequenceFile object has an
    absolute file path

    :param sample_sheet_file:
    :return: list of Sample objects with sequence_file attached
    """
    sample_list = _parse_samples(sample_sheet_file)

    # Data directory is used if file names on sample sheet are not absolute
    # paths (in directory files)
    data_dir = path.dirname(sample_sheet_file)
    sample_sheet_dir_file_list = common.get_file_list(data_dir)

    for sample in sample_list:
        sample_dict = sample.get_uploadable_dict()
        # File_Reverse is an empty string for single end runs
        paired_end_read = len(sample_dict['File_Reverse']) > 0

        # forward and reverse files are resolved identically, so share a helper
        sample_dict['File_Forward'] = _resolve_sample_file_abs_path(
            sample_dict['File_Forward'], data_dir, sample_sheet_dir_file_list, sample_sheet_file)
        file_list = [sample_dict['File_Forward']]

        if paired_end_read:
            sample_dict['File_Reverse'] = _resolve_sample_file_abs_path(
                sample_dict['File_Reverse'], data_dir, sample_sheet_dir_file_list, sample_sheet_file)
            file_list.append(sample_dict['File_Reverse'])

        # Create sequence file object and attach to sample
        sq = model.SequenceFile(file_list=file_list)
        sample.sequence_file = deepcopy(sq)

    return sample_list
def get_csv_reader(sample_sheet_file):
    """
    tries to create a csv.reader object which will be used to parse through
    the lines in SampleSheet.csv

    raises a SampleSheetError if sample_sheet_file is not a regular file

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a csv.reader object
    """
    if not os.path.isfile(sample_sheet_file):
        raise exceptions.SampleSheetError(
            "Sample sheet cannot be parsed as a CSV file because it's not a regular file.",
            sample_sheet_file)

    # use a context manager so the file handle is always closed (the previous
    # implementation leaked the handle); all lines are read eagerly, so the
    # reader does not need the file to remain open
    with open(sample_sheet_file, "r") as csv_file:
        # strip any trailing newline characters from the end of the line,
        # including Windows newline characters (\r\n)
        csv_lines = [x.rstrip('\r\n') for x in csv_file]

    return reader(csv_lines)
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[BCLConvert_Data]" in .csv file

    Keys in sample_key_translation_dict have their values changed for
    uploading to REST API
    All other keys keep the same name that they have in .csv file

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a list containing Sample objects that have been created by a
    dictionary from the parsed out key:pair values from .csv file
    """
    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    sample_key_translation_dict = {
        'Sample_ID': 'sampleName',
        'Sample_Project': 'sample_project'
    }

    # exposed as a function attribute, presumably for access from elsewhere
    # (e.g. tests) -- TODO confirm before removing
    _parse_samples.sample_key_translation_dict = sample_key_translation_dict

    # initialize dictionary keys from first line (data headers/attributes)
    set_attributes = False
    for line in csv_reader:
        if set_attributes:
            for item in line:
                if item in sample_key_translation_dict:
                    key_name = sample_key_translation_dict[item]
                else:
                    key_name = item
                sample_dict[key_name] = ""
            break

        if "[BCLConvert_Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [BCLConvert_Data] headers
    for sample_number, line in enumerate(csv_reader):
        # skip blank lines (e.g. a trailing empty line at the end of the file);
        # without this, a blank line raised a spurious SampleSheetError.
        # this matches the behaviour of the sibling directory parser.
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            """
            if there is one more Data header compared to the length of
            data values then add an empty string to the end of data values
            i.e the Description will be empty string
            assumes the last Data header is going to be the Description
            this handles the case where the last trailing comma is trimmed

            Shaun said this issue may come up when a user edits the
            SampleSheet from within the MiSeq software
            """
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(len(sample_dict.keys()), len(line), line)),
                    sample_sheet_file
                )

        for index, key in enumerate(sample_dict.keys()):
            sample_dict[key] = line[index].strip()  # assumes values are never empty

        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['sampleName']
        del new_sample_dict['sampleName']

        sample = model.Sample(
            sample_name=new_sample_name,
            description="",
            sample_number=sample_number + 1,
            samp_dict=new_sample_dict)

        sample_list.append(sample)

    return sample_list
def parse_metadata(sample_sheet_file):
    """
    Parse all lines under [Header], [Reads] and [Settings] in .csv file

    Lines under [Reads] are stored in a list with key name "readLengths"
    All other key names are translated according to the
    metadata_key_translation_dict

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a dictionary containing the parsed key:pair values from .csv file

    raises SampleSheetError if no sections or no [Reads] entries are found
    """
    metadata_dict = {"readLengths": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    metadata_key_translation_dict = {
        'Assay': 'assay',
        'Description': 'description',
        'Application': 'application',
        'Investigator Name': 'investigatorName',
        'Adapter': 'adapter',
        'AdapterRead2': 'adapterread2',
        'Workflow': 'workflow',
        'ReverseComplement': 'reversecomplement',
        'IEMFileVersion': 'iemfileversion',
        'Date': 'date',
        'Experiment Name': 'experimentName',
        'Chemistry': 'chemistry',
        'Project Name': 'projectName'
    }

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[Data]" in line:
            # metadata only lives above the data section, nothing more to read
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank lines / lines with an empty first cell
        if not line or not line[0]:
            continue

        if not section:
            logging.debug(
                "Sample sheet is missing important sections: no sections were found"
            )
            raise exceptions.SampleSheetError(
                "Sample sheet is missing important sections: no sections were found.",
                sample_sheet_file)
        elif section == "header":
            try:
                key_name = metadata_key_translation_dict[line[0]]
                metadata_dict[key_name] = line[1]
            except KeyError:
                # unknown header keys are logged and skipped, not fatal
                logging.debug("Unexpected key in header: [{}]".format(line[0]))
        elif section == "reads":
            metadata_dict["readLengths"].append(line[0])

    # currently sends just the larger readLengths
    if len(metadata_dict["readLengths"]) > 0:
        if len(metadata_dict["readLengths"]) == 2:
            metadata_dict["layoutType"] = "PAIRED_END"
        else:
            metadata_dict["layoutType"] = "SINGLE_END"
        # compare cycle counts numerically: the values are strings, and a
        # plain max() would be lexicographic (e.g. "50" would beat "300")
        metadata_dict["readLengths"] = max(metadata_dict["readLengths"], key=int)
    else:
        # this is an exceptional case, you can't have no read lengths!
        logging.debug(
            "The sample sheet is missing important sections: no [Reads] section found."
        )
        raise exceptions.SampleSheetError(
            "The sample sheet is missing important sections: no [Reads] section found.",
            sample_sheet_file)

    return metadata_dict
def parse_sample_list(sample_sheet_file, run_data_directory_file_list):
    """
    Creates a list of all sample data in the sample_sheet_file
    Verifies data is valid for uploading

    :param sample_sheet_file: path to the sample sheet being parsed
    :param run_data_directory_file_list: list of all files (bare names) in the run directory
    :return: list of Sample objects, each with a SequenceFile attached
    :raises SampleSheetError: if a referenced file is missing, or the sheet
        mixes paired end and single end samples
    """
    sample_list = _parse_samples(sample_sheet_file)
    data_dir = path.dirname(sample_sheet_file)
    # second lookup list: every directory file name joined onto the absolute
    # data dir, so sheets may also reference files by full path
    data_dir_file_list_full_path = []
    for file_name in run_data_directory_file_list:
        data_dir_file_list_full_path.append(
            path.join(path.abspath(data_dir), file_name))
    has_paired_end_read = False
    has_single_end_read = False

    logging.info(
        "Verifying data parsed from sample sheet {}".format(sample_sheet_file))

    for sample in sample_list:
        sample_dict = sample.get_uploadable_dict()
        # File_Reverse is an empty string for single end runs
        paired_end_read = len(sample_dict['File_Reverse']) > 0

        # keep track if we have both paired and single end reads
        if paired_end_read:
            has_paired_end_read = True
        else:
            has_single_end_read = True

        # Check if file names are in the files we found in the directory
        # (either as a bare name or as a full path)
        if ((sample_dict['File_Forward'] not in run_data_directory_file_list) and
                (sample_dict['File_Forward'] not in data_dir_file_list_full_path)):
            raise exceptions.SampleSheetError((
                "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                "".format(sample_dict['File_Forward'], data_dir)), sample_sheet_file)
        if ((paired_end_read and
                sample_dict['File_Reverse'] not in run_data_directory_file_list) and
                (paired_end_read and
                sample_dict['File_Reverse'] not in data_dir_file_list_full_path)):
            raise exceptions.SampleSheetError((
                "Your sample sheet is malformed. {} Does not match any file in the directory {}"
                "".format(sample_dict['File_Reverse'], data_dir)), sample_sheet_file)

        # create file list of full paths
        file_list = []
        # Add the dir to each file to create the full path
        # NOTE(review): files that were already given as full paths are not
        # appended to file_list here (only bare names that get joined are),
        # so their SequenceFile would be missing entries -- presumably this
        # parser expects names relative to the directory; confirm intent.
        # NOTE(review): the join uses the relative data_dir while the lookup
        # list above uses path.abspath(data_dir) -- verify this mismatch is
        # intended.
        if sample_dict['File_Forward'] not in data_dir_file_list_full_path:
            sample_dict['File_Forward'] = path.join(
                data_dir, sample_dict['File_Forward'])
            file_list.append(sample_dict['File_Forward'])
        if paired_end_read and sample_dict[
                'File_Reverse'] not in data_dir_file_list_full_path:
            sample_dict['File_Reverse'] = path.join(
                data_dir, sample_dict['File_Reverse'])
            file_list.append(sample_dict['File_Reverse'])

        # Create sequence file object and attach to sample
        sq = model.SequenceFile(file_list=file_list)
        sample.sequence_file = deepcopy(sq)

    # Verify we don't have both single end and paired end reads
    if has_single_end_read and has_paired_end_read:
        raise exceptions.SampleSheetError(
            ("Your sample sheet is malformed. "
             "SampleSheet cannot have both paired end and single end runs. "
             "Make sure all samples are either paired or single."),
            sample_sheet_file)

    return sample_list
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[Data]" in .csv file

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a list containing Sample objects that have been created by a
    dictionary from the parsed out key:pair values from .csv file

    raises SampleSheetError if a data row has an unexpected column count or a
    required field (anything other than File_Reverse) is empty
    """
    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    # headers that are recognized in the [Data] section; columns with other
    # headers are not added to sample_dict
    sample_key_list = [
        'Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'
    ]

    # initialize dictionary keys from first line (data headers/attributes)
    set_attributes = False
    for line in csv_reader:
        if set_attributes:
            for item in line:
                if item in sample_key_list:
                    key_name = item
                    sample_dict[key_name] = ""
            break

        if "[Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [Data] headers
    for sample_number, line in enumerate(csv_reader):
        # if the line is empty (like a blank line at the end of the file) continue
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            """
            if there is one more Data header compared to the length of
            data values then add an empty string to the end of data values
            i.e the File_Reverse will be empty string
            assumes the last Data header is going to be the File_Reverse
            this handles the case where the last trailing comma is trimmed
            when doing a single end run
            """
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(len(sample_dict.keys()), len(line), line)),
                    sample_sheet_file)

        for index, key in enumerate(sample_dict.keys()):
            value = line[index].strip()

            # Keys other than 'File_Reverse' cannot be empty
            # (use == here: "is 0" compared identity, which is a CPython
            # implementation detail and a SyntaxWarning on modern Python)
            if len(value) == 0:  # no value
                if key != 'File_Reverse':
                    raise exceptions.SampleSheetError(
                        ("Your sample sheet is malformed. {} in the [Data] section cannot be empty."
                         "".format(key)),
                        sample_sheet_file)

            sample_dict[key] = value

        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['Sample_Name']
        new_sample_project = new_sample_dict['Project_ID']
        new_sample_dict['sample_project'] = new_sample_project
        del new_sample_dict['Sample_Name']
        del new_sample_dict['Project_ID']

        sample = model.Sample(sample_name=new_sample_name,
                              description="",
                              sample_number=sample_number + 1,
                              samp_dict=new_sample_dict)
        sample_list.append(sample)

    return sample_list
def get_sequencing_run(self, sample_sheet, run_data_directory_file_list=None):
    """
    Does local validation on the integrity of the run directory / sample sheet

    Throws a ValidationError with a validation result attached if it cannot make a sequencing run

    :param sample_sheet: path to the sample sheet to build the run from
    :param run_data_directory_file_list: Optional: List of files in the data directory to verify against the
    SampleList.csv file. This is used when deploying the parsers on a cloud environment.
    :return: SequencingRun
    :raises ValidationError: on any validation or system failure; the
        accumulated ValidationResult is attached to the exception
    """

    # Try to get the sample sheet, validate that the sample sheet is valid
    validation_result = validation.validate_sample_sheet(sample_sheet)
    if not validation_result.is_valid():
        logging.error("Errors occurred while getting sample sheet")
        raise exceptions.ValidationError(
            "Errors occurred while getting sample sheet", validation_result)

    # When running with a premade file list, verify files on sample_sheet are in file list
    try:
        if run_data_directory_file_list is not None:
            sample_parser.verify_sample_sheet_file_names_in_file_list(
                sample_sheet, run_data_directory_file_list)
    except (exceptions.SequenceFileError, exceptions.SampleSheetError) as error:
        # expected parsing failures: record on the result and re-raise wrapped
        validation_result.add_error(error)
        logging.error(
            "Errors occurred while building sequence run from sample sheet"
        )
        raise exceptions.ValidationError(
            "Errors occurred while building sequence run from sample sheet",
            validation_result)
    except Exception as error:
        # anything else is treated as a system error, still surfaced as a
        # ValidationError so callers get the partial validation_result
        validation_result.add_error(error)
        logging.error("System error while building sequencing run")
        raise exceptions.ValidationError(
            "System error while building sequencing run", validation_result)

    # Build a list of sample objects from sample sheet
    # (the no_verify variant is used with a premade file list because the
    # files were already verified above)
    try:
        if run_data_directory_file_list is not None:
            sample_list = sample_parser.build_sample_list_from_sample_sheet_no_verify(
                sample_sheet)
        else:
            sample_list = sample_parser.build_sample_list_from_sample_sheet_with_abs_path(
                sample_sheet)
    except (exceptions.DirectoryError, exceptions.SampleSheetError) as error:
        validation_result.add_error(error)
        logging.error("Errors occurred while parsing files")
        raise exceptions.ValidationError(
            "Errors occurred while parsing files", validation_result)
    except Exception as error:
        validation_result.add_error(error)
        logging.error("System error while parsing files")
        raise exceptions.ValidationError(
            "System error while parsing files", validation_result)

    # verify samples in sample_list are all of one type, either single or paired end
    if not sample_parser.only_single_or_paired_in_sample_list(sample_list):
        e = exceptions.SampleSheetError((
            "Your sample sheet is malformed. "
            "SampleSheet cannot have both paired end and single end runs. "
            "Make sure all samples are either paired or single."),
            sample_sheet)
        validation_result.add_error(e)
        logging.error(
            "Error occurred while building file list: Sample sheet has both paired and single end reads"
        )
        raise exceptions.ValidationError(
            "Errors occurred while building file list.", validation_result)

    # Try to build sequencing run from sample sheet & meta data, raise validation error if errors occur
    # NOTE(review): parse_metadata is called with the sample_list here, while
    # the sheet-file parse_metadata variants take a file path -- presumably
    # this sample_parser module has its own variant; confirm.
    try:
        run_metadata = sample_parser.parse_metadata(sample_list)
        sequencing_run = common.build_sequencing_run_from_samples(
            sample_list, run_metadata, self.get_parser_type_name())
    except exceptions.SequenceFileError as error:
        validation_result.add_error(error)
        logging.error(
            "Errors occurred while building sequence run from sample sheet"
        )
        raise exceptions.ValidationError(
            "Errors occurred while building sequence run from sample sheet",
            validation_result)

    return sequencing_run
def validate_sample_sheet(sample_sheet_file):
    """
    Checks if the given sample_sheet_file can be parsed

    Requires a [Data] section (used for creating Sample objects) containing
    the Sample_Name, Project_ID, File_Forward and File_Reverse table headers

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns ValidationResult object - stores list of string error messages
    """
    csv_reader = common.get_csv_reader(sample_sheet_file)

    v_res = model.ValidationResult()

    all_data_headers_found = False
    data_sect_found = False
    check_data_headers = False

    # status of required data headers
    found_data_headers = {
        "Sample_Name": False,
        "Project_ID": False,
        "File_Forward": False,
        "File_Reverse": False
    }

    for line in csv_reader:
        if "[Data]" in line:
            data_sect_found = True
            # lines after [Data] are scanned for the required headers until
            # all of them have been seen
            check_data_headers = True
        elif check_data_headers:
            for data_header in found_data_headers.keys():
                if data_header in line:
                    found_data_headers[data_header] = True

            # if all required dataHeaders are found
            if all(found_data_headers.values()):
                all_data_headers_found = True
                check_data_headers = False

    if not all([data_sect_found, all_data_headers_found]):
        if not data_sect_found:
            v_res.add_error(
                exceptions.SampleSheetError(
                    "[Data] section not found in SampleSheet",
                    sample_sheet_file))
        if not all_data_headers_found:
            # join produces the same "a, b" text the old concat-and-trim did
            missing_str = ", ".join(
                header for header, found in found_data_headers.items()
                if not found)
            v_res.add_error(
                exceptions.SampleSheetError(
                    "Missing required data header(s): " + missing_str,
                    sample_sheet_file))

    return v_res