def test_get_csv_reader_no_sheet(self):
    """
    Verify that SampleSheetError is raised when the csv reader is handed
    a path that is not a valid sample sheet file (here: a directory).
    """
    not_a_sheet = path.join(path_to_module, "fake_ngs_data", "Alignment_1")
    with self.assertRaises(SampleSheetError):
        common.get_csv_reader(not_a_sheet)
def test_get_csv_reader_no_sheet(self):
    """
    Handing the parser something that is not a sample sheet must raise
    SampleSheetError rather than returning a reader.
    """
    bogus_target = os.path.join(path_to_module, "fake_dir_data")
    with self.assertRaises(SampleSheetError):
        common.get_csv_reader(bogus_target)
def parse_metadata(sample_sheet_file):
    """
    Parse all lines under [Header], [Reads] and [BCLConvert_Settings] in .csv file

    Read cycle counts ("Read1Cycles"/"Read2Cycles") under [Reads] are collected
    under the key "readLengths"; index cycle counts ("Index1Cycles"/"Index2Cycles")
    are collected under "indexCycles".

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a dictionary containing the parsed key:pair values from .csv file

    raises exceptions.SampleSheetError if no section headers are found, or the
    [Reads] section is missing read or index cycle entries
    """
    metadata_dict = {"readLengths": [], "indexCycles": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[BCLConvert_Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[BCLConvert_Data]" in line:
            # metadata lives above the data section; nothing more to parse
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank / padding lines between sections
        if not line or not line[0]:
            continue

        if not section:
            logging.debug("Sample sheet is missing important sections: no sections were found")
            raise exceptions.SampleSheetError(
                "Sample sheet is missing important sections: no sections were found.",
                sample_sheet_file)
        elif section == "reads":
            if line[0] == "Read1Cycles" or line[0] == "Read2Cycles":
                metadata_dict["readLengths"].append(line[1])
            elif line[0] == "Index1Cycles" or line[0] == "Index2Cycles":
                metadata_dict["indexCycles"].append(line[1])

    # two read-cycle entries -> paired end; one -> single end
    if len(metadata_dict["readLengths"]) == 2:
        metadata_dict["layoutType"] = "PAIRED_END"
    elif len(metadata_dict["readLengths"]) == 1:
        metadata_dict["layoutType"] = "SINGLE_END"
    else:
        logging.debug("The sample sheet has invalid [Reads] sections: [Reads] section should have 2 or 4 entries.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "[Reads] section should have 2 or 4 entries.",
                                          sample_sheet_file)

    # guard: without this, max([]) below would raise an opaque ValueError
    # when the sheet has no Index1Cycles/Index2Cycles rows
    if not metadata_dict["indexCycles"]:
        logging.debug("The sample sheet has invalid [Reads] sections: no index cycle entries found.")
        raise exceptions.SampleSheetError("The sample sheet has invalid [Reads] sections: "
                                          "[Reads] section should have 2 or 4 entries.",
                                          sample_sheet_file)

    # currently sends just the larger value; entries are strings, so compare
    # numerically — a plain max() is lexicographic (e.g. "75" > "300")
    metadata_dict["readLengths"] = max(metadata_dict["readLengths"], key=int)
    metadata_dict["indexCycles"] = max(metadata_dict["indexCycles"], key=int)

    return metadata_dict
def test_get_csv_reader_valid_sheet(self):
    """
    Given a valid sample sheet, ensure each parsed row matches the
    corresponding expected row.
    """
    sheet_file = os.path.join(path_to_module, "test_csv_reader.csv")
    parsed_rows = common.get_csv_reader(sheet_file)
    # This is a sample of what the miseq sample sheet looks like, but it also makes a good
    # example for what we want our csv reader to be able to parse.
    expected_rows = [
        ['[Header]'],
        ['IEMFileVersion', '4'],
        ['Investigator Name', 'Some Guy'],
        ['Experiment Name', '1'],
        ['Date', '10/15/2013'],
        ['Workflow', 'GenerateFASTQ'],
        ['Application', 'FASTQ Only'],
        ['Assay', 'Nextera XT'],
        ['Description', 'Superbug'],
        ['Chemistry', 'Amplicon'],
        [],
        ['[Reads]'],
        ['251'],
        ['250'],
        [],
        ['[Settings]'],
        ['ReverseComplement', '0'],
        ['Adapter', 'AAAAGGGGAAAAGGGGAAA'],
        [],
        ['[Data]'],
        ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well', 'I7_Index_ID',
         'index', 'I5_Index_ID', 'index2', 'Sample_Project', 'Description'],
        ['01-1111', '01-1111', '1', '01', 'N01', 'AAAAAAAA', 'S01', 'TTTTTTTT', '6', 'Super bug '],
        ['02-2222', '02-2222', '2', '02', 'N02', 'GGGGGGGG', 'S02', 'CCCCCCCC', '6', 'Scary bug '],
        ['03-3333', '03-3333', '3', '03', 'N03', 'CCCCCCCC', 'S03', 'GGGGGGGG', '6', 'Deadly bug ']
    ]
    # pairwise comparison; zip stops at the shorter sequence, matching the
    # original behaviour
    for parsed_row, expected_row in zip(parsed_rows, expected_rows):
        self.assertEqual(parsed_row, expected_row)
def test_get_csv_reader_valid_sheet(self):
    """
    A well-formed SampleSheet.csv should parse into the exact expected
    list-of-rows structure.
    """
    sheet_file = path.join(path_to_module, "fake_ngs_data", "SampleSheet.csv")
    parsed_rows = common.get_csv_reader(sheet_file)
    expected_rows = [
        ['[Header]'],
        ['Local Run Manager Analysis Id', '4004'],
        ['Experiment Name', '1'],
        ['Date', '10/15/2013'],
        ['Workflow', 'GenerateFastQWorkflow'],
        ['Description', 'Superbug'],
        ['Chemistry', 'Amplicon'],
        [],
        ['[Reads]'],
        ['151'],
        ['151'],
        [],
        ['[Settings]'],
        ['Adapter', 'AAAAGGGGAAAAGGGGAAA'],
        [],
        ['[Data]'],
        ['Sample_ID', 'Sample_Name', 'index', 'I7_Index_ID', 'index2',
         'I5_Index_ID', 'Sample_Project'],
        ['01-1111-4004', '01-1111', 'AAAAAAAA', 'N01', 'TTTTTTTT', 'S01', '6'],
        ['02-2222-4004', '02-2222', 'GGGGGGGG', 'N02', 'CCCCCCCC', 'S02', '6'],
        ['03-3333-4004', '03-3333', 'CCCCCCCC', 'N03', 'GGGGGGGG', 'S03', '6']
    ]
    for parsed_row, expected_row in zip(parsed_rows, expected_rows):
        self.assertEqual(parsed_row, expected_row)
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[Data]" in .csv file
    Keys in sample_key_translation_dict have their values changed for
        uploading to REST API
    All other keys keep the same name that they have in .csv file

    arguments:
            sample_sheet_file -- path to UploadList.csv

    returns a list containing Sample objects that have been created by a
        dictionary from the parsed out key:pair values from .csv file
    """
    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    sample_key_translation_dict = {
        'Sample_ID': 'sampleName',
        'Sample_Project': 'sample_project'
    }

    # exposed as an attribute on the function object; presumably read by
    # tests or other modules — TODO confirm before removing
    _parse_samples.sample_key_translation_dict = sample_key_translation_dict

    # initialize dictionary keys from first line (data headers/attributes)
    # NOTE: set_attributes is flipped when "[BCLConvert_Data]" is seen, so
    # the NEXT iteration (the header row) populates sample_dict, then breaks.
    set_attributes = False
    for line in csv_reader:
        if set_attributes:
            for item in line:
                # translate known headers to REST API names; keep others as-is
                if item in sample_key_translation_dict:
                    key_name = sample_key_translation_dict[item]
                else:
                    key_name = item
                sample_dict[key_name] = ""
            break
        if "[BCLConvert_Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [Data] headers
    # (csv_reader was partially consumed by the loop above, so enumeration
    # resumes at the first data row)
    for sample_number, line in enumerate(csv_reader):
        if len(sample_dict.keys()) != len(line):
            """
            if there is one more Data header compared to the length of
            data values then add an empty string to the end of data values
            i.e the Description will be empty string
            assumes the last Data header is going to be the Description
            this handles the case where the last trailing comma is trimmed

            Shaun said this issue may come up when a user edits the
            SampleSheet from within the MiSeq software
            """
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(len(sample_dict.keys()), len(line), line)),
                    sample_sheet_file
                )

        for index, key in enumerate(sample_dict.keys()):
            sample_dict[key] = line[index].strip()  # assumes values are never empty

        # copy before mutating so the template sample_dict keeps all keys
        # for the next row
        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['sampleName']
        del new_sample_dict['sampleName']

        sample = model.Sample(
            sample_name=new_sample_name,
            description="",
            sample_number=sample_number + 1,
            samp_dict=new_sample_dict)
        sample_list.append(sample)

    return sample_list
def parse_metadata(sample_sheet_file):
    """
    Parse all lines under [Header], [Reads] and [Settings] in .csv file
    Lines under [Reads] are stored in a list with key name "readLengths"
    All other key names are translated according to the
        metadata_key_translation_dict

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a dictionary containing the parsed key:pair values from .csv file

    raises exceptions.SampleSheetError if no section headers are found or
    no [Reads] entries exist
    """
    metadata_dict = {"readLengths": []}

    csv_reader = common.get_csv_reader(sample_sheet_file)

    # maps sheet header names to the camelCase names used for uploading
    metadata_key_translation_dict = {
        'Assay': 'assay',
        'Description': 'description',
        'Application': 'application',
        'Investigator Name': 'investigatorName',
        'Adapter': 'adapter',
        'AdapterRead2': 'adapterread2',
        'Workflow': 'workflow',
        'ReverseComplement': 'reversecomplement',
        'IEMFileVersion': 'iemfileversion',
        'Date': 'date',
        'Experiment Name': 'experimentName',
        'Chemistry': 'chemistry',
        'Project Name': 'projectName'
    }

    section = None

    for line in csv_reader:
        if "[Header]" in line or "[Settings]" in line:
            section = "header"
            continue
        elif "[Reads]" in line:
            section = "reads"
            continue
        elif "[Data]" in line:
            # metadata lives above the data section; nothing more to parse
            break
        elif line and line[0].startswith("["):
            section = "unknown"
            continue

        # skip blank / padding lines between sections
        if not line or not line[0]:
            continue

        if not section:
            logging.debug(
                "Sample sheet is missing important sections: no sections were found"
            )
            raise exceptions.SampleSheetError(
                "Sample sheet is missing important sections: no sections were found.",
                sample_sheet_file)
        elif section == "header":
            try:
                key_name = metadata_key_translation_dict[line[0]]
                metadata_dict[key_name] = line[1]
            except KeyError:
                # unknown header keys are logged and skipped, not fatal
                logging.debug("Unexpected key in header: [{}]".format(line[0]))
        elif section == "reads":
            metadata_dict["readLengths"].append(line[0])

    # currently sends just the larger readLengths
    if len(metadata_dict["readLengths"]) > 0:
        if len(metadata_dict["readLengths"]) == 2:
            metadata_dict["layoutType"] = "PAIRED_END"
        else:
            metadata_dict["layoutType"] = "SINGLE_END"
        # read lengths are strings; compare numerically — a plain max()
        # is lexicographic (e.g. "75" would beat "300")
        metadata_dict["readLengths"] = max(metadata_dict["readLengths"], key=int)
    else:
        # this is an exceptional case, you can't have no read lengths!
        logging.debug(
            "The sample sheet is missing important sections: no [Reads] section found."
        )
        raise exceptions.SampleSheetError(
            "The sample sheet is missing important sections: no [Reads] section found.",
            sample_sheet_file)

    return metadata_dict
def _parse_samples(sample_sheet_file):
    """
    Parse all the lines under "[Data]" in .csv file

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns a list containing Sample objects that have been created by a
        dictionary from the parsed out key:pair values from .csv file

    raises exceptions.SampleSheetError if a data row has the wrong column
    count or a required value is empty
    """
    logging.info("Reading data from sample sheet {}".format(sample_sheet_file))

    csv_reader = common.get_csv_reader(sample_sheet_file)
    # start with an ordered dictionary so that keys are ordered in the same
    # way that they are inserted.
    sample_dict = OrderedDict()
    sample_list = []

    sample_key_list = [
        'Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'
    ]

    # initialize dictionary keys from first line (data headers/attributes);
    # set_attributes flips when "[Data]" is seen so the NEXT line (the
    # header row) populates sample_dict, then we break out
    set_attributes = False
    for line in csv_reader:
        if set_attributes:
            for item in line:
                if item in sample_key_list:
                    sample_dict[item] = ""
            break
        if "[Data]" in line:
            set_attributes = True

    # fill in values for keys. line is currently below the [Data] headers
    for sample_number, line in enumerate(csv_reader):
        # if the line is empty (like a blank line at the end of the file) continue
        if not line:
            continue

        if len(sample_dict.keys()) != len(line):
            # If there is exactly one more Data header than there are data
            # values, pad with an empty string: the File_Reverse column is
            # assumed to be last, and its trailing comma may have been
            # trimmed on a single end run.
            if len(sample_dict.keys()) - len(line) == 1:
                line.append("")
            else:
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. Expected to find {} "
                     "columns in the [Data] section, but only found {} columns "
                     "for line {}.".format(len(sample_dict.keys()), len(line), line)),
                    sample_sheet_file)

        for index, key in enumerate(sample_dict.keys()):
            value = line[index].strip()
            # Keys other than 'File_Reverse' cannot be empty.
            # (fixed: original used "len(value) is 0" — identity comparison
            # against an int literal; replaced with a truthiness check)
            if not value and key != 'File_Reverse':
                raise exceptions.SampleSheetError(
                    ("Your sample sheet is malformed. {} in the [Data] section cannot be empty."
                     "".format(key)),
                    sample_sheet_file)
            sample_dict[key] = value

        # (removed a redundant re-assignment of sample_key_list here — it
        # was never read after this point)

        # copy before mutating so the template sample_dict keeps all keys
        # for the next row
        new_sample_dict = deepcopy(sample_dict)
        new_sample_name = new_sample_dict['Sample_Name']
        new_sample_dict['sample_project'] = new_sample_dict['Project_ID']
        del new_sample_dict['Sample_Name']
        del new_sample_dict['Project_ID']

        sample = model.Sample(sample_name=new_sample_name,
                              description="",
                              sample_number=sample_number + 1,
                              samp_dict=new_sample_dict)
        sample_list.append(sample)

    return sample_list
def validate_sample_sheet(sample_sheet_file):
    """
    Checks if the given sample_sheet_file can be parsed

    Requires a [Data] section for creating Sample objects, and requires the
    Sample_Name, Project_ID, File_Forward and File_Reverse table headers on
    the line directly after [Data].
    (NOTE(review): the original docstring mentioned [Header]/Workflow and
    Sample_ID/Description headers, but this code only checks the headers
    listed above — confirm against callers.)

    arguments:
            sample_sheet_file -- path to SampleSheet.csv

    returns ValidationResult object - stores list of string error messages
    """
    csv_reader = common.get_csv_reader(sample_sheet_file)

    v_res = model.ValidationResult()

    all_data_headers_found = False
    data_sect_found = False
    check_data_headers = False

    # status of required data headers
    found_data_headers = {
        "Sample_Name": False,
        "Project_ID": False,
        "File_Forward": False,
        "File_Reverse": False
    }

    for line in csv_reader:
        if "[Data]" in line:
            data_sect_found = True
            check_data_headers = True  # next line contains data headers
        elif check_data_headers:
            # mark every required header that appears on this (header) line
            for data_header in found_data_headers.keys():
                if data_header in line:
                    found_data_headers[data_header] = True

            # if all required dataHeaders are found
            if all(found_data_headers.values()):
                all_data_headers_found = True

            # only the single line after [Data] is inspected for headers
            check_data_headers = False

    if not all([data_sect_found, all_data_headers_found]):
        if data_sect_found is False:
            v_res.add_error(
                exceptions.SampleSheetError(
                    "[Data] section not found in SampleSheet",
                    sample_sheet_file))

        if all_data_headers_found is False:
            # build a comma-separated list of the headers still missing
            missing_str = ""
            for data_header in found_data_headers:
                if found_data_headers[data_header] is False:
                    missing_str = missing_str + data_header + ", "

            missing_str = missing_str[:-2]  # remove last ", "
            v_res.add_error(
                exceptions.SampleSheetError(
                    "Missing required data header(s): " + missing_str,
                    sample_sheet_file))

    return v_res