def show(self, fmt=None):
    """
    Construct and return sample sheet contents

    The contents are generated by ``SampleSheet.show``. If the
    ``pad`` attribute is set then lines with fewer comma-separated
    fields than the data header have trailing commas appended.

    Arguments:
      fmt: format specifier, handed on unchanged to
        ``SampleSheet.show``

    Returns:
      String: the sample sheet contents.
    """
    contents = SampleSheet.show(self, fmt=fmt)
    if not self.pad:
        return contents
    width = len(self.data.header())
    rows = []
    for row in contents.split('\n'):
        # Number of comma-separated fields is one more than the
        # number of commas
        nfields = row.count(',') + 1
        if nfields < width:
            # NOTE(review): this appends (width - nfields - 1) commas,
            # which leaves the line with width-1 fields -- confirm
            # that this is the intended amount of padding
            row += ',' * (width - nfields - 1)
        rows.append(row)
    return '\n'.join(rows)
def show(self,fmt=None):
    """
    Construct and return sample sheet contents

    Delegates to ``SampleSheet.show`` to build the text and then,
    when the ``pad`` attribute is set, appends trailing commas to
    each line which has fewer comma-separated fields than the data
    header.

    Arguments:
      fmt: format specifier, handed on unchanged to
        ``SampleSheet.show``

    Returns:
      String: the sample sheet contents.
    """
    text = SampleSheet.show(self,fmt=fmt)
    if self.pad:
        target = len(self.data.header())

        def _pad(entry):
            # Lines already at (or beyond) the header width are
            # returned untouched
            shortfall = target - len(entry.split(','))
            if shortfall <= 0:
                return entry
            # NOTE(review): one comma fewer than the shortfall is
            # appended, so the result has target-1 fields -- confirm
            # that this is the intended amount of padding
            return entry + ','*(shortfall - 1)

        text = '\n'.join(_pad(entry) for entry in text.split('\n'))
    return text
class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:

    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:

    >>> linter.close_project_names()
    ...

    """
    def __init__(self,sample_sheet=None,sample_sheet_file=None,fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        # Initialise
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        if self._fp is not None:
            # File-like object wins over everything else
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            # Fall back to loading from the named file
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for name in self.project_names:
            project = self.get_project(name)
            for idx in project.sample_ids:
                yield (project,project.get_sample(idx))

    def close_project_names(self):
        """
        Return closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
            associated barcode; the values for each key are lists of
            the associated barcodes.

        """
        # Look for samples with multiple barcodes
        multiple_barcodes = {}
        for project,sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                multiple_barcodes[sample.sample_id] = \
                    list(sample.barcode_seqs)
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the project names for each sample ID
        samples = {}
        for project,sample in self.walk():
            samples.setdefault(sample.sample_id,[]).append(project.name)
        # Keep only the samples which appear in more than one project
        return {sample_id: projects
                for sample_id,projects in samples.items()
                if len(projects) > 1}

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The fields treated as required are inferred from the first
        data line: a field which is populated there must also be
        populated in every other line. The 'Lane' field is required
        whenever the sample sheet reports that it has lanes.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet; an empty list if
            the sample sheet has no data lines.

        """
        # Convenience variables
        data = self._sample_sheet.data
        sample_id = self._sample_sheet.sample_id_column
        sample_name = self._sample_sheet.sample_name_column
        sample_project = self._sample_sheet.sample_project_column
        # Look at first line to see which items have been provided
        try:
            first_line = data[0]
        except IndexError:
            # No data lines, so nothing can be invalid
            return []
        required = []
        if self._sample_sheet.has_lanes:
            required.append('Lane')
        if first_line[sample_id] != '':
            required.append(sample_id)
        if (sample_name is not None) and (first_line[sample_name] != ''):
            required.append(sample_name)
        if first_line[sample_project] != '':
            required.append(sample_project)
        # Look for invalid data lines i.e. those with an empty value
        # in at least one required field
        return [line for line in data
                if any(line[field] == '' for field in required)]

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:

        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        # Delegates to the module-level function of the same name
        return has_invalid_characters(text=self._sample_sheet.show())
class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_barcodes: check for lines with invalid barcode
      sequences
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:

    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:

    >>> linter.close_project_names()
    ...

    """
    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        # Initialise
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        if self._fp is not None:
            # File-like object wins over everything else
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            # Fall back to loading from the named file
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for name in self.project_names:
            project = self.get_project(name)
            for idx in project.sample_ids:
                yield (project, project.get_sample(idx))

    def close_project_names(self):
        """
        Return closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
            associated barcode; the values for each key are lists of
            the associated barcodes.

        """
        # Look for samples with multiple barcodes
        multiple_barcodes = {}
        for project, sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                multiple_barcodes[sample.sample_id] = \
                    list(sample.barcode_seqs)
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the project names for each sample ID
        samples = {}
        for project, sample in self.walk():
            samples.setdefault(sample.sample_id, []).append(project.name)
        # Keep only the samples which appear in more than one project
        return {sample_id: projects
                for sample_id, projects in samples.items()
                if len(projects) > 1}

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The fields treated as required are inferred from the first
        data line: a field which is populated there must also be
        populated in every other line. The 'Lane' field is required
        whenever the sample sheet reports that it has lanes.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet; an empty list if
            the sample sheet has no data lines.

        """
        # Convenience variables
        data = self._sample_sheet.data
        sample_id = self._sample_sheet.sample_id_column
        sample_name = self._sample_sheet.sample_name_column
        sample_project = self._sample_sheet.sample_project_column
        # Look at first line to see which items have been provided
        try:
            first_line = data[0]
        except IndexError:
            # No data lines, so nothing can be invalid
            return []
        required = []
        if self._sample_sheet.has_lanes:
            required.append('Lane')
        if first_line[sample_id] != '':
            required.append(sample_id)
        if (sample_name is not None) and (first_line[sample_name] != ''):
            required.append(sample_name)
        if first_line[sample_project] != '':
            required.append(sample_project)
        # Look for invalid data lines i.e. those with an empty value
        # in at least one required field
        return [line for line in data
                if any(line[field] == '' for field in required)]

    def has_invalid_barcodes(self):
        """
        Return list of lines with invalid barcodes

        Only the 'index' and 'index2' columns are checked, and only
        if they are present in the sample sheet header. Each line
        is reported at most once, even if both of its barcode
        sequences are invalid.

        Returns:
          List: list of lines which contain invalid barcode
            sequences in the sample sheet.

        """
        data = self._sample_sheet.data
        header = data.header()
        # Barcode columns which are actually present
        indices = [indx for indx in ('index', 'index2') if indx in header]
        if not indices:
            return []
        # any() stops at the first invalid barcode in a line, so a
        # line can't be appended once per bad column
        return [line for line in data
                if any(not barcode_is_valid(line[indx])
                       for indx in indices)]

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:

        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        # Delegates to the module-level function of the same name
        return has_invalid_characters(text=self._sample_sheet.show())