Example #1
0
    def show(self, fmt=None):
        """
        Construct and return sample sheet contents

        The text is produced by ``SampleSheet.show``; when the ``pad``
        attribute is set, lines with fewer comma-separated fields than
        the data header are then extended with trailing commas.

        Arguments:
          fmt: passed unchanged to ``SampleSheet.show``

        Returns:
          String: the (optionally padded) sample sheet contents.

        """
        text = SampleSheet.show(self, fmt=fmt)
        if not self.pad:
            return text
        width = len(self.data.header())

        def _pad(row):
            # Number of comma-separated fields already on this line
            nfields = row.count(',') + 1
            # NOTE(review): this many extra commas leaves the line with
            # (width - 1) fields rather than width - confirm intended
            extra = width - nfields - 1
            if extra > 0:
                return row + ',' * extra
            return row

        return '\n'.join([_pad(row) for row in text.split('\n')])
Example #2
0
    def show(self, fmt=None):
        """
        Construct and return sample sheet contents

        Delegates to ``SampleSheet.show`` to build the text. If the
        ``pad`` attribute is set then each line with fewer
        comma-separated fields than the data header has trailing
        commas appended.

        Arguments:
          fmt: forwarded as-is to ``SampleSheet.show``

        Returns:
          String: sample sheet contents, padded if requested.

        """
        contents = SampleSheet.show(self, fmt=fmt)
        if self.pad:
            header_width = len(self.data.header())
            lines = contents.split('\n')
            for i, text in enumerate(lines):
                fields_present = len(text.split(','))
                if fields_present >= header_width:
                    # Already at least as wide as the header
                    continue
                # NOTE(review): pads up to (header_width - 1) fields, one
                # short of the header - confirm this is the intent
                lines[i] = text + ',' * (header_width - fields_present - 1)
            contents = '\n'.join(lines)
        return contents
class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        # Source precedence: fp, then sample_sheet, then sample_sheet_file
        if self._fp is not None:
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for name in self.project_names:
            project = self.get_project(name)
            for idx in project.sample_ids:
                yield (project, project.get_sample(idx))

    def close_project_names(self):
        """
        Return closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
            associated barcode; the values for each key are lists of
            the associated barcodes.

        """
        multiple_barcodes = {}
        for _project, sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                # Store a copy so callers can't alter the sample's own data
                multiple_barcodes[sample.sample_id] = \
                    list(sample.barcode_seqs)
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the names of the projects that each sample ID is seen in
        projects_for_sample = {}
        for project, sample in self.walk():
            projects_for_sample.setdefault(sample.sample_id, []) \
                               .append(project.name)
        # Only keep samples seen in more than one project
        multiple_projects = {}
        for sample_id, names in projects_for_sample.items():
            if len(names) > 1:
                multiple_projects[sample_id] = names
        return multiple_projects

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The first data line determines which of the sample ID,
        sample name and project fields are expected: if a field is
        populated there then it must be populated in every line.
        When the sample sheet has lanes then the 'Lane' field must
        also be populated in every line.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet; this is an empty
            list if the sample sheet has no data lines.

        """
        # Convenience variables
        sheet = self._sample_sheet
        sample_id = sheet.sample_id_column
        sample_name = sheet.sample_name_column
        sample_project = sheet.sample_project_column
        # Nothing to check if there are no data lines (indexing the
        # first line directly would fail on an empty sample sheet)
        first_line = next(iter(sheet.data), None)
        if first_line is None:
            return []
        # Look at first line to see which items have been provided
        has_sample_id = first_line[sample_id] != ''
        has_sample_name = (sample_name is not None) and \
                          (first_line[sample_name] != '')
        has_project = first_line[sample_project] != ''
        has_lanes = sheet.has_lanes
        # Look for data lines which are missing an expected item
        invalid_lines = []
        for line in sheet.data:
            if (has_lanes and line['Lane'] == '') or \
               (has_sample_id and line[sample_id] == '') or \
               (has_sample_name and line[sample_name] == '') or \
               (has_project and line[sample_project] == ''):
                invalid_lines.append(line)
        return invalid_lines

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:
        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        # Inside the method body this name resolves to the module-level
        # function of the same name, not to this method
        return has_invalid_characters(text=self._sample_sheet.show())
Example #4
0
class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_barcodes: check for lines with invalid barcode
      sequences
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        # Source precedence: fp, then sample_sheet, then sample_sheet_file
        if self._fp is not None:
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self, sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for name in self.project_names:
            project = self.get_project(name)
            for idx in project.sample_ids:
                yield (project, project.get_sample(idx))

    def close_project_names(self):
        """
        Return closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
            associated barcode; the values for each key are lists of
            the associated barcodes.

        """
        multiple_barcodes = {}
        for _project, sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                # Store a copy so callers can't alter the sample's own data
                multiple_barcodes[sample.sample_id] = \
                    list(sample.barcode_seqs)
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the names of the projects that each sample ID is seen in
        projects_for_sample = {}
        for project, sample in self.walk():
            projects_for_sample.setdefault(sample.sample_id, []) \
                               .append(project.name)
        # Only keep samples seen in more than one project
        multiple_projects = {}
        for sample_id, names in projects_for_sample.items():
            if len(names) > 1:
                multiple_projects[sample_id] = names
        return multiple_projects

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The first data line determines which of the sample ID,
        sample name and project fields are expected: if a field is
        populated there then it must be populated in every line.
        When the sample sheet has lanes then the 'Lane' field must
        also be populated in every line.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet; this is an empty
            list if the sample sheet has no data lines.

        """
        # Convenience variables
        sheet = self._sample_sheet
        sample_id = sheet.sample_id_column
        sample_name = sheet.sample_name_column
        sample_project = sheet.sample_project_column
        # Nothing to check if there are no data lines (indexing the
        # first line directly would fail on an empty sample sheet)
        first_line = next(iter(sheet.data), None)
        if first_line is None:
            return []
        # Look at first line to see which items have been provided
        has_sample_id = first_line[sample_id] != ''
        has_sample_name = (sample_name is not None) and \
                          (first_line[sample_name] != '')
        has_project = first_line[sample_project] != ''
        has_lanes = sheet.has_lanes
        # Look for data lines which are missing an expected item
        invalid_lines = []
        for line in sheet.data:
            if (has_lanes and line['Lane'] == '') or \
               (has_sample_id and line[sample_id] == '') or \
               (has_sample_name and line[sample_name] == '') or \
               (has_project and line[sample_project] == ''):
                invalid_lines.append(line)
        return invalid_lines

    def has_invalid_barcodes(self):
        """
        Return list of lines with invalid barcodes

        Only the 'index' and 'index2' columns are checked, and only
        if they are present in the sample sheet header. Each line is
        reported at most once, even if both of its barcode sequences
        are invalid.

        Returns:
          List: list of lines which contain invalid barcode
            sequences in the sample sheet.
        """
        header = self._sample_sheet.data.header()
        indices = [indx for indx in ('index', 'index2') if indx in header]
        if not indices:
            # No barcode columns so there is nothing to check
            return []
        invalid_lines = []
        for line in self._sample_sheet.data:
            # any() stops at the first bad barcode so that a line isn't
            # added again for a second invalid sequence
            if any(not barcode_is_valid(line[indx]) for indx in indices):
                invalid_lines.append(line)
        return invalid_lines

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:
        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        # Inside the method body this name resolves to the module-level
        # function of the same name, not to this method
        return has_invalid_characters(text=self._sample_sheet.show())