Python SampleSheet Examples, bcftbx.IlluminaData.SampleSheet Python Examples

Example #1

0

Show file

File: samplesheet_utils.py Project: fls-bioinformatics-core/auto_process_ngs

    def __init__(self,sample_sheet=None,sample_sheet_file=None,fp=None):
        """
        Set up a SampleSheetLinter from one of three data sources

        The underlying SampleSheet is chosen in this order of
        preference: the `fp` stream, then the `sample_sheet`
        instance, then the `sample_sheet_file` path.

        Arguments:
          sample_sheet (SampleSheet): existing SampleSheet instance
            to use; taken as-is when `fp` is None
          sample_sheet_file (str): path to a sample sheet file; only
            read when both `fp` and `sample_sheet` are None
          fp (File): File-like object opened for reading; when
            supplied a new SampleSheet is built from it and any
            `sample_sheet` argument is ignored

        """
        # Remember the arguments exactly as supplied
        self._sample_sheet = sample_sheet
        self._sample_sheet_file = sample_sheet_file
        self._fp = fp
        # Work out where the sample sheet data comes from
        if self._fp is not None:
            # A stream always wins over the other sources
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            # No instance supplied, so fall back to the file path
            self._sample_sheet = SampleSheet(sample_sheet_file)
        # Initialise the SampleSheetPredictor part of the object
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

Example #2

0

Show file

    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Build a SampleSheetLinter around a SampleSheet object

        Arguments:
          sample_sheet (SampleSheet): ready-made SampleSheet
            instance; used unless `fp` is given
          sample_sheet_file (str): path to a sample sheet file,
            loaded only if neither `fp` nor `sample_sheet` is
            given
          fp (File): File-like object opened for reading; if not
            None then the SampleSheet is populated from this
            stream in preference to `sample_sheet`

        """
        # Record the inputs
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        # Decide which input provides the sample sheet
        have_stream = self._fp is not None
        if have_stream:
            self._sample_sheet = SampleSheet(fp=self._fp)
        elif self._sample_sheet is None:
            self._sample_sheet = SampleSheet(sample_sheet_file)
        # Pass the chosen sample sheet on to SampleSheetPredictor
        SampleSheetPredictor.__init__(self, sample_sheet=self._sample_sheet)

Example #3

0

Show file

def has_chromium_sc_indices(sample_sheet):
    """
    Check if a sample sheet contains Chromium SC indices

    The Chromium SC indices can be obtained from:

    https://support.10xgenomics.com/permalink/27rGqWvNYYuqkgeS66sksm

    The Chromium SC 3'v2 indices are of the form:

    SI-GA-[A-H][1-12]

    e.g. 'SI-GA-B11'

    Arguments:
      sample_sheet (str): path to the sample sheet CSV
        file to check

    Returns:
      Boolean: True if the sample sheet contains at least
        one Chromium SC index, False if not.
    """
    # Well numbers run from 1 to 12: the two-digit alternative
    # must cover 10, 11 and 12 (previously '1[1-2]' missed 10,
    # so e.g. 'SI-GA-A10' was not recognised)
    index_pattern = re.compile(r"SI-GA-[A-H](1[0-2]|[1-9])$")
    s = SampleSheet(sample_sheet)
    for line in s:
        # 'match' anchors at the start; '$' anchors the end
        if index_pattern.match(line['index']):
            return True
    return False

Example #4

0

Show file

File: analysis.py Project: fls-bioinformatics-core/auto_process_ngs

    def __init__(self, sample_sheet_file):
        """
        Build barcode/sample lookup tables from a sample sheet

        For every lane (or the single pseudo-lane `None` when
        the sample sheet has no lanes) two dictionaries are
        populated: one mapping index sequence to sample, and
        one mapping sample to index sequence.

        Arguments:
          sample_sheet_file (str): path of a SampleSheet.csv
            file

        """
        self._sample_sheet = SampleSheet(sample_sheet_file)
        # Lanes in order of first appearance
        self._lanes = []
        # Per-lane lookups: barcode -> sample and sample -> barcode
        self._sample_lookup = {}
        self._barcode_lookup = {}
        id_column = self._sample_sheet.sample_id_column
        for entry in self._sample_sheet.data:
            # Sheets without lanes are filed under the key None
            lane = entry['Lane'] if self._sample_sheet.has_lanes else None
            if lane not in self._lanes:
                # First time this lane is seen
                self._lanes.append(lane)
                self._sample_lookup[lane] = {}
                self._barcode_lookup[lane] = {}
            sample_name = entry[id_column]
            barcode = samplesheet_index_sequence(entry)
            # Missing index becomes an empty string; otherwise
            # '-' separators are replaced with '+'
            barcode = "" if barcode is None else barcode.replace('-', '+')
            self._sample_lookup[lane][barcode] = sample_name
            self._barcode_lookup[lane][sample_name] = barcode

Example #5

0

Show file

File: pipeline.py Project: fls-bioinformatics-core/auto_process_ngs

 def _check_sample_sheet_indexes(self,sample_sheet_file):
     """
     Verify that any empty index is the sole line for its lane

     The sample sheet is examined lane-by-lane (or as a whole
     when it has no lane information). A line for which
     `samplesheet_index_sequence` returns a false value is
     only accepted when it is the only data line in the
     (sub-)sheet being examined.

     Arguments:
       sample_sheet_file (str): path to the sample sheet file

     Raises:
       Exception: if an empty index appears in a (sub-)sheet
         that has more than one line.
     """
     # Break the sample sheet up into one sub-sheet per lane
     full_sheet = SampleSheet(sample_sheet_file)
     if not full_sheet.has_lanes:
         sub_sheets = [full_sheet]
     else:
         unique_lanes = list(set([line['Lane'] for line in full_sheet]))
         sub_sheets = [make_custom_sample_sheet(sample_sheet_file,
                                                lanes=(lane,))
                       for lane in unique_lanes]
     # Look for empty indexes in each sub-sheet
     for sheet in sub_sheets:
         for line in sheet:
             if samplesheet_index_sequence(line):
                 # Index present: nothing to check
                 continue
             if len(sheet.data) <= 1:
                 # Empty index is fine when it is the only line
                 continue
             if sheet.has_lanes:
                 raise Exception("Invalid sample sheet: "
                                 "empty index must be the "
                                 "only line for this lane")
             raise Exception("Invalid sample sheet: "
                             "empty index must be the "
                             "only line")

Example #6

0

Show file

File: mock.py Project: mmoisse/genomics

    def __init__(self,
                 fmt='IEM',
                 has_lanes=False,
                 dual_index=True,
                 quote_values=False,
                 pad=False):
        """
        Set up a MockSampleSheet with the requested layout

        Arguments:
          fmt (str): either 'IEM' or 'CASAVA'
          has_lanes (boolean): if True then the output sample sheet
            will include a 'Lane' field
          dual_index (boolean): if True then IEM-style sample sheet
            will have dual index fields (not relevant for CASAVA-style)
          quote_values (boolean): if True then output data values will
            be surrounded by double quotes (default is not to quote
            values)
          pad (boolean): if True then output sample sheet will have
            additional commas on each line (simulates output from
            Excel) (default is not to pad output)

        """
        # Layout options (stored before the template is requested)
        self._format = fmt
        self._has_lanes = has_lanes
        self._dual_index = dual_index
        # Output formatting options
        self.quote_values = quote_values
        self.pad = pad
        # Populate the base SampleSheet from the generated template
        template = cStringIO.StringIO(self._template())
        SampleSheet.__init__(self, fp=template)
        # Only IEM-format sheets get the extra sections below
        if self._format != 'IEM':
            return
        self.set_header(IEMFileVersion=4,
                        Date="11/23/2015",
                        Workflow="GenerateFASTQ",
                        Application="FASTQ Only",
                        Assay="TruSeq HT",
                        Description="",
                        Chemistry="Amplicon")
        self.set_reads(101, 101)
        # Same adapter sequence is used for both reads
        adapter = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
        self.set_settings(ReverseComplement=0,
                          Adapter=adapter,
                          AdapterRead2=adapter)

Example #7

0

Show file

    def __init__(self,fmt='IEM',has_lanes=False,dual_index=True,
                 quote_values=False,pad=False):
        """
        Initialise a MockSampleSheet instance

        Arguments:
          fmt (str): either 'IEM' or 'CASAVA'
          has_lanes (boolean): if True then the output sample sheet
            will include a 'Lane' field
          dual_index (boolean): if True then IEM-style sample sheet
            will have dual index fields (not relevant for CASAVA-style)
          quote_values (boolean): if True then output data values will
            be surrounded by double quotes (default is not to quote
            values)
          pad (boolean): if True then output sample sheet will have
            additional commas on each line (simulates output from
            Excel) (default is not to pad output)

        """
        # Keep hold of the requested layout
        self._format = fmt
        self._has_lanes = has_lanes
        self._dual_index = dual_index
        # Keep hold of the output formatting switches
        self.quote_values = quote_values
        self.pad = pad
        # Base class is populated from the template text
        SampleSheet.__init__(
            self,
            fp=cStringIO.StringIO(self._template()))
        # IEM sheets additionally get header, reads and settings
        is_iem = (self._format == 'IEM')
        if is_iem:
            self.set_header(
                IEMFileVersion=4,
                Date="11/23/2015",
                Workflow="GenerateFASTQ",
                Application="FASTQ Only",
                Assay="TruSeq HT",
                Description="",
                Chemistry="Amplicon")
            read_lengths = (101,101)
            self.set_reads(*read_lengths)
            self.set_settings(
                ReverseComplement=0,
                Adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA",
                AdapterRead2="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA")

Example #8

0

Show file

File: pipeline.py Project: fls-bioinformatics-core/auto_process_ngs

 def setup(self):
     """
     Assemble the 'analyse_barcodes.py' command for this task

     Removes any existing report files, builds the command
     line from the task arguments, registers the command and
     stores the report file paths in the task outputs.
     """
     # Output files all live in the barcode analysis directory
     out_dir = self.args.barcode_analysis_dir
     report_file = os.path.join(out_dir,'barcodes.report')
     xls_file = os.path.join(out_dir,'barcodes.xls')
     html_file = os.path.join(out_dir,'barcodes.html')
     # Clear out any reports left over from a previous run
     for stale in (report_file,xls_file,html_file):
         if os.path.exists(stale):
             os.remove(stale)
     # Core command with the three report destinations
     cmd = PipelineCommandWrapper(
         "Run analyse_barcodes.py to report barcodes",
         'analyse_barcodes.py',
         '--report',report_file,
         '--xls',xls_file,
         '--html',html_file)
     if self.args.sample_sheet:
         cmd.add_args('--sample-sheet',self.args.sample_sheet)
     # Lanes: explicit list if given, otherwise taken
     # implicitly from the sample sheet (if there is one)
     lanes = None
     if self.args.lanes:
         lanes = self.args.lanes
     elif self.args.sample_sheet:
         try:
             lanes = sorted(
                 set([line['Lane']
                      for line in SampleSheet(self.args.sample_sheet)]))
         except KeyError:
             # Sample sheet has no lanes
             pass
     if lanes:
         lane_list = ','.join([str(l) for l in lanes])
         cmd.add_args('--lanes',lane_list)
     # Remaining options are only passed when set
     for option in ('cutoff','mismatches','title'):
         value = getattr(self.args,option)
         if value:
             cmd.add_args('--%s' % option,value)
     # Counts files always go last
     cmd.add_args('-c')
     cmd.add_args(*self.args.counts_files)
     self.add_cmd(cmd)
     # Publish the report locations as task outputs
     self.output.report_file.set(report_file)
     self.output.xls_file.set(xls_file)
     self.output.html_file.set(html_file)

Example #9

0

Show file

    def show(self,fmt=None):
        """
        Build the sample sheet text, optionally comma-padded

        Arguments:
          fmt (str): passed through unchanged to
            `SampleSheet.show`

        Returns:
          String: the sample sheet contents; when `self.pad` is
            set, lines with fewer comma-separated fields than
            the data header have extra commas appended.

        """
        output = SampleSheet.show(self,fmt=fmt)
        if not self.pad:
            return output
        # Pad short lines relative to the data header width
        ncols = len(self.data.header())
        padded = []
        for line in output.split('\n'):
            shortfall = ncols - len(line.split(','))
            if shortfall > 0:
                # NOTE(review): appends one comma fewer than the
                # shortfall - confirm this is the intended width
                line += ','*(shortfall-1)
            padded.append(line)
        return '\n'.join(padded)

Example #10

0

Show file

File: mock.py Project: mmoisse/genomics

    def show(self, fmt=None):
        """
        Return the sample sheet as text, padding lines if requested

        Arguments:
          fmt (str): forwarded as-is to `SampleSheet.show`

        Returns:
          String: sample sheet contents. If `self.pad` is true
            then any line with fewer comma-separated fields than
            the data header gets additional trailing commas.

        """
        text = SampleSheet.show(self, fmt=fmt)
        if not self.pad:
            return text
        ncols = len(self.data.header())

        def _pad(line):
            # Leave lines that are already wide enough untouched
            nfields = len(line.split(','))
            if nfields >= ncols:
                return line
            # NOTE(review): the '- 1' means the padded line ends up
            # one field short of the header width - confirm intent
            return line + ',' * (ncols - nfields - 1)

        return '\n'.join([_pad(line) for line in text.split('\n')])

Example #11

0

Show file

def has_10x_indices(sample_sheet):
    """
    Report whether a sample sheet uses 10xGenomics-style indices

    Recognised index names are:

    - Chromium SC 3'v2: SI-GA-[A-H][1-12], e.g. 'SI-GA-B11'
      (see
      https://support.10xgenomics.com/permalink/27rGqWvNYYuqkgeS66sksm)
    - scATAC-seq (assumed form): SI-NA-[A-H][1-12],
      e.g. 'SI-NA-G9'
    - Visium (assumed form): SI-(TT|TS)-[A-H][1-12],
      e.g. 'SI-TT-B1'

    Lines which have no 'index' field are skipped.

    Arguments:
      sample_sheet (str): path to the sample sheet CSV
        file to check

    Returns:
      Boolean: True if the sample sheet contains at least
        one 10xGenomics-style index, False if not.
    """
    index_pattern = re.compile(r"SI-(GA|NA|TT|TS)-[A-H](1[0-2]|[1-9])$")
    for entry in SampleSheet(sample_sheet):
        try:
            index_name = entry['index']
        except KeyError:
            # No 'index' field on this line
            continue
        if index_pattern.match(index_name):
            return True
    return False

Example #12

0

Show file

File: utils.py Project: fls-bioinformatics-core/auto_process_ngs

def get_bases_mask_icell8(bases_mask,sample_sheet=None):
    """
    Rewrite a bases mask so R1 keeps only the inline barcode and UMI

    The first (R1) component of the mask is replaced so that
    the leading INLINE_BARCODE_LENGTH + UMI_LENGTH bases are
    kept and any remaining R1 cycles are masked out; the other
    components are left as they were.

    If a sample sheet is also supplied then the mask is
    additionally passed through `fix_bases_mask`, together
    with the index sequence from the first line of the sample
    sheet (or an empty string if there is no index).

    Arguments:
      bases_mask (str): initial bases mask string to update
      sample_sheet (str): path to optional sample sheet

    Returns:
      String: updated bases mask string
    """
    # Pull apart the mask; R1 is the first component
    components = bases_mask.split(',')
    # Cycle count is whatever follows the first character of R1
    num_cycles = int(components[0][1:])
    # Number of R1 bases to keep
    icell8_inline_length = INLINE_BARCODE_LENGTH + UMI_LENGTH
    assert num_cycles >= icell8_inline_length
    # Anything beyond the barcode + UMI is discarded
    discard_length = num_cycles - icell8_inline_length
    new_r1 = "y%d" % icell8_inline_length
    if discard_length > 0:
        new_r1 += "n%d" % discard_length
    components[0] = new_r1
    # Put the mask back together
    updated_mask = ','.join(components)
    if sample_sheet is None:
        return updated_mask
    # Adjust for the barcode found in the sample sheet
    first_line = SampleSheet(sample_sheet).data[0]
    index_seq = samplesheet_index_sequence(first_line)
    if index_seq is None:
        index_seq = ""
    return fix_bases_mask(updated_mask,index_seq)

Example #13

0

Show file

File: pipeline.py Project: fls-bioinformatics-core/auto_process_ngs

    def __init__(self,bcl2fastq_dir=None,sample_sheet=None):
        """
        Create a new AnalyseBarcodes pipeline instance

        At least one of the bcl2fastq output directory
        or sample sheet must be supplied when the
        pipeline is instantiated.

        If the bcl2fastq output directory is supplied
        on initialisation then it must exist and
        already contain output Fastq files.

        It is possible to set the pipeline up before the
        bcl2fastq outputs have been generated, as long
        as the sample sheet is supplied. The bcl2fastq
        output directory must then be supplied as an
        input when the pipeline is executed via the
        'run' method.

        The list of expected projects is taken from the
        bcl2fastq output directory if that is supplied;
        otherwise it is taken from the sample sheet (and
        the sample sheet indexes are also checked).

        Arguments:
          bcl2fastq_dir (str): path to the directory
            with outputs from bcl2fastq
          sample_sheet (str): path to the sample sheet
            file

        Raises:
          Exception: if neither argument is supplied; if
            'bcl2fastq_dir' is supplied but data can't be
            loaded from it; or if only 'sample_sheet' is
            supplied and a list of project names can't be
            obtained from it.
        """
        # Initialise the pipeline superclass
        Pipeline.__init__(self,name="Analyse Barcodes")

        # Internal parameters
        self._bcl2fastq_dir = bcl2fastq_dir
        self._sample_sheet = sample_sheet

        # Define parameters
        # (the two constructor arguments become the initial
        # values of the corresponding pipeline parameters)
        self.add_param('bcl2fastq_dir',value=self._bcl2fastq_dir,type=str)
        self.add_param('sample_sheet',value=self._sample_sheet,type=str)
        self.add_param('barcode_analysis_dir',type=str)
        self.add_param('counts_dir',type=str)
        self.add_param('title',type=str)
        self.add_param('lanes',type=list)
        self.add_param('bases_mask',type=str)
        self.add_param('mismatches',type=int)
        self.add_param('cutoff',type=float,value=0.001)
        self.add_param('force',type=bool,value=False)

        # Get a list of projects
        # (bcl2fastq output takes priority over the sample sheet)
        if self._bcl2fastq_dir is not None:
            # Load data from bcl2fastq output
            try:
                # Split the path into parent directory and
                # the name of the bcl2fastq subdirectory
                analysis_dir = os.path.abspath(
                    os.path.dirname(self._bcl2fastq_dir))
                bcl2fastq_dir = os.path.basename(self._bcl2fastq_dir)
                illumina_data = IlluminaData(analysis_dir,
                                             unaligned_dir=bcl2fastq_dir)
            except Exception as ex:
                # NOTE(review): the original exception 'ex' is
                # discarded, so the underlying cause isn't reported
                raise Exception("Unaligned dir '%s' supplied but can't "
                                "load data" % self._bcl2fastq_dir)
            # Get a list of projects
            projects = [p.name for p in illumina_data.projects]
        elif self._sample_sheet is not None:
            # Load data from sample sheet
            try:
                s = SampleSheet(self._sample_sheet)
                # List of unique project names
                # (lines with an empty project value contribute
                # the value of the sample ID column instead)
                projects = list(set(
                    [d[s.sample_project_column]
                     if d[s.sample_project_column]
                     else d[s.sample_id_column]
                     for d in s]))
            except Exception as ex:
                # NOTE(review): as above, 'ex' is not included in
                # the replacement exception
                raise Exception("Sample sheet '%s' supplied but can't "
                                "get a list of project names" %
                                self._sample_sheet)
            # Check any empty barcode sequences
            # (performed outside the 'try' so any error from the
            # check isn't rewrapped by the handler above)
            self._check_sample_sheet_indexes(self._sample_sheet)
        else:
            raise Exception("Need to supply either unaligned (bcl2fastq "
                            "output) dir or sample sheet")

        # Report the projects that counts will be generated for
        self.report("Expecting projects:")
        for p in projects:
            self.report("- %s" % p)

        ####################
        # Build the pipeline
        ####################

        # Setup barcode analysis and counts directories
        setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
            "Setup barcode analysis directory",
            self.params.barcode_analysis_dir,
            self.params.counts_dir,
            force=self.params.force)
        self.add_task(setup_barcode_analysis_dir)

        # Load the data from the unaligned/bcl2fastq output dir
        load_illumina_data = LoadIlluminaData(
            "Load Fastq data for barcode analysis",
            self.params.bcl2fastq_dir)
        self.add_task(load_illumina_data)

        # Generate counts for each project
        # (one counting task per project; each needs both the
        # directory setup and the data loading tasks)
        count_tasks = []
        for project in projects:
            count_barcodes = CountBarcodes(
                "Count barcodes in '%s'" % project,
                load_illumina_data.output.illumina_data,
                project,
                self.params.counts_dir,
                lanes=self.params.lanes)
            self.add_task(count_barcodes,
                          requires=(setup_barcode_analysis_dir,
                                    load_illumina_data))
            count_tasks.append(count_barcodes)

        # Get counts for 'undetermined'
        # (requested via the special name '__undetermined__',
        # with 'undetermined' passed as the project name to use)
        count_barcodes = CountBarcodes(
            "Count barcodes in 'undetermined'",
            load_illumina_data.output.illumina_data,
            "__undetermined__",
            self.params.counts_dir,
            lanes=self.params.lanes,
            use_project_name="undetermined")
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir,
                                load_illumina_data))
        count_tasks.append(count_barcodes)

        # List the counts files
        # (depends on all of the counting tasks)
        list_counts_files = ListBarcodeCountFiles(
            "Gather the barcode counts files",
            self.params.counts_dir)
        self.add_task(list_counts_files,
                      requires=count_tasks)

        # Analyse counts and report the results
        report_barcodes = ReportBarcodeAnalysis(
            "Report barcode analysis",
            list_counts_files.output.counts_files,
            self.params.barcode_analysis_dir,
            sample_sheet=self.params.sample_sheet,
            lanes=self.params.lanes,
            mismatches=self.params.mismatches,
            cutoff=self.params.cutoff,
            title=self.params.title
        )
        self.add_task(report_barcodes,
                      requires=(list_counts_files,))

        # Add final outputs to the pipeline
        # (these are the outputs of the reporting task)
        self.add_output('report_file',report_barcodes.output.report_file)
        self.add_output('xls_file',report_barcodes.output.xls_file)
        self.add_output('html_file',report_barcodes.output.html_file)

Example #14

0

Show file

File: stats.py Project: nandr0id/auto_process_ngs

    def report_per_lane_sample_stats(self,out_file=None,fp=None,
                                     samplesheet=None):
        """
        Report of reads per sample in each lane

        Reports the number of reads for each sample in each
        lane plus the total reads for each lane.

        Example output:

        Lane 1
        Total reads = 182851745
        - KatyDobbs/KD-K1      79888058        43.7%
        - KatyDobbs/KD-K3      97854292        53.5%
        - Undetermined_indices/lane1       5109395 2.8%
        ...

        If a sample sheet is supplied then any sample listed
        there for a lane, but missing from the statistics for
        that lane, is reported with zero reads.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
          samplesheet (str): optional sample sheet file
            to get additional data from

        Raises:
          Exception: re-raises any error encountered when
            converting the read counts to integers (bad
            values are logged first).
        """
        # Determine output stream
        if fp is not None:
            fpp = fp
        elif out_file is not None:
            fpp = open(out_file,'w')
        else:
            fpp = sys.stdout
        try:
            # Get data from samplesheet
            expected_samples = {}
            if samplesheet:
                s = SampleSheet(samplesheet)
                ncol = s.sample_id_column
                pcol = s.sample_project_column
                for data in s:
                    if s.has_lanes:
                        lanes = ['L%d' % data['Lane']]
                    else:
                        # No lane information: sample is expected
                        # in every lane
                        lanes = self.lane_names
                    sample = {
                        'Project': data[pcol],
                        'Sample': data[ncol],
                    }
                    for lane in lanes:
                        expected_samples.setdefault(lane,[]).append(sample)
            # Report
            for lane in self.lane_names:
                # Lane names are of the form 'L<number>'
                lane_number = int(lane[1:])
                # Build a real list (not a lazy 'filter' object)
                # because it is appended to and traversed more
                # than once below
                samples = [x for x in self._stats
                           if x['Read_number'] == 1
                           and not IlluminaFastq(x['Fastq']).is_index_read
                           and bool(x[lane])]
                # Additional samples from samplesheet
                if lane in expected_samples:
                    for sample in expected_samples[lane]:
                        found_sample = False
                        for smpl in samples:
                            if smpl['Sample'] == sample['Sample'] and \
                               smpl['Project'] == sample['Project']:
                                found_sample = True
                                break
                        if not found_sample:
                            # Add the expected sample with zero reads
                            # for the lane being examined
                            samples.append(
                                TabDataLine(
                                    line="%s\t%s\t0" % (sample['Project'],
                                                        sample['Sample']),
                                    column_names=('Project','Sample',lane)))
                    # Sort into order
                    samples = sorted(samples,
                                     key=lambda x: (x['Project'],
                                                    x['Sample']))
                try:
                    total_reads = sum([int(smpl[lane]) for smpl in samples])
                except Exception as ex:
                    # Identify and log the offending value(s) before
                    # passing the original error on
                    for smpl in samples:
                        try:
                            int(smpl[lane])
                        except ValueError:
                            logging.critical("Bad value for read count in "
                                             "lane %s sample %s: '%s'" %
                                             (lane,smpl['Sample'],
                                              smpl[lane]))
                    raise ex
                fpp.write("\nLane %d\n" % lane_number)
                fpp.write("Total reads = %d\n" % total_reads)
                for sample in samples:
                    sample_name = "%s/%s" % (sample['Project'],
                                             sample['Sample'])
                    nreads = float(sample[lane])
                    if total_reads > 0:
                        frac_reads = "%.1f%%" % (nreads/total_reads*100.0)
                    else:
                        frac_reads = "n/a"
                    fpp.write("- %s\t%d\t%s\n" % (sample_name,
                                                  nreads,
                                                  frac_reads))
        finally:
            # Close file (only if it was opened here), even if
            # reporting failed part-way through
            if fp is None and out_file is not None:
                fpp.close()

Example #15

0

Show file

class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_barcodes: check for lines with invalid barcode
      sequences
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self, sample_sheet=None, sample_sheet_file=None, fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        # Initialise
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        self._sample_sheet = sample_sheet
        if self._fp is None:
            # No stream: use the supplied instance, or else load
            # from the file
            if self._sample_sheet is None:
                self._sample_sheet = SampleSheet(sample_sheet_file)
        else:
            # Stream takes precedence over everything else
            self._sample_sheet = SampleSheet(fp=self._fp)
        SampleSheetPredictor.__init__(self, sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for project in [self.get_project(name) for name in self.project_names]:
            for sample in [
                    project.get_sample(idx) for idx in project.sample_ids
            ]:
                yield (project, sample)

    def close_project_names(self):
        """
        Return list of closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return list of samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
          associated barcode; the values for each key are lists of
          the associated barcodes.

        """
        # Look for samples with multiple barcodes
        multiple_barcodes = {}
        for project, sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                multiple_barcodes[sample.sample_id] = \
                    [s for s in sample.barcode_seqs]
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return list of samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the project names for every sample ID
        samples = {}
        for project, sample in self.walk():
            if sample.sample_id not in samples:
                samples[sample.sample_id] = []
            samples[sample.sample_id].append(project.name)
        # Keep only the samples seen in more than one project
        multiple_projects = {}
        for sample in samples:
            if len(samples[sample]) > 1:
                multiple_projects[sample] = samples[sample]
        return multiple_projects

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The first data line determines which of the sample ID,
        sample name and project fields are considered to have been
        provided; any line with an empty value for one of those
        fields (or an empty 'Lane', if the sample sheet has lanes)
        is reported as invalid.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet.

        """
        # Convenience variables
        sample_id = self._sample_sheet.sample_id_column
        sample_name = self._sample_sheet.sample_name_column
        sample_project = self._sample_sheet.sample_project_column
        # Look at first line to see which items have been provided
        line = self._sample_sheet.data[0]
        has_sample_id = line[sample_id] != ''
        has_sample_name = (sample_name is not None) and \
                          (line[sample_name] != '')
        has_project = line[sample_project] != ''
        # Look for invalid data lines
        invalid_lines = []
        for line in self._sample_sheet.data:
            if self._sample_sheet.has_lanes and line['Lane'] == '':
                invalid_lines.append(line)
            elif has_sample_id and line[sample_id] == '':
                invalid_lines.append(line)
            elif has_sample_name and line[sample_name] == '':
                invalid_lines.append(line)
            elif has_project and line[sample_project] == '':
                invalid_lines.append(line)
        return invalid_lines

    def has_invalid_barcodes(self):
        """
        Return list of lines with invalid barcodes

        Only the 'index' and 'index2' columns are checked, and
        only if they are present in the sample sheet header.

        Returns:
          List: list of lines which contain invalid barcode
            sequences in the sample sheet; each offending line
            appears once, even if both of its indexes are invalid.
        """
        invalid_lines = list()
        # Work out which index columns are present
        indices = list()
        for indx in ('index', 'index2'):
            if indx in self._sample_sheet.data.header():
                indices.append(indx)
        if indices:
            for line in self._sample_sheet.data:
                for indx in indices:
                    if not barcode_is_valid(line[indx]):
                        invalid_lines.append(line)
                        # Stop at the first bad index so the line
                        # isn't reported a second time (previously
                        # a no-op 'continue' allowed duplicates)
                        break
        return invalid_lines

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:
        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        return has_invalid_characters(text=self._sample_sheet.show())

Example #16

0

Show file

File: samplesheet_utils.py Project: fls-bioinformatics-core/auto_process_ngs

class SampleSheetLinter(SampleSheetPredictor):
    """
    Class for checking sample sheets for problems

    Provides the following methods for checking different aspects
    of a sample sheet:

    - close_project_names: check if sample sheet projects look similar
    - samples_with_multiple_barcodes: check for samples with multiple
      barcodes
    - samples_in_multiple_projects: check for samples assigned to
      multiple projects
    - has_invalid_lines: check for invalid sample sheet lines
    - has_invalid_characters: check if sample sheet contains invalid
      characters

    Example usage:

    Initialise linter:
    >>> linter = SampleSheetLinter(sample_sheet_file="SampleSheet.txt")

    Get closely-matching names:
    >>> linter.close_project_names()
    ...

    """
    def __init__(self,sample_sheet=None,sample_sheet_file=None,fp=None):
        """
        Create a new SampleSheetLinter instance

        Arguments:
          sample_sheet (SampleSheet): a SampleSheet instance to use
            for prediction (if None then must provide a file via
            the `sample_sheet_file` argument; if both are provided
            then `sample_sheet` takes precedence)
          sample_sheet_file (str): path to a sample sheet file, if
            `sample_sheet` argument is None
          fp (File): File-like object opened for reading; if this
            is not None then the SampleSheet object will be populated
            from this in preference to `sample_sheet`

        """
        # Initialise
        self._fp = fp
        self._sample_sheet_file = sample_sheet_file
        # Order of precedence: fp, then sample_sheet, then
        # sample_sheet_file
        if fp is not None:
            self._sample_sheet = SampleSheet(fp=fp)
        elif sample_sheet is not None:
            self._sample_sheet = sample_sheet
        else:
            self._sample_sheet = SampleSheet(sample_sheet_file)
        SampleSheetPredictor.__init__(self,
                                      sample_sheet=self._sample_sheet)

    def walk(self):
        """
        Traverse the list of projects and samples

        Generator that yields tuples consisting of
        (SampleSheetProject,SampleSheetSample) pairs

        Yields:
          Tuple: SampleSheetProject, SampleSheetSample pair

        """
        for name in self.project_names:
            project = self.get_project(name)
            for idx in project.sample_ids:
                yield (project,project.get_sample(idx))

    def close_project_names(self):
        """
        Return list of closely-matching project names

        Returns:
          Dictionary: keys are project names which have at least one
            close match; the values for each key are lists with the
            project names which are close matches.

        """
        return get_close_names(self.project_names)

    def samples_with_multiple_barcodes(self):
        """
        Return list of samples which have multiple associated barcodes

        Returns:
          Dictionary: keys are sample IDs which have more than one
          associated barcode; the values for each key are lists of
          the associated barcodes.

        """
        # Look for samples with multiple barcodes
        multiple_barcodes = {}
        for project,sample in self.walk():
            if len(sample.barcode_seqs) > 1:
                # Store a copy of the barcodes as a list
                multiple_barcodes[sample.sample_id] = \
                    list(sample.barcode_seqs)
        return multiple_barcodes

    def samples_in_multiple_projects(self):
        """
        Return list of samples which are in multiple projects

        Returns:
          Dictionary: dictionary with sample IDs which appear in
            multiple projects as keys; the associated values are
            lists with the project names.

        """
        # Collect the project names for each sample ID
        projects_by_sample = {}
        for project,sample in self.walk():
            projects_by_sample.setdefault(sample.sample_id,[]).append(
                project.name)
        # Keep only the samples which appear more than once
        return {sample_id: names
                for sample_id,names in projects_by_sample.items()
                if len(names) > 1}

    def has_invalid_lines(self):
        """
        Return list of samplesheet lines which are invalid

        The first data line determines which of the sample ID,
        sample name and project columns are expected to be
        populated; any line with a blank value in one of those
        columns (or in the 'Lane' column, if the sample sheet has
        lanes) is considered to be invalid.

        Returns:
          List: list of lines which are invalid (i.e. missing
            required data) in the sample sheet; an empty list is
            returned if the sample sheet has no data lines.

        """
        # Convenience variables
        sample_id = self._sample_sheet.sample_id_column
        sample_name = self._sample_sheet.sample_name_column
        sample_project = self._sample_sheet.sample_project_column
        data = self._sample_sheet.data
        # Look at first line to see which items have been provided
        try:
            first_line = data[0]
        except IndexError:
            # No data lines, so nothing can be invalid
            return []
        # Build list of columns which must not be blank (same order
        # as the checks are applied to each line)
        required = []
        if self._sample_sheet.has_lanes:
            required.append('Lane')
        if first_line[sample_id] != '':
            required.append(sample_id)
        if (sample_name is not None) and (first_line[sample_name] != ''):
            required.append(sample_name)
        if first_line[sample_project] != '':
            required.append(sample_project)
        # Look for invalid data lines
        return [line for line in data
                if any(line[column] == '' for column in required)]

    def has_invalid_characters(self):
        """
        Check if text file contains any 'invalid' characters

        In this context a character is 'invalid' if:
        - it is non-ASCII (decimal code > 127), or
        - it is a non-printing ASCII character (code < 32)

        Returns:
          Boolean: True if file contains at least one invalid
            character, False if all characters are valid.

        """
        # NB this calls the module-level function of the same name
        return has_invalid_characters(text=self._sample_sheet.show())

Example #17

0

Show file

 # NOTE(review): excerpt from a larger routine; 'lanes', 'counts',
 # 'opts', 'sample_sheet' and 'retval' are defined outside this
 # excerpt, and the excerpt ends part-way through the 'with' block
 reporter = Reporter()
 for lane in lanes:
     # Report for each lane
     if lane not in counts.lanes:
         # No counts for this lane: log the problem, flag it via
         # 'retval' and move on to the next lane
         logging.error("Requested analysis for lane %d but "
                       "only have counts for lanes %s" %
                       (lane, ','.join([str(l) for l in counts.lanes])))
         retval = 1
         continue
     mismatches = opts.mismatches
     # Deal with sample sheet if supplied
     if sample_sheet:
         with tempfile.NamedTemporaryFile() as fp:
             # Make a temporary sample sheet with just the
             # requested lane
             s = SampleSheet(sample_sheet)
             if s.has_lanes:
                 # NOTE(review): 'use_lanes' is not referenced again
                 # within this excerpt -- confirm it is used later
                 use_lanes = (lane, )
                 s = make_custom_sample_sheet(sample_sheet,
                                              fp.name,
                                              lanes=(lane, ))
             else:
                 # Sample sheet has no lane information, so no
                 # 'lanes' argument is passed
                 s = make_custom_sample_sheet(sample_sheet, fp.name)
             if has_chromium_sc_indices(fp.name):
                 # Skip the rest of the processing for this lane
                 logging.warning("Lane %s has 10xGenomics Chromium "
                                 "indices in sample sheet; not "
                                 "matching against samplesheet for "
                                 "this lane" % lane)
                 continue
             # If mismatches not set then determine from
             # the barcode lengths in the temporary

Example #18

0

Show file

def setup(ap,
          data_dir,
          analysis_dir=None,
          sample_sheet=None,
          unaligned_dir=None):
    """
    Set up the initial analysis directory

    This does all the initialisation of the analysis directory
    and processing parameters

    If the analysis directory doesn't already exist then it is
    built under a temporary name and only moved to its final
    location once setup has succeeded; on a fatal error the
    temporary directory is removed (best effort) before the
    exception is raised.

    Arguments:
      ap (AutoProcess): autoprocessor pointing to the analysis
        directory to create Fastqs for
      data_dir (str): source data directory
      analysis_dir (str): corresponding analysis directory
        (defaults to '<run_name>_analysis' in the current
        directory)
      sample_sheet (str): name and location of non-default
        sample sheet file; can be a local or remote file, or
        a URL (optional, will use sample sheet from the
        source data directory if present)
      unaligned_dir (str): directory with existing Fastqs
        output from CASAVA or bcl2fastq2; if specified then
        Fastqs will be taken from this directory (optional)

    Raises:
      Exception: if the data directory can't be found; if the
        sample sheet can't be acquired and no `unaligned_dir`
        was supplied; or if data can't be read from the supplied
        `unaligned_dir`.
    """
    data_dir = data_dir.rstrip(os.sep)
    if not exists(data_dir):
        raise Exception("Data directory '%s' not found" % data_dir)
    if not Location(data_dir).is_remote:
        data_dir = os.path.abspath(data_dir)
    run_name = os.path.basename(data_dir)
    if analysis_dir is None:
        analysis_dir = os.path.join(os.getcwd(), run_name) + '_analysis'
    else:
        analysis_dir = os.path.abspath(analysis_dir)
    # Temporary analysis dir is only used when creating the analysis
    # directory from scratch (stays as None if it already exists)
    tmp_analysis_dir = None
    # Create the analysis directory structure
    if not os.path.exists(analysis_dir):
        # Make a temporary analysis dir
        tmp_analysis_dir = os.path.join(
            os.path.dirname(analysis_dir),
            ".%s.%s" % (os.path.basename(analysis_dir), uuid.uuid4()))
        ap.analysis_dir = tmp_analysis_dir
        logger.debug("Creating temp directory '%s'" % ap.analysis_dir)
        # Create directory structure
        ap.create_directory(ap.analysis_dir)
        # NOTE(review): bare attribute accesses, presumably these
        # create the subdirectories as a side effect -- confirm
        ap.log_dir
        ap.script_code_dir
    else:
        # Directory already exists
        logger.warning("Analysis directory '%s' already exists" % analysis_dir)
        ap.analysis_dir = analysis_dir
        # check for parameter file
        if ap.has_parameter_file:
            ap.load_parameters()
        else:
            logger.warning("No parameter file found in %s" % ap.analysis_dir)

    def remove_tmp_analysis_dir():
        # Best-effort removal of the temporary analysis directory
        # before a fatal error is raised; does nothing if setup is
        # operating on a pre-existing analysis directory
        if tmp_analysis_dir is None:
            return
        try:
            shutil.rmtree(tmp_analysis_dir)
            ap.analysis_dir = None
        except Exception as cleanup_ex:
            # Don't let a failed clean up mask the original error
            logger.warning("Failed to remove temporary directory "
                           "'%s': %s" % (tmp_analysis_dir, cleanup_ex))

    # Run datestamp, instrument name and instrument run number
    try:
        datestamp,instrument,run_number,flow_cell_prefix,flow_cell_id = \
                                    split_run_name_full(run_name)
        run_number = run_number.lstrip('0')
        flow_cell = flow_cell_prefix + flow_cell_id
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
        datestamp = None
        instrument = None
        run_number = None
        flow_cell = None
    # Identify missing data and attempt to acquire
    # Sequencing platform
    platform = ap.metadata.platform
    if platform is None:
        platform = get_sequencer_platform(data_dir,
                                          instrument=instrument,
                                          settings=ap.settings)
    print("Platform identified as '%s'" % platform)
    # Log dir
    ap.set_log_dir(ap.get_log_subdir('setup'))
    # Attempt to acquire sample sheet
    try:
        # Custom SampleSheet.csv file
        custom_sample_sheet = ap.params.sample_sheet
        if custom_sample_sheet is not None:
            # Sample sheet already stored
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Sample sheet '%s'" % custom_sample_sheet)
        else:
            # Look for sample sheet
            print("Acquiring sample sheet...")
            if sample_sheet is None:
                targets = (
                    'Data/Intensities/BaseCalls/SampleSheet.csv',
                    'SampleSheet.csv',
                )
            else:
                targets = (sample_sheet, )
            # Try each possibility until one sticks
            for target in targets:
                target = Location(target)
                tmp_sample_sheet = os.path.join(ap.tmp_dir,
                                                os.path.basename(target.path))
                if target.is_url:
                    # Try fetching samplesheet from URL
                    print("Trying '%s'" % target.url)
                    try:
                        urlfp = urllib2.urlopen(target.url)
                        try:
                            with open(tmp_sample_sheet, 'w') as fp:
                                fp.write(urlfp.read())
                        finally:
                            # Always release the connection
                            urlfp.close()
                    except urllib2.URLError as ex:
                        # Failed to download from URL
                        raise Exception("Error fetching sample sheet data "
                                        "from '%s': %s" % (target.url, ex))
                else:
                    # Assume target samplesheet is a file on a local
                    # or remote server
                    if target.is_remote:
                        target_sample_sheet = str(target)
                    else:
                        if os.path.isabs(target.path):
                            target_sample_sheet = target.path
                        else:
                            target_sample_sheet = os.path.join(
                                data_dir, target.path)
                    print("Trying '%s'" % target_sample_sheet)
                    rsync = general_applications.rsync(target_sample_sheet,
                                                       ap.tmp_dir)
                    print("%s" % rsync)
                    status = rsync.run_subprocess(
                        log=ap.log_path('rsync.sample_sheet.log'))
                    if status != 0:
                        logger.warning("Failed to fetch sample sheet '%s'" %
                                       target_sample_sheet)
                        tmp_sample_sheet = None
                    else:
                        break
            # Bail out if no sample sheet was acquired
            if tmp_sample_sheet is None:
                raise Exception("Unable to acquire sample sheet")
            # Keep a copy of the original sample sheet
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Copying original sample sheet to %s" %
                  original_sample_sheet)
            shutil.copyfile(tmp_sample_sheet, original_sample_sheet)
            # Set the permissions for the original SampleSheet
            # (rw-rw-r--)
            os.chmod(original_sample_sheet, 0o664)
            # Process acquired sample sheet
            custom_sample_sheet = os.path.join(ap.analysis_dir,
                                               'custom_SampleSheet.csv')
            make_custom_sample_sheet(tmp_sample_sheet, custom_sample_sheet)
    except Exception as ex:
        # Failed to acquire sample sheet
        if not unaligned_dir:
            # Fatal error
            remove_tmp_analysis_dir()
            raise Exception("Failed to acquire sample sheet: %s" % ex)
        else:
            # Don't need sample sheet if Fastqs already exist
            original_sample_sheet = None
            custom_sample_sheet = None
    # Library Prep Kit/Assay data
    assay = None
    if original_sample_sheet is not None:
        # Use the first of these header items that is present
        for item in ('Assay', 'Library Prep Kit'):
            try:
                assay = SampleSheet(original_sample_sheet).header[item]
                break
            except KeyError:
                logger.warning("No element '%s' found in sample sheet" % item)
    # Bases mask
    print("Bases mask set to 'auto' (will be determined at run time)")
    bases_mask = "auto"
    # Data source metadata
    data_source = ap.settings.metadata.default_data_source
    # Generate and print predicted outputs and warnings
    if custom_sample_sheet is not None:
        sample_sheet_data = SampleSheet(custom_sample_sheet)
        print(predict_outputs(sample_sheet=sample_sheet_data))
        check_and_warn(sample_sheet=sample_sheet_data)
    # Check supplied unaligned Fastq dir
    if unaligned_dir is not None:
        try:
            illumina_data = IlluminaData(data_dir, unaligned_dir=unaligned_dir)
            unaligned_dir = illumina_data.unaligned_dir
        except IlluminaDataError:
            # Fatal error
            remove_tmp_analysis_dir()
            raise Exception("Can't get data from Fastq dir '%s'" %
                            unaligned_dir)
    else:
        # No unaligned dir supplied
        unaligned_dir = ap.params.unaligned_dir
    # Move analysis dir to final location if necessary
    if ap.analysis_dir != analysis_dir:
        logger.debug("Moving %s to final directory" % ap.analysis_dir)
        os.rename(ap.analysis_dir, analysis_dir)
        ap.analysis_dir = analysis_dir
        # Update the custom sample sheet path
        if custom_sample_sheet is not None:
            custom_sample_sheet = os.path.join(
                analysis_dir, os.path.basename(custom_sample_sheet))
        print("Created analysis directory '%s'" % ap.analysis_dir)
    # Store the parameters
    ap.params['data_dir'] = data_dir
    ap.params['analysis_dir'] = ap.analysis_dir
    ap.params['sample_sheet'] = custom_sample_sheet
    ap.params['bases_mask'] = bases_mask
    ap.params['unaligned_dir'] = unaligned_dir
    ap.params['acquired_primary_data'] = False
    # Store the metadata
    ap.metadata['run_name'] = ap.run_name
    ap.metadata['platform'] = platform
    ap.metadata['instrument_name'] = instrument
    ap.metadata['instrument_datestamp'] = datestamp
    ap.metadata['instrument_run_number'] = run_number
    ap.metadata['instrument_flow_cell_id'] = flow_cell
    ap.metadata['assay'] = assay
    ap.metadata['source'] = data_source
    # Make a 'projects.info' metadata file
    if unaligned_dir is not None:
        ap.make_project_metadata_file()
    # Set flags to allow parameters etc to be saved back
    ap._save_params = True
    ap._save_metadata = True

Example #19

0

Show file

File: make_fastqs_cmd.py Project: fls-bioinformatics-core/auto_process_ngs

def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                name=None,
                lanes=None,
                lane_subsets=None,
                icell8_well_list=None,
                nprocessors=None,
                bcl_converter=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                trim_adapters=True,
                adapter_sequence=None,
                adapter_sequence_read2=None,
                create_fastq_for_index_read=None,
                find_adapters_with_sliding_window=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                force_copy_of_primary_data=False,
                create_empty_fastqs=False,
                runner=None,
                icell8_swap_i1_and_i2=False,
                icell8_reverse_complement=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False,
                max_jobs=None,
                max_cores=None,
                batch_limit=None,
                verbose=False,
                working_dir=None):
    """
    Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics
    - analyse barcodes

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However if either of these values are set explicitly then the same
    values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      name (str): (optional) identifier for outputs that are not
        set explicitly
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      lane_subsets (list): (optional) specify a list of lane subsets
        to process separately before merging at the end; each subset
        is a dictionary which should be generated using the 'subset'
        function, and can include custom values for processing
        parameters (e.g. protocol, trimming and masking options etc)
        to override the defaults for this lane. Lanes not in a subset
        will still be processed unless excluded via the 'lanes'
        keyword
      icell8_well_list (str): well list file for ICELL8 platforms
        (required for ICELL8 processing protocols)
      nprocessors (int) : number of processors to use
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      bcl_converter (str): default BCL-to-Fastq conversion software to
        use; optionally can include a version specification (e.g.
        "bcl2fastq>2.0" or "bcl-convert=3.7.5"). Defaults to "bcl2fastq"
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      trim_adapters (boolean): if True (the default) then pass
        adapter sequence(s) to bcl2fastq to perform adapter trimming;
        otherwise remove adapter sequences
      adapter_sequence (str): if not None then specifies adapter
        sequence to use instead of any sequences already set in the
        samplesheet (nb will be ignored if 'trim_adapters' is False)
      adapter_sequence_read2 (str): if not None then specifies adapter
        sequence to use for read2 instead of any sequences already set
        in the samplesheet (nb will be ignored if 'trim_adapters' is
        False)
      create_fastq_for_index_reads (boolean): if True then also create
        Fastq files for index reads (default, don't create index read
        Fastqs)
      find_adapters_with_sliding_window (boolean): if True then use
        sliding window algorithm to identify adapter sequences for
        trimming
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      force_copy_of_primary_data (bool): if True then force primary
        data to be copied (rsync'ed) even if it's on the local system
        (default is to link to primary data unless it's on a remote
        filesystem).
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      icell8_swap_i1_and_i2 (bool): if True then swap I1 and I2 reads
        when matching to barcodes in the ICELL8 well list (ICELL8 ATAC
        data only)
      icell8_reverse_complement (str): one of 'i1', 'i2', 'both', or
        None; if set then the specified index reads will be reverse
        complemented when matching to barcodes in the ICELL8 well list
        (ICELL8 ATAC data only)
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maxiumum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
      max_jobs (int): maximum number of concurrent jobs allowed
      max_cores (int): maximum number of cores available
      batch_limit (int): if set then run commands in each task in
         batches, with the batch size set dyanmically so as not to
         exceed this limit
      working_dir (str): path to a working directory (defaults to
         temporary directory in the current directory)
      verbose (bool): if True then report additional information for
         pipeline diagnostics
    """
    # Report protocol
    print("Protocol              : %s" % protocol)
    if protocol not in PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join(PROTOCOLS)))

    # Output (unaligned) dir
    if not unaligned_dir and name:
        unaligned_dir = 'bcl2fastq_%s' % name
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir            : %s" % ap.params.unaligned_dir)

    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet   : %s" % ap.params.sample_sheet)

    # Check requested lanes are actually present
    print("Lanes                 : %s" %
          ('all' if lanes is None else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)

    # Barcode analysis
    if not barcode_analysis_dir and name:
        barcode_analysis_dir = 'barcode_analysis_%s' % name
    if barcode_analysis_dir is not None:
        ap.params['barcode_analysis_dir'] = barcode_analysis_dir
    elif ap.params.barcode_analysis_dir is None:
        ap.params['barcode_analysis_dir'] = 'barcode_analysis'
    barcode_analysis_dir = ap.params.barcode_analysis_dir
    if not os.path.isabs(barcode_analysis_dir):
        barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                            barcode_analysis_dir)

    # Statistics files
    if stats_file is None:
        if name:
            stats_file = 'statistics.%s.info' % name
        elif ap.params['stats_file'] is not None:
            stats_file = ap.params['stats_file']
        else:
            stats_file = 'statistics.info'
    if per_lane_stats_file is None:
        if name:
            per_lane_stats_file = 'per_lane_statistics.%s.info' % name
        elif ap.params['per_lane_stats_file'] is not None:
            per_lane_stats_file = ap.params['per_lane_stats_file']
        else:
            per_lane_stats_file = 'per_lane_statistics.info'

    # Log dir
    log_dir = 'make_fastqs'
    if name:
        log_dir += "_%s" % name
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))

    # Pipeline log file
    pipeline_log = os.path.join(ap.log_dir, "make_fastqs.log")

    # Bases mask
    if bases_mask is not None:
        ap.params['bases_mask'] = bases_mask
    bases_mask = ap.params.bases_mask

    # Default trimming/masking values
    if minimum_trimmed_read_length is None:
        minimum_trimmed_read_length = \
                BCL2FASTQ_DEFAULTS['minimum_trimmed_read_length']
    if mask_short_adapter_reads is None:
        mask_short_adapter_reads = \
                BCL2FASTQ_DEFAULTS['mask_short_adapter_reads']

    # Get platform
    if not platform:
        platform = ap.metadata.platform

    # Set options from supplied arguments, platform-specific settings
    # and configured defaults
    defaults = {
        'bcl_converter': bcl_converter,
        'nprocessors': nprocessors,
        'no_lane_splitting': no_lane_splitting,
        'create_empty_fastqs': create_empty_fastqs,
    }
    for item in (
            'bcl_converter',
            'nprocessors',
            'no_lane_splitting',
            'create_empty_fastqs',
    ):
        if defaults[item] is None:
            value = None
            if platform in ap.settings.platform:
                value = ap.settings.platform[platform][item]
            if value is None:
                value = ap.settings.bcl_conversion[item]
            defaults[item] = value

    # BCL converter
    if defaults['bcl_converter'] is None:
        bcl_converter = BCL2FASTQ_DEFAULTS['bcl_converter']
    else:
        bcl_converter = defaults['bcl_converter']

    # Number of processors
    nprocessors = defaults['nprocessors']

    # Split Fastqs across lanes
    no_lane_splitting = defaults['no_lane_splitting']

    # Create empty Fastqs
    create_empty_fastqs = defaults['create_empty_fastqs']

    # Set up pipeline runners
    default_runner = ap.settings.general.default_runner
    runners = {
        'rsync_runner': ap.settings.runners.rsync,
        'bcl2fastq_runner': ap.settings.runners.bcl2fastq,
        'bclconvert_runner': ap.settings.runners.bcl_convert,
        'demultiplex_icell8_atac_runner': ap.settings.runners.bcl2fastq,
        'cellranger_runner': ap.settings.runners.cellranger,
        'cellranger_atac_runner': ap.settings.runners.cellranger,
        'cellranger_arc_runner': ap.settings.runners.cellranger,
        'spaceranger_runner': ap.settings.runners.cellranger,
        'stats_runner': ap.settings.runners.stats,
    }
    if runner is not None:
        # Override configured runners
        default_runner = runner
        for r in runners:
            runner[r] = runner

    # Set up pipeline environment modules
    envmodules = {}
    for envmod in (
            'bcl2fastq',
            'bcl_convert',
            'cellranger_mkfastq',
            'cellranger_atac_mkfastq',
            'cellranger_arc_mkfastq',
            'spaceranger_mkfastq',
    ):
        try:
            envmodules[envmod] = ap.settings.modulefiles[envmod]
        except KeyError:
            try:
                envmodules[envmod] = ap.settings.modulefiles['make_fastqs']
            except KeyError:
                envmodules[envmod] = None

    # Other pipeline settings
    poll_interval = ap.settings.general.poll_interval

    # Construct and run pipeline
    make_fastqs = MakeFastqs(ap.params.data_dir,
                             ap.params.sample_sheet,
                             protocol=protocol,
                             bases_mask=bases_mask,
                             bcl_converter=bcl_converter,
                             platform=platform,
                             icell8_well_list=icell8_well_list,
                             minimum_trimmed_read_length=\
                             minimum_trimmed_read_length,
                             mask_short_adapter_reads=\
                             mask_short_adapter_reads,
                             adapter_sequence=adapter_sequence,
                             adapter_sequence_read2=\
                             adapter_sequence_read2,
                             icell8_atac_swap_i1_and_i2=\
                             icell8_swap_i1_and_i2,
                             icell8_atac_reverse_complement=\
                             icell8_reverse_complement,
                             lane_subsets=lane_subsets,
                             lanes=lanes,
                             trim_adapters=trim_adapters,
                             fastq_statistics=generate_stats,
                             analyse_barcodes=analyse_barcodes)
    status = make_fastqs.run(ap.analysis_dir,
                             name=name,
                             out_dir=ap.params.unaligned_dir,
                             barcode_analysis_dir=barcode_analysis_dir,
                             primary_data_dir=ap.params.primary_data_dir,
                             force_copy_of_primary_data=\
                             force_copy_of_primary_data,
                             no_lane_splitting=no_lane_splitting,
                             create_fastq_for_index_read=\
                             create_fastq_for_index_read,
                             find_adapters_with_sliding_window=\
                             find_adapters_with_sliding_window,
                             create_empty_fastqs=create_empty_fastqs,
                             stats_file=stats_file,
                             per_lane_stats=per_lane_stats_file,
                             nprocessors=nprocessors,
                             default_runner=default_runner,
                             cellranger_jobmode=cellranger_jobmode,
                             cellranger_mempercore=cellranger_mempercore,
                             cellranger_maxjobs=cellranger_maxjobs,
                             cellranger_jobinterval=\
                             cellranger_jobinterval,
                             cellranger_localcores=cellranger_localcores,
                             cellranger_localmem=cellranger_localmem,
                             runners=runners,
                             envmodules=envmodules,
                             log_dir=ap.log_dir,
                             log_file=pipeline_log,
                             max_jobs=max_jobs,
                             max_slots=max_cores,
                             batch_limit=batch_limit,
                             poll_interval=poll_interval,
                             working_dir=working_dir,
                             verbose=verbose)

    # Update the parameters
    ap.params['primary_data_dir'] = make_fastqs.output.primary_data_dir
    ap.params['acquired_primary_data'] = \
                                    make_fastqs.output.acquired_primary_data
    ap.params['stats_file'] = make_fastqs.output.stats_file
    ap.params['per_lane_stats_file'] = make_fastqs.output.per_lane_stats
    for param in ('stats_file', 'per_lane_stats_file'):
        filen = ap.params[param]
        if filen is not None:
            if filen.startswith(ap.analysis_dir):
                ap.params[param] = os.path.relpath(filen, ap.analysis_dir)

    # Update the metadata
    if status == 0:
        # Platform
        ap.metadata['platform'] = make_fastqs.output.platform
        # Software used for processing
        try:
            processing_software = ast.literal_eval(
                ap.metadata.processing_software)
        except ValueError:
            processing_software = dict()
        outputs = make_fastqs.output
        if outputs.bcl2fastq_info:
            processing_software['bcl2fastq'] = outputs.bcl2fastq_info
        if outputs.bclconvert_info:
            processing_software['bcl-convert'] = outputs.bclconvert_info
        if outputs.cellranger_info:
            processing_software['cellranger'] = outputs.cellranger_info
        if outputs.cellranger_atac_info:
            processing_software['cellranger-atac'] = \
                                                outputs.cellranger_atac_info
        if outputs.cellranger_arc_info:
            processing_software['cellranger-arc'] = \
                                                outputs.cellranger_arc_info
        if outputs.spaceranger_info:
            processing_software['spaceranger'] = outputs.spaceranger_info
        ap.metadata['processing_software'] = processing_software
        # Legacy metadata items
        ap.metadata['bcl2fastq_software'] = make_fastqs.output.bcl2fastq_info
        ap.metadata['cellranger_software'] = make_fastqs.output.cellranger_info

    # Make a file listing missing Fastqs
    if make_fastqs.output.missing_fastqs:
        missing_fastqs_log = os.path.join(ap.log_dir, "missing_fastqs.log")
        with open(missing_fastqs_log, 'wt') as fp:
            for fq in make_fastqs.output.missing_fastqs:
                fp.write("%s\n" % fq)
        print("Wrote list of missing Fastq files to '%s'" % missing_fastqs_log)
    # Raise exception on failure
    if status != 0:
        raise Exception("Fastq generation failed")

    # Make or update 'projects.info' metadata file
    if not ap.params.project_metadata:
        ap.make_project_metadata_file()
    else:
        ap.update_project_metadata_file()
    ap.save_data()

    # Finish
    return status