Example #1
0
def get_bases_mask(run_info_xml, sample_sheet_file):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads). Then updates this using the barcode
    information in the sample sheet file.

    The barcode used for the update is the 'Index' value from the
    first line of the sample sheet. Both the initial and the updated
    masks are printed to stdout.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # Single-argument print(...) produces the same output whether it
    # is parsed as a Python 2 statement or a Python 3 function call
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    # Update bases mask from sample sheet, using the first line's
    # index sequence as the representative barcode
    example_barcode = IlluminaData.get_casava_sample_sheet(
        sample_sheet_file)[0]['Index']
    bases_mask = IlluminaData.fix_bases_mask(bases_mask, example_barcode)
    print("Bases mask: %s (updated for barcode sequence '%s')" % (
        bases_mask, example_barcode))
    return bases_mask
def get_bases_mask(run_info_xml,sample_sheet_file):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads). Then updates this using the barcode
    information in the sample sheet file.

    The barcode used for the update is the 'Index' value from the
    first line of the sample sheet. Both the initial and the updated
    masks are printed to stdout.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # print(...) with one argument behaves identically under Python 2
    # and Python 3, unlike the bare print statement
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    # Update bases mask from sample sheet (first line's index sequence)
    sample_sheet = IlluminaData.get_casava_sample_sheet(sample_sheet_file)
    example_barcode = sample_sheet[0]['Index']
    bases_mask = IlluminaData.fix_bases_mask(bases_mask,example_barcode)
    print("Bases mask: %s (updated for barcode sequence '%s')" % (bases_mask,
                                                                  example_barcode))
    return bases_mask
Example #3
0
def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads), and optionally updates this using the
    barcode information in the sample sheet file.

    When a sample sheet is supplied, the index sequence of its first
    data line is used as the example barcode (a missing index is
    treated as an empty string). The mask is left unchanged if
    'barcode_is_10xgenomics' reports the barcode as a 10xGenomics
    sample set ID; otherwise it is passed through
    'IlluminaData.fix_bases_mask'.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string e.g. 'y101,I6'.
    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    # Single-argument print(...) is valid, and prints the same text,
    # on both Python 2 and Python 3
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    if sample_sheet_file is not None:
        # Update bases mask from sample sheet
        example_barcode = IlluminaData.samplesheet_index_sequence(
            IlluminaData.SampleSheet(sample_sheet_file).data[0])
        if example_barcode is None:
            example_barcode = ""
        if barcode_is_10xgenomics(example_barcode):
            print("Bases mask: barcode is 10xGenomics sample set ID")
        else:
            bases_mask = IlluminaData.fix_bases_mask(bases_mask,
                                                     example_barcode)
        # Reported in both cases, i.e. also when the mask was kept as-is
        print("Bases mask: %s (updated for barcode sequence '%s')" %
              (bases_mask, example_barcode))
    return bases_mask
Example #4
0
def verify_fastq_generation(ap,
                            unaligned_dir=None,
                            lanes=None,
                            include_sample_dir=False):
    """Check that generated Fastqs match sample sheet predictions

    A temporary copy of the sample sheet (restricted to 'lanes', if
    given) is written to the autoprocessor's temporary directory and
    the bcl2fastq outputs are verified against it.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to do Fastqs verification on
      unaligned_dir (str): explicitly specify the bcl2fastq output
        directory to check (defaults to 'ap.params.unaligned_dir')
      lanes (list): specify a list of lane numbers (integers) to
        check (others will be ignored)
      include_sample_dir (bool): if True then include a
        'sample_name' directory level when checking for
        bcl2fastq2 outputs, even if one shouldn't be present

    Returns:
      True if outputs match sample sheet, False otherwise (including
      when the output directory is missing or cannot be loaded).

    Raises:
      Exception: if no bcl2fastq output directory is supplied and
        none is set in the autoprocessor parameters.
    """
    if unaligned_dir is None:
        if ap.params.unaligned_dir is not None:
            unaligned_dir = ap.params.unaligned_dir
        else:
            raise Exception("Bcl2fastq output directory not defined")
    # Single-argument print(...) works unchanged on Python 2 and 3
    print("Checking bcl2fastq output directory '%s'" % unaligned_dir)
    bcl_to_fastq_dir = os.path.join(ap.analysis_dir, unaligned_dir)
    if not os.path.isdir(bcl_to_fastq_dir):
        # Directory doesn't exist
        return False
    # Make a temporary sample sheet to verify against; the timestamp
    # in the name keeps successive verification runs apart
    tmp_sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet.verify.%s.csv" % time.strftime("%Y%m%d%H%M%S"))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             tmp_sample_sheet,
                             lanes=lanes)
    # Try to create an IlluminaData object
    try:
        illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                  unaligned_dir=unaligned_dir)
    except IlluminaData.IlluminaDataError as ex:
        # Failed to initialise
        logger.warning("Failed to get information from %s: %s" %
                       (bcl_to_fastq_dir, ex))
        return False
    # Do check
    return IlluminaData.verify_run_against_sample_sheet(
        illumina_data, tmp_sample_sheet, include_sample_dir=include_sample_dir)
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """
    Gather the Fastq files belonging to one lane

    Samples from every project are examined first, followed by the
    samples from the 'undetermined' set (if there is one).

    Arguments:
      dirn (str): directory to load the Fastq data from
      lane (int): only Fastqs with this lane number are kept
      unaligned_dir (str): subdirectory of 'dirn' holding the
        bcl2fastq outputs

    Returns:
      List: for single ended data, the R1 Fastq paths in discovery
        order; for pair ended data, 'R1,R2' strings built from the
        sorted R1 and sorted R2 lists.

    Raises:
      Exception: if the data cannot be loaded from 'dirn'.
    """
    try:
        run_data = IlluminaData.IlluminaData(dirn,
                                             unaligned_dir=unaligned_dir)
    except Exception as ex:
        raise Exception("Unable to read fastqs from %s: %s\n" % (dirn, ex))
    is_paired = run_data.paired_end
    # Project samples first, then any undetermined samples
    samples = [smpl for prj in run_data.projects for smpl in prj.samples]
    if run_data.undetermined:
        samples.extend(run_data.undetermined.samples)
    # Collect matching Fastqs keyed by read number
    by_read = {1: [], 2: []}
    for smpl in samples:
        for read in (1, 2):
            for fq in smpl.fastq_subset(read_number=read, full_path=True):
                if IlluminaData.IlluminaFastq(fq).lane_number == lane:
                    by_read[read].append(fq)
    if not is_paired:
        return by_read[1]
    # Pair up R1 and R2 after sorting each list independently
    return ["%s,%s" % pair
            for pair in zip(sorted(by_read[1]), sorted(by_read[2]))]
Example #6
0
def report_info(ap):
    """Generate a general report

    Generates an unstructured report on the contents
    of the analysis directory: general run details, a summary of
    the projects in the unaligned (bcl2fastq output) directory when
    one is set, and details of each analysis project.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    unaligned_dir = ap.params.unaligned_dir
    report = []
    report.append("Run reference: %s" % ap.run_reference_id)
    report.append("Directory    : %s" % ap.analysis_dir)
    report.append("Platform     : %s" % ap.metadata.platform)
    report.append("Unaligned dir: %s" % unaligned_dir)
    if ap.readme_file:
        report.append("README.txt found: %s" % ap.readme_file)
    # Only try to load the data when an unaligned dir is set. The
    # previous test also evaluated os.path.exists(None) when it was
    # unset, which raises TypeError instead of reaching the 'else'.
    # Problems loading from a set-but-unusable directory are still
    # reported via the IlluminaDataError handler below.
    if unaligned_dir is not None:
        try:
            illumina_data = ap.load_illumina_data()
            report.append("\nSummary of data in '%s' dir:\n" %
                          unaligned_dir)
            for project in illumina_data.projects:
                report.append("- %s" % IlluminaData.describe_project(project))
        except IlluminaData.IlluminaDataError as ex:
            report.append("Failed to load data from %s:" %
                          unaligned_dir)
            report.append("%s" % ex)
    else:
        report.append("No information on source fastq data (no unaligned dir "
                      "found)")
    try:
        projects = ap.get_analysis_projects()
        # Singular only for exactly one project
        report.append("\n%d analysis project%s:" %
                      (len(projects), "" if len(projects) == 1 else "s"))
    except Exception:
        # Best effort: report that nothing could be found and carry on
        projects = []
        report.append("\nNo analysis projects found")
    for project in projects:
        info = project.info
        report.append("\n- %s" % project.name)
        report.append("  %s" % ('-' * len(project.name), ))
        report.append("  User    : %s" % info.user)
        report.append("  PI      : %s" % info.PI)
        report.append("  Library : %s" % info.library_type)
        report.append("  SC Plat.: %s" % info.single_cell_platform)
        report.append("  Organism: %s" % info.organism)
        report.append("  Dir     : %s" % os.path.basename(project.dirn))
        report.append("  #samples: %s" % len(project.samples))
        report.append("  #cells  : %s" % default_value(info.number_of_cells))
        report.append("  Samples : %s" % project.prettyPrintSamples())
        report.append("  QC      : %s" %
                      ('ok' if verify_qc(project) else 'not verified'))
        report.append("  Comments: %s" % (project.info.comments))
    return '\n'.join(report)
def get_fastqs_from_dir(dirn, lane, unaligned_dir=None):
    """Automatically collect Fastq files for specified lane

    Arguments:
      dirn (str): directory to load the Fastq data from
      lane (int): lane of interest
      unaligned_dir (str): subdirectory of 'dirn' passed through
        to 'IlluminaData.IlluminaData'

    If the data cannot be loaded then a message is written to
    stderr and the program exits with status 1.
    """
    try:
        illumina_data = IlluminaData.IlluminaData(dirn,
                                                  unaligned_dir=unaligned_dir)
    # 'except ... as' is accepted by Python 2.6+ and Python 3, whereas
    # the comma form is a syntax error on Python 3
    except Exception as ex:
        sys.stderr.write("Unable to read fastqs from %s: %s\n" % (dirn, ex))
        sys.exit(1)
Example #8
0
 def load_illumina_data(self, unaligned_dir=None):
     """
     Load and return an IlluminaData object

     Arguments:
       unaligned_dir (str): directory to load from; when not
         given, the 'unaligned_dir' stored in the parameters
         is used instead

     Returns:
       IlluminaData instance, or None (after logging an error)
       if no unaligned directory is available from either source.
     """
     target_dir = unaligned_dir
     if target_dir is None:
         # Fall back to the stored parameter
         target_dir = self.params.unaligned_dir
     if target_dir is not None:
         return IlluminaData.IlluminaData(self.analysis_dir,
                                          unaligned_dir=target_dir)
     logging.error(
         "Unaligned directory not specified, cannot load data")
     return None
Example #9
0
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Generates a multi-line report which gives general information
    about the run, plus one-line summaries for each project, plus
    any additional information that has been recorded.

    The general information includes:

    - Platform
    - Run name
    - Run reference id
    - Processing software
    - Assay (i.e. sequencing kit)

    For each project:

    - Project subdirectory
    - Researcher (aka user)
    - PI
    - Application (aka library type)
    - Single cell prep platform (e.g. ICell8)
    - Organism
    - Number of samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Default items to report
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    # Pre-set to None so these names stay defined if the run name
    # can't be split into its components
    datestamp = None
    instrument = None
    run_number = None
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    # 'except ... as' is valid on Python 2.6+ and Python 3; the comma
    # form is a syntax error on Python 3
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
Example #10
0
def run_reference_id(run_name, platform=None, facility_run_number=None):
    """Return a run reference id e.g. 'HISEQ_140701/242#22'

    The run reference code is a code that identifies the sequencing
    run, and has the general form:

    PLATFORM_DATESTAMP[/INSTRUMENT_RUN_NUMBER]#FACILITY_RUN_NUMBER

    - PLATFORM is always uppercased e.g. HISEQ, MISEQ, GA2X
    - DATESTAMP is the YYMMDD code e.g. 140701
    - INSTRUMENT_RUN_NUMBER is the run number that forms part of the
      run directory e.g. for '140701_SN0123_0045_000000000-A1BCD'
      it is '45'
    - FACILITY_RUN_NUMBER is the run number that has been assigned
      by the facility

    Note that the instrument run number is only used if it differs
    from the facility run number.

    If the platform isn't supplied then the instrument name is
    used instead, e.g.:

    'SN0123_140701/242#22'

    If the run name can't be split into components then the
    general form will be:

    [PLATFORM_]RUN_NAME[#FACILITY_RUN_NUMBER]

    depending on whether platform and/or facility run number have
    been supplied. For example for a run called 'rag_05_2017':

    'MISEQ_rag_05_2017#90'

    Arguments:
      run_name (str): the run name (can be a path)
      platform (str): the platform name (optional)
      facility_run_number (int): the run number assigned by the
        local facility (can be different from the instrument
        run number) (optional)
    """
    # Extract information from run name (strip any leading path and
    # trailing separator first)
    run_name = os.path.basename(os.path.normpath(run_name))
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
    # 'except ... as' is valid on Python 2.6+ and Python 3; the comma
    # form is a syntax error on Python 3
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        instrument = None
        # The success path binds 'datestamp', so the fallback must
        # bind the same name; the original 'date_stamp' spelling is
        # kept as well so nothing that relied on it is broken
        datestamp = None
        date_stamp = None
        run_number = None
Example #11
0
def get_sequencer_platform(dirn, instrument=None, settings=None):
    """
    Return the platform for the sequencing instrument

    Attempts to identify the platform (e.g. 'hiseq', 'miseq' etc)
    for a sequencing run.

    If 'settings' is supplied then the platform is looked up
    based on the instrument names and platforms listed in the
    'sequencers' section of the configuration. If 'instrument'
    is also supplied then this is used; otherwise the instrument
    name is extracted from the supplied directory name.

    If no match can be found then there is a final attempt to
    determine the platform from the hard-coded names in the
    'bcftbx.platforms' module.

    Arguments:
      dirn (str): path to the data or analysis directory
      instrument (str): (optional) the instrument name
      settings (Settings):  (optional) a Settings instance
        with the configuration loaded

    Returns:
      String: either the platform or None, if the platform
        cannot be determined.
    """
    # Attempt to look up the instrument name
    if instrument is None:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("Extracting instrument name from directory name")
        try:
            # Full unpacking is kept so that an unexpected result
            # shape is treated as a failure to extract the name
            datestamp, instrument, run_number, \
                flow_cell_prefix, flow_cell_id = \
                IlluminaData.split_run_name_full(dirn)
        except Exception as ex:
            logging.warning("Unable to extract instrument name: " "%s" % ex)
    if instrument and settings:
        print("Identifying platform from instrument name")
        try:
            return settings.sequencers[instrument]
        except KeyError:
            # Instrument not listed in the settings
            logging.warning("Instrument name '%s' not found in "
                            "configuration file" % instrument)
    # Fall back to old method
    print("Identifying platform from data directory name")
    platform = platforms.get_sequencer_platform(dirn)
    if platform is None:
        logging.warning("Unable to identify platform from " "directory name")
    return platform
Example #12
0
 def setup(self):
     """
     Build the 'analyse_barcodes.py' command and register outputs

     Removes any existing report, XLS and HTML outputs from the
     barcode analysis directory, assembles the command line from
     the task arguments (taking the lanes from the sample sheet
     when they aren't given explicitly), adds the command to the
     task, and sets the output file parameters.
     """
     args = self.args
     # Make output filenames
     report_file, xls_file, html_file = [
         os.path.join(args.barcode_analysis_dir, name)
         for name in ('barcodes.report', 'barcodes.xls', 'barcodes.html')
     ]
     # Remove existing copies, if found
     for old_output in (report_file, xls_file, html_file):
         if os.path.exists(old_output):
             os.remove(old_output)
     # Build command to run the barcode analysis
     cmd = PipelineCommandWrapper(
         "Run analyse_barcodes.py to report barcodes",
         'analyse_barcodes.py',
         '--report', report_file,
         '--xls', xls_file,
         '--html', html_file)
     if args.sample_sheet:
         cmd.add_args('--sample-sheet', args.sample_sheet)
     # Explicit lanes take precedence over those in the sample sheet
     lanes = args.lanes
     if not lanes:
         lanes = None
         if args.sample_sheet:
             # Implicitly get lanes from sample sheet
             try:
                 sheet = IlluminaData.SampleSheet(args.sample_sheet)
                 lanes = sorted(set(line['Lane'] for line in sheet.data))
             except KeyError:
                 # No lanes
                 lanes = None
     if lanes:
         cmd.add_args('--lanes', ','.join([str(l) for l in lanes]))
     # Remaining options are only passed on when set
     for flag, value in (('--cutoff', args.cutoff),
                         ('--mismatches', args.mismatches),
                         ('--title', args.title)):
         if value:
             cmd.add_args(flag, value)
     cmd.add_args('-c')
     cmd.add_args(*args.counts_files)
     self.add_cmd(cmd)
     # Update the output parameters
     self.output.report_file.set(report_file)
     self.output.xls_file.set(xls_file)
     self.output.html_file.set(html_file)
Example #13
0
def report_projects(ap):
    """Generate one line reports suitable for pasting into spreadsheet

    Generate one-line report for each project with tab-separated
    data items, suitable for injection into a spreadsheet.

    Each line has the following information:

    - Run id e.g. HISEQ_140328
    - Run number
    - Source
    - Date
    - User
    - PI
    - Application
    - Single Cell Platform
    - Organism
    - Platform
    - #Samples
    - #Cells
    - PE (yes/no)
    - Samples

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # Acquire data
    analysis_dir = utils.AnalysisDir(ap.analysis_dir)
    # General information
    run_name = ap.run_name
    try:
        datestamp, instrument, run_number = IlluminaData.split_run_name(
            run_name)
        run_number = run_number.lstrip('0')
    # 'except ... as' is valid on Python 2.6+ and Python 3; the comma
    # form is a syntax error on Python 3
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        # Bind every name the success path would have bound, so none
        # is left undefined; the original 'date_stamp' spelling is
        # kept too so nothing that relied on it is broken
        datestamp = ''
        date_stamp = ''
        instrument = ''
        run_number = ''
Example #14
0
 def detect_unaligned_dir(self):
     """
     Look for an existing bcl2fastq output directory

     Tries the 'bcl2fastq' and 'Unaligned' subdirectories of the
     analysis directory in turn.

     Returns:
       Name of the first subdirectory that exists and loads as
       IlluminaData, or None if neither does.
     """
     for candidate in ('bcl2fastq', 'Unaligned'):
         if not os.path.isdir(os.path.join(self.analysis_dir, candidate)):
             continue
         logging.debug(
             "Testing subdirectory '%s' to see if it has sequence data"
             % candidate)
         try:
             IlluminaData.IlluminaData(self.analysis_dir,
                                       unaligned_dir=candidate)
         except IlluminaData.IlluminaDataError:
             logging.debug("Unable to load data from %s" % candidate)
         else:
             print("Setting 'unaligned_dir' parameter to %s" % candidate)
             return candidate
     # Unable to detect existing data directory
     return None
Example #15
0
    def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
        """
        List the AnalysisProjects found in the analysis directory

        Each top-level subdirectory of the analysis directory is
        examined: those which load as CASAVA/bcl2fastq outputs are
        rejected, and the remainder are loaded as AnalysisProject
        instances and checked before being accepted.

        Unlike the `get_analysis_projects` method, no checking
        against the project metadata (typically in 'projects.info')
        is performed.

        If the 'pattern' is not None then it should be a simple
        pattern used to match against available names to select
        a subset of projects (see bcf_utils.name_matches).

        Arguments:
          pattern (str): optional pattern to select a subset
            of projects (default: select all projects)
          strict (bool): if True then a project must pass the
            'is_analysis_dir' check to be accepted; otherwise
            it only needs to have at least one sample (default)

        Returns:
          List: list of AnalysisProject instances.
        """
        logging.debug("Testing subdirectories to determine analysis projects")
        if pattern is None:
            pattern = '*'
        accepted = []
        # Try loading each subdirectory as a project
        for dirn in bcf_utils.list_dirs(self.analysis_dir):
            # Anything that loads as bcl2fastq output isn't a project
            try:
                IlluminaData.IlluminaData(self.analysis_dir,
                                          unaligned_dir=dirn)
            except IlluminaData.IlluminaDataError:
                pass
            except Exception as ex:
                logging.debug("Exception when attempting to load "
                              "subdir '%s' as CASAVA/bcl2fastq output "
                              "(ignored): %s" % (dirn, ex))
            else:
                logging.debug("* %s: rejected" % dirn)
                continue
            # Try loading as a project
            candidate = AnalysisProject(
                dirn, os.path.join(self.analysis_dir, dirn))
            if strict:
                # Apply strict checks
                if not candidate.is_analysis_dir:
                    logging.debug("* %s: rejected (failed strict checks)" %
                                  dirn)
                    continue
            elif not len(candidate.samples):
                # Basic check: are there any samples?
                logging.debug("* %s: rejected (no samples)" % dirn)
                continue
            # Passed checks
            logging.debug("* %s: analysis directory" % dirn)
            if bcf_utils.name_matches(candidate.name, pattern):
                accepted.append(candidate)
        return accepted
Example #16
0
    def update_project_metadata_file(self,
                                     unaligned_dir=None,
                                     project_metadata_file='projects.info'):
        """
        Update project metadata file from bcl2fastq outputs

        Brings the project metadata file (default: "projects.info")
        into line with a bcl-to-fastq output directory: projects
        found in the outputs are added or have their sample lists
        updated, and existing entries which aren't in the outputs
        are kept (commented out if they were already commented, or
        if there is no matching project directory).

        Any supplied 'unaligned_dir' or 'project_metadata_file'
        value is also stored in the analysis parameters.

        Arguments:
          unaligned_dir (str): path to the bcl-to-fastq
            output directory relative to the analysis dir.
            Defaults to the unaligned dir stored in the
            analysis directory parameter file.
          project_metadata_file (str): optional, path to
            the project metadata file to update
        """
        # Store supplied values and resolve the metadata file path
        if project_metadata_file is not None:
            self.params['project_metadata'] = project_metadata_file
        logging.debug("Project metadata file: %s" %
                      self.params.project_metadata)
        filen = os.path.join(self.analysis_dir, self.params.project_metadata)
        if unaligned_dir is not None:
            self.params['unaligned_dir'] = unaligned_dir
        logging.debug("Unaligned_dir: %s" % self.params.unaligned_dir)
        illumina_data = IlluminaData.IlluminaData(
            self.analysis_dir, unaligned_dir=self.params.unaligned_dir)
        # Load the existing metadata, or start from an empty file
        if not os.path.exists(filen):
            logging.debug("Creating new project metadata file: %s" % filen)
            project_metadata = ProjectMetadataFile()
        else:
            logging.debug("Loading project metadata from existing file: %s" %
                          filen)
            project_metadata = ProjectMetadataFile(filen)
        # Projects and sorted sample names from the bcl2fastq outputs
        projects = {}
        for prj in illumina_data.projects:
            projects[prj.name] = sorted([smpl.name for smpl in prj.samples])
        # Merge in entries that only exist in the metadata file
        for line in project_metadata:
            listed_name = line['Project']
            # Compare using the uncommented form of the name
            name = listed_name.lstrip('#')
            if name in projects:
                continue
            no_project_dir = not os.path.exists(
                os.path.join(self.analysis_dir, name))
            if listed_name.startswith('#') or no_project_dir:
                # Comment out project not in latest list
                # if already commented or if project directory
                # doesn't exist
                name = "#%s" % name
            projects[name] = line['Samples'].split(',')
        # Populate/update
        for name, sample_names in projects.items():
            if name in project_metadata:
                project_metadata.update_project(name,
                                                sample_names=sample_names)
            else:
                project_metadata.add_project(name, sample_names)
        # Save
        project_metadata.save(filen)
        print("Updated project metadata file '%s'" %
              self.params.project_metadata)
                else:
                    n_fastqs = len(sample.fastq)
                    if n_fastqs == 1:
                        print "\t%s" % sample.name
                    else:
                        print "\t%s (%d fastqs)" % (sample.name,n_fastqs)
                # Print fastq names
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
Example #18
0
    def update_metadata(self):
        """
        Migrate and fill in missing metadata values

        Works through the following steps, only ever setting items
        which are currently None:

        - copies 'platform', 'run_number' and 'source' across from
          the parameter file (if there is one)
        - sets 'run_name' from the run name
        - sets the instrument name, datestamp, run number and flow
          cell ID by splitting the run name
        - sets 'platform' using 'get_sequencer_platform'
        - sets 'sequencer_model' from the sequencer settings for
          the instrument
        """
        # Migrate missing values from parameter file
        if self.has_parameter_file:
            print("Migrating metadata values from parameter file")
            for param in ('platform', 'run_number', 'source'):
                if param in self.params and self.metadata[param] is None:
                    logging.debug("Importing metadata item '%s': set to "
                                  "'%s'" % (param, self.params[param]))
                    print("Importing metadata item '%s'" % param)
                    self.metadata[param] = self.params[param]
        # Run name
        if self.metadata.run_name is None:
            print("Attempting to set missing 'run_name' metadata item")
            self.metadata['run_name'] = self.run_name
        # Instrument-related metadata
        instrument_items = ('instrument_name',
                            'instrument_datestamp',
                            'instrument_run_number')
        if any(getattr(self.metadata, item) is None
               for item in instrument_items):
            print("Attempting to set missing instrument metadata items")
            # Extract from run name
            try:
                datestamp, instrument, run_number, \
                    flow_cell_prefix, flow_cell_id = \
                    IlluminaData.split_run_name_full(self.run_name)
                # Values listed in the same order as 'instrument_items'
                extracted = (instrument, datestamp, run_number)
                for item, value in zip(instrument_items, extracted):
                    if getattr(self.metadata, item) is None:
                        self.metadata[item] = value
                if self.metadata.instrument_flow_cell_id is None:
                    self.metadata['instrument_flow_cell_id'] = \
                        flow_cell_prefix + flow_cell_id
            except Exception:
                # Best effort only: leave the items unset
                logging.warning(
                    "Unable to extract missing instrument metadata "
                    "from run name")
        # Sequencing platform
        if self.metadata.platform is None:
            # Attempt to look up the instrument name
            found_platform = get_sequencer_platform(
                self.analysis_dir,
                instrument=self.metadata.instrument_name,
                settings=self.settings)
            if found_platform:
                print("Setting 'platform' metadata item to %s" %
                      found_platform)
                self.metadata['platform'] = found_platform
        # Sequencer model
        if self.metadata.sequencer_model is None:
            name = self.metadata.instrument_name
            if name:
                try:
                    model = self.settings.sequencers[name].model
                    self.metadata['sequencer_model'] = model
                    print("Setting 'sequencer_model' metadata item to "
                          "'%s'" % self.metadata.sequencer_model)
                except KeyError:
                    print("Unable to get sequencer model for "
                          "instrument '%s'" % name)
def demultiplex_fastq(fastq_file,barcodes,nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode assigned to the same lane as the
    input FASTQ, plus another for 'unbinned' reads (i.e. reads which
    didn't match any of the barcodes). All output files are written to
    the current working directory; if any of them already exists then
    the program exits with status 1.

    Each read is written to the file for the first barcode that it
    matches (in the order that the barcodes were supplied).

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode definitions to use for demultiplexing;
        each one is a dictionary-like object with the keys 'lane',
        'name', 'index' and 'matcher' (where 'matcher' is an object
        with a 'match(sequence,nmismatches)' method)
      nmismatches: maxiumum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Output file handles, keyed by barcode index sequence (plus
    # 'unbinned' for the reads that don't match anything)
    output_files = {}
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'],output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name,'w')
        # Check if there's anything to do
        if len(local_barcodes) == 0:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name,'w')
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            this_barcode = read.seqid.index_sequence
            # Default destination is 'unbinned', unless a match is found
            destination = 'unbinned'
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode,nmismatches):
                    destination = barcode['index']
                    break
            output_files[destination].write(str(read)+'\n')
    finally:
        # Close every file that was opened (including 'unbinned'),
        # even if processing stopped early
        for fp in output_files.values():
            fp.close()
    # NB the count reported here is the total number of reads that were
    # processed from the input file
    print("\tMatched %d reads for %s" % (nreads,os.path.basename(fastq_file)))
Example #20
0
def make_custom_sample_sheet(input_sample_sheet,
                             output_sample_sheet=None,
                             lanes=None,
                             fmt=None):
    """
    Make a cleaned-up copy of a sample sheet

    Loads the input sample sheet into a SampleSheet object, fills in
    any blank project names using the sample ID from the same line,
    and fixes illegal and duplicated names.

    Optionally the data can also be restricted to a subset of lanes,
    and written out to a new file (with or without a change of
    format).

    Arguments:
      input_sample_sheet (str): name and path of the original sample
        sheet file
      output_sample_sheet (str): (optional) name and path of the file
        to write the updated sample sheet to; nothing is written if
        this is `None` (the default)
      lanes (list): (optional) lane numbers to retain; lines for any
        lanes not in this list are removed. If `None` (the default)
        then all lanes are retained
      fmt (str): (optional) format to use when writing the output
        sample sheet, either 'CASAVA' or 'IEM'; `None` means use the
        format of the original file

    Returns:
      SampleSheet object with the data for the corrected sample
      sheet.

    Raises:
      Exception: if the format of the input sample sheet is neither
        'CASAVA' nor 'IEM'.

    """
    # Names of the sample and project columns for each format
    column_names = {
        'CASAVA': ('SampleID', 'SampleProject'),
        'IEM': ('Sample_ID', 'Sample_Project'),
    }
    # Load the sample sheet data
    sheet = IlluminaData.SampleSheet(input_sample_sheet)
    sheet_format = sheet.format
    if sheet_format not in column_names:
        raise Exception("Unknown sample sheet format: %s" % sheet_format)
    sample_col, project_col = column_names[sheet_format]
    # Use the sample ID as the project name where none was supplied
    for entry in sheet:
        if not entry[project_col]:
            entry[project_col] = entry[sample_col]
    # Fix other problems
    sheet.fix_illegal_names()
    sheet.fix_duplicated_names()
    # Drop lines for lanes that weren't requested
    if lanes is not None:
        logging.debug("Updating to include only specified lanes: %s" %
                      ','.join(str(lane) for lane in lanes))
        idx = 0
        while idx < len(sheet):
            entry = sheet[idx]
            if entry['Lane'] not in lanes:
                # Deleting shifts the next line into this position,
                # so don't advance the index
                del sheet[idx]
                continue
            logging.debug("Keeping %s" % entry)
            idx += 1
    # Write out new sample sheet
    if output_sample_sheet is not None:
        sheet.write(output_sample_sheet, fmt=fmt)
    return sheet
def fetch_value(ap, project, field):
    """
    Return the value of the supplied field

    Given a field name, return the value determined from
    the data in the supplied AutoProcessor and
    AnalysisProject instances.

    Values which are unset (or otherwise evaluate as false) are
    returned as an empty string. Fields which are taken from the
    project's 'info' attribute are also returned as an empty string
    if the supplied project doesn't have an 'info' attribute.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on
      project (AnalysisProject): project to report on
      field (str): field name to return value of

    Returns:
      String: value of supplied field.

    Raises:
      KeyError: if the field name is not recognised.
    """
    # Convenience variable for project info (None if not available)
    info = getattr(project, 'info', None)

    def blank_if_unset(value, convert=None):
        # Return empty string for false values, otherwise the
        # (optionally converted) value
        if not value:
            return ''
        return convert(value) if convert is not None else value

    def from_info(attr, convert=None):
        # Fetch value from the project info; previously a missing
        # 'info' was dereferenced and raised AttributeError
        if info is None:
            return ''
        return blank_if_unset(getattr(info, attr), convert)

    # Field names (including aliases) which map directly onto
    # items in the project info
    info_items = {
        'user': 'user',
        'PI': 'PI',
        'pi': 'PI',
        'application': 'library_type',
        'library_type': 'library_type',
        'single_cell_platform': 'single_cell_platform',
        'organism': 'organism',
    }
    # Generate value for supplied field name
    if field in info_items:
        return from_info(info_items[field])
    elif field in ('no_of_cells', '#cells'):
        return from_info('number_of_cells', str)
    elif field == 'datestamp':
        return IlluminaData.split_run_name(ap.run_name)[0]
    elif field == 'run_id':
        return ap.run_reference_id
    elif field == 'run_number':
        return blank_if_unset(ap.metadata.run_number, str)
    elif field in ('source', 'data_source'):
        return blank_if_unset(ap.metadata.source)
    elif field in ('analysis_dir', 'path'):
        return ap.params.analysis_dir
    elif field in ('project', 'project_name'):
        return project.name
    elif field in ('sequencer_platform', 'platform'):
        return blank_if_unset(ap.metadata.platform,
                              lambda value: str(value).upper())
    elif field == 'sequencer_model':
        return blank_if_unset(ap.metadata.sequencer_model)
    elif field in ('no_of_samples', '#samples'):
        return str(len(project.samples))
    elif field == 'paired_end':
        return ('yes' if ap.paired_end else 'no')
    elif field in ('sample_names', 'samples'):
        return project.prettyPrintSamples()
    elif field in ('null', ''):
        return ''
    raise KeyError("'%s': unrecognised field for reporting" % field)
def report_summary(ap):
    """Generate summary report suitable for bioinformaticians

    Builds a multi-line text report consisting of a title, a block
    of general information about the run, a one-line summary for
    each project, and finally any comments recorded against the
    projects.

    The general information block reports:

    - Run name
    - Run reference id
    - Platform
    - Sequencer model
    - Analysis directory
    - Endedness (paired or single end)
    - Bcl2fastq package and version
    - Cellranger and/or Cellranger-ATAC package and version (only
      if present in the processing software metadata)

    The summary line for each project reports:

    - Project name
    - Researcher (aka user)
    - Organism
    - Application (aka library type), with the single cell
      platform appended if one is set
    - Number of samples (and number of cells, if set)
    - PI

    If the analysis directory has no projects then the projects
    from the unaligned directory are listed instead (if they can
    be loaded).

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be reported on

    Returns:
      String with the report text.
    """
    # General information items, in the order they are reported
    report_items = [
        'Run name',
        'Reference',
        'Platform',
        'Sequencer',
        'Directory',
        'Endedness',
        'Bcl2fastq',
    ]
    # Items reporting a software package, mapped to the name of
    # the package in the processing software metadata
    software_items = {
        'Bcl2fastq': 'bcl2fastq',
        'Cellranger': 'cellranger',
        'Cellranger-Atac': 'cellranger-atac',
    }
    # Gather information
    analysis_dir = analysis.AnalysisDir(ap.analysis_dir)
    run_name = ap.run_name
    datestamp = instrument = run_number = None
    try:
        datestamp, instrument, run_number = \
            IlluminaData.split_run_name(run_name)
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
    platform = ('unknown' if ap.metadata.platform is None
                else ap.metadata.platform.upper())
    # Run number from the metadata takes precedence over the
    # one extracted from the run name
    if ap.metadata.run_number is not None:
        run_number = ap.metadata.run_number
    # Processing software information
    try:
        processing_software = ast.literal_eval(
            ap.metadata.processing_software)
    except ValueError:
        processing_software = dict()
    if not processing_software:
        # Fallback to legacy metadata items
        legacy_items = (('bcl2fastq', 'bcl2fastq_software'),
                        ('cellranger', 'cellranger_software'))
        for pkg, legacy_item in legacy_items:
            try:
                processing_software[pkg] = ast.literal_eval(
                    getattr(ap.metadata, legacy_item))
            except ValueError:
                pass
    report_items.extend(pkg.title()
                        for pkg in ('cellranger', 'cellranger-atac')
                        if pkg in processing_software)

    def item_value(item):
        # Return the value to report for a general information item
        if item in software_items:
            pkg = software_items[item]
            if pkg not in processing_software:
                return 'Unknown'
            return "%s %s" % (processing_software[pkg][1],
                              processing_software[pkg][2])
        if item == 'Run name':
            return run_name
        if item == 'Reference':
            return ap.run_reference_id
        if item == 'Platform':
            return platform
        if item == 'Sequencer':
            return ap.metadata.sequencer_model
        if item == 'Directory':
            return ap.params.analysis_dir
        if item == 'Endedness':
            if analysis_dir.paired_end:
                return 'Paired end'
            return 'Single end'
        raise Exception("Unknown reporting item '%s'" % item)

    # Report header (title underlined with '=')
    if datestamp and instrument and run_number:
        title = "%s run #%s datestamped %s" % (platform, run_number, datestamp)
    else:
        title = "%s" % os.path.basename(ap.analysis_dir)
    report = ["%s\n%s" % (title, '=' * len(title))]
    # General information, with the item names padded so that
    # the values line up
    field_width = max(len(name) for name in report_items)
    for item in report_items:
        report.append("%s: %s" % (item.ljust(field_width),
                                  item_value(item)))
    report.append("")
    # Projects
    rows = []
    comments = bcf_utils.OrderedDictionary()
    n_projects = analysis_dir.n_projects
    if n_projects != 0:
        report.append("%d project%s:" %
                      (n_projects, '' if n_projects == 1 else 's'))
        for project in analysis_dir.projects:
            # Collect the project metadata, substituting a
            # placeholder for values recorded as '.' or '?'
            details = {'project': project.name}
            for key in ('user', 'PI', 'library_type',
                        'single_cell_platform', 'number_of_cells',
                        'organism'):
                value = project.info[key]
                if value in ('.', '?'):
                    value = '<unspecified %s>' % key.lower()
                details[key] = value
            library = details['library_type']
            if details['single_cell_platform'] is not None:
                library += " (%s)" % details['single_cell_platform']
            n_samples = len(project.samples)
            samples = "%d sample%s" % (n_samples,
                                       '' if n_samples == 1 else 's')
            if details['number_of_cells'] is not None:
                n_cells = int(details['number_of_cells'])
                samples += "/%d cell%s" % (n_cells,
                                           '' if n_cells == 1 else 's')
            rows.append(("- '%s':" % details['project'],
                         details['user'],
                         details['organism'],
                         library,
                         samples,
                         "(PI %s)" % details['PI']))
            project_comments = project.info.comments
            if project_comments:
                comments[project.name] = project_comments
        report.append(utils.pretty_print_rows(rows))
    else:
        # No projects - try loading data from unaligned dir
        try:
            illumina_data = ap.load_illumina_data()
            report.append("No projects found; '%s' directory contains "
                          "the following data:\n" % ap.params.unaligned_dir)
            for project in illumina_data.projects:
                n_samples = len(project.samples)
                rows.append(("- '%s':" % project.name,
                             "%s sample%s" % (n_samples,
                                              '' if n_samples == 1 else 's')))
            report.append(utils.pretty_print_rows(rows))
        except IlluminaData.IlluminaDataError:
            report.append("No projects found")
    # Additional comments/notes
    if comments:
        width = max(len(name) for name in comments)
        report.append("")
        report.append("Additional notes/comments:")
        for name in comments:
            # First line of each comment is prefixed with the project
            # name; continuation lines are indented to line up with it
            wrapped = bcf_utils.split_into_lines(comments[name], 70 - width)
            for lineno, text in enumerate(wrapped):
                if lineno == 0:
                    padding = ' ' * (width - len(name))
                    report.append("- %s%s: %s" % (name, padding, text))
                else:
                    report.append("  %s  %s" % (' ' * width, text))
    return '\n'.join(report)
Example #23
0
 p.add_option_group(deprecated_options)
 # Process command line
 options, args = p.parse_args()
 if len(args) != 1:
     p.error("input is a single SampleSheet.csv file")
 if options.miseq:
     logging.warning(
         "--miseq option no longer necessary; MiSEQ-style sample sheets "
         "are now converted automatically")
 # Get input sample sheet file
 samplesheet = args[0]
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the data as CSV
 data = IlluminaData.get_casava_sample_sheet(samplesheet)
 # Remove lanes
 if options.lanes is not None:
     lanes = parse_lane_expression(options.lanes)
     print "Keeping lanes %s, removing the rest" % ','.join(
         [str(x) for x in lanes])
     new_data = IlluminaData.CasavaSampleSheet()
     for line in data:
         if line['Lane'] in lanes:
             print "Keeping %s" % line
             new_data.append(tabdata="%s" % line)
     data = new_data
 # Update the SampleID and SampleProject fields
 for sample_id in options.sample_id:
     lanes, name = parse_name_expression(sample_id)
     for line in data:
Example #24
0
def check_barcode_collisions(sample_sheet_file, nmismatches):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across
    all samples, if no lane information is present) and find
    any which differ in fewer bases than a threshold number
    which is calculated as:

    less than 2 times the number of mismatches plus 1

    (as is stated in the output from bcl2fastq v2.)

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    For dual-indexed samples the two index sequences are joined
    together before being compared. Sequences are compared
    position-by-position, over the length of the shorter sequence
    in each pair.

    Samples without any index sequence are recorded with a barcode
    of `None`. For the comparison this is treated as an empty
    sequence, so within a lane such a sample is always reported as
    colliding with every other sample.

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow

    Returns:
      List: list of pairs of colliding barcodes (with each pair
        wrapped in a tuple), or an empty list if no collisions
        were detected.

    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    has_lanes = sample_sheet.has_lanes
    # Lists of index sequences (barcodes) keyed by lane
    barcodes = {}
    for line in sample_sheet:
        # Lane (everything goes in lane 1 if there are no lanes)
        lane = line['Lane'] if has_lanes else 1
        # Index sequence
        try:
            # Try dual-indexed IEM4 format
            indx = "%s%s" % (line['index'].strip(), line['index2'].strip())
        except KeyError:
            # Try single indexed IEM4 (no index2)
            try:
                indx = line['index'].strip()
            except KeyError:
                # Try CASAVA format
                try:
                    indx = line['Index'].strip()
                except KeyError:
                    # No index columns
                    indx = ""
        # Explicitly store empty index as None
        barcodes.setdefault(lane, []).append(indx or None)
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check for collisions between each pair of barcodes in each lane
    collisions = []
    for lane in barcodes:
        lane_barcodes = barcodes[lane]
        for i, seq1 in enumerate(lane_barcodes[:-1]):
            for seq2 in lane_barcodes[i + 1:]:
                # Missing (None) barcodes are compared as empty
                # sequences; zipping None directly would raise
                # a TypeError
                ndiff = sum(1 for c1, c2 in zip(seq1 or "", seq2 or "")
                            if c1 != c2)
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
                                 "IEM sample sheet to older format)")
 p.add_argument('sample_sheet',metavar="SAMPLE_SHEET",
                help="input sample sheet file")
 # Process command line
 args = p.parse_args()
 if args.miseq:
     logging.warning("--miseq option no longer necessary; "
                     "MiSEQ-style sample sheets are now converted "
                     "automatically")
 # Get input sample sheet file
 samplesheet = args.sample_sheet
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the sample sheet
 data = IlluminaData.SampleSheet(samplesheet)
 if data.format is None:
     logging.error("Unable to determine samplesheet format")
     sys.exit(1)
 print "Sample sheet format: %s" % data.format
 # Remove lanes
 if args.lanes is not None:
     if not data.has_lanes:
         logging.error("sample sheet doesn't define any lanes")
         sys.exit(1)
     lanes = parse_lanes(args.lanes)
     print "Keeping lanes %s, removing the rest" % \
         ','.join([str(x) for x in lanes])
     i = 0
     while i < len(data):
         line = data[i]
Example #26
0
        help="check CASAVA outputs against those expected for SAMPLE_SHEET")
    p.add_option("--stats",
                 action="store_true",
                 dest="stats",
                 help="Report statistics (read counts etc) for fastq files")
    # Parse command line
    options, args = p.parse_args()

    # Get data directory name
    if len(args) != 1:
        p.error("expected one argument (location of Illumina analysis dir)")
    illumina_analysis_dir = os.path.abspath(args[0])

    # Populate Illumina data object
    try:
        illumina_data = IlluminaData.IlluminaData(
            illumina_analysis_dir, unaligned_dir=options.unaligned_dir)
    except IlluminaData.IlluminaDataError, ex:
        logging.error("Failed to collect data: %s", ex)
        sys.exit(1)

    # Check there's at least one thing to do
    if not (options.report or options.summary or options.list
            or options.sample_sheet or options.merge_fastqs):
        options.report = True

    # List option
    if options.list:
        for project in illumina_data.projects:
            n_samples = len(project.samples)
            print "Project: %s (%d sample%s)" % (project.name, n_samples,
                                                 's' if n_samples != 1 else '')
                 "'NY_ChIP-seq'. Use multiple --expt=... to set the types for different "
                 "projects")
    p.add_option("--keep-names",action="store_true",dest="keep_names",default=False,
                 help="preserve the full names of the source fastq files when creating links")
    p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False,
                 help="create merged fastq files for each set of replicates detected")
    # Parse command line
    options,args = p.parse_args()

    # Get data directory name
    if len(args) != 1:
        p.error("expected one argument (location of Illumina analysis dir)")
    illumina_analysis_dir = os.path.abspath(args[0])

    # Populate Illumina data object
    illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir,
                                              unaligned_dir=options.unaligned_dir)

    # Assign experiment types
    for expt in options.expt_type:
        name,type_ = expt.split(':')
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        create_analysis_dir(project,
                            top_dir=illumina_analysis_dir,
                            merge_replicates=options.merge_replicates,
                            keep_names=options.keep_names,
                            dry_run=options.dry_run)

def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    The directory is named using the 'full_name' attribute of the
    project. An empty 'ScriptCode' subdirectory is also created
    inside it.

    When merging, the FASTQs for a sample are grouped according to
    sample name, barcode sequence and read number; the files in each
    group are concatenated in sorted order.

    Arguments:
      project   : populated IlluminaProject object
      top_dir   : parent directory to create analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                  each sample by merging multiple FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use the
                  shortest unique name
      dry_run   : if True then report what would be done but don't
                  actually perform any action (no directories, links
                  or merged files are created)

    Returns:
      Name of the project directory.
    
    """
    # Fall back to the current directory if no parent was given
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir,project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir,mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir,"ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir,mode=0o775)
    # Check for & create links to fastq files
    if not merge_replicates:
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn,fastq)
                if keep_names:
                    fastq_ln = os.path.join(project_dir,fastq)
                else:
                    fastq_ln = os.path.join(project_dir,fastq_names[fastq])
                if os.path.exists(fastq_ln):
                    logging.error("Failed to link to %s: %s already exists" %
                                  (fastq_file,os.path.basename(fastq_ln)))
                else:
                    print("Linking to %s" % fastq)
                    if not dry_run:
                        bcf_utils.mklink(fastq_file,fastq_ln,relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            replicates = {}
            # Gather replicates to be merged
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name,[]).append(
                    os.path.join(sample.dirn,fastq))
            # Sort into order
            for name in replicates:
                replicates[name].sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for a dry run)
            if not dry_run:
                for name in replicates:
                    merged_fastq = os.path.join(project_dir,name+'.fastq')
                    bcf_utils.concatenate_fastq_files(merged_fastq,
                                                      replicates[name])
    # Return directory name
    return project_dir
Example #29
0
def main():
    """
    Command line driver for the bcl to fastq conversion

    Parses the command line (ILLUMINA_RUN_DIR, OUTPUT_DIR and an
    optional SAMPLE_SHEET, plus options), locates the bcl2fastq
    software and determines its version, reports the settings that
    will be used, and then invokes the conversion function matching
    the detected version ('run_bcl2fastq_1_8', 'run_bcl2fastq_2_17'
    or 'run_bcl2fastq_2_20').

    Returns:
      1 if the conversion software can't be found, its version
      can't be determined or isn't supported; otherwise the status
      value returned by the conversion function.

    Note that invalid command line arguments are reported via the
    option parser's 'error' method, and a missing run directory
    terminates the program directly via 'sys.exit(1)'.
    """
    p = optparse.OptionParser(
        usage="%prog [OPTIONS] ILLUMINA_RUN_DIR OUTPUT_DIR [ SAMPLE_SHEET ]",
        version="%prog "+__version__,
        description="Wrapper to automate the Illumina bcl to fastq "
        "conversion process. It will either run the CASAVA/bcl2fastq v1.8 "
        "configureBclToFastq.pl/make pipeline or bcl2fastq v2 directly, "
        "depending on which software package is detected. ILLUMINA_RUN_DIR "
        "is the top-level directory of the Illumina run to be processed; "
        "output will be written to OUTPUT_DIR. Optionally a SAMPLE_SHEET "
        "file can also be specified, otherwise the SampleSheet.csv file in "
        "the BaseCalls directory will be used (if present).")
    # Options common to both bcl2fastq/bcl2fastq v2
    p.add_option('--nmismatches',action="store",dest="nmismatches",
                 default=None,
                 help="set number of mismatches to allow; recommended "
                 "values are 0 for samples without multiplexing, 1 for "
                 "multiplexed samples with tags of length 6 or longer "
                 "(CASAVA/bcl2fastq v1.8 --mismatches option, bcl2fastq "
                 "v2 --barcode-mismatches option)")
    p.add_option('--use-bases-mask',action="store",dest="bases_mask",
                 default=None,
                 help="specify a bases-mask string to tell CASAVA how "
                 "to use each cycle (the supplied value is passed "
                 "to the --use-bases-mask option)")
    p.add_option('--nprocessors',action="store",dest="nprocessors",
                 default=None,
                 help="set the number of processors to use (defaults to "
                 "1; for CASAVA/bcl2fastq v1.8 this is passed to the "
                 "-j option of the 'make' step after running "
                 "configureBcltoFastq.pl, for bcl2fastq v2 this is "
                 "the maximum number of CPUs that should be used by "
                 "the -r, -d, -p and -w options)")
    p.add_option('--ignore-missing-bcl',action="store_true",
                 dest="ignore_missing_bcl",default=False,
                 help="interpret missing bcl files as no call "
                 "(CASAVA/bcl2fastq v1.8 --ignore-missing-bcl option, "
                 "bcl2fastq v2 --ignore-missing-bcls option)")
    p.add_option('--bcl2fastq_path',action="store",
                 dest="bcl2fastq_path",default=None,
                 help="explicitly specify the path to the CASAVA or "
                 "bcl2fastq software to use.")
    # CASAVA/bcl2fastq 1.8.* only
    casava = optparse.OptionGroup(p,'CASAVA/bcl2fastq v1.8 only')
    casava.add_option('--ignore-missing-stats',action="store_true",
                      dest="ignore_missing_stats",default=False,
                      help="fill in with zeroes when *.stats files are missing "
                      "(see the CASAVA user guide for details of how "
                      "--ignore-missing-stats works)")
    casava.add_option('--ignore-missing-control',action="store_true",
                      dest="ignore_missing_control",default=False,
                      help="interpret missing control files as not-set control "
                      "bits (see the CASAVA user guide for details of how "
                      "--ignore-missing-control works)")
    p.add_option_group(casava)
    # bcl2fastq 2 only
    bcl2fastq2 = optparse.OptionGroup(p,'bcl2fastq v2 only')
    bcl2fastq2.add_option('--no-lane-splitting',action="store_true",
                          dest="no_lane_splitting",default=False,
                          help="Don't split output FASTQ files by lane")
    # The group must be registered with the parser, otherwise its
    # options are accepted but never listed in the --help output
    p.add_option_group(bcl2fastq2)
    # Adapter trimming (bcl2fastq 2 only)
    adapter_trimming = optparse.OptionGroup(p,'Adapter trimming (bcl2fastq v2 only)')
    adapter_trimming.add_option('--minimum-trimmed-read-length',action="store",
                                dest="minimum_trimmed_read_length",default=35,
                                help="Minimum read length after adapter "
                                "trimming. bcl2fastq trims the adapter from "
                                "the read down to this value; if there is more "
                                "adapter match below this length then those "
                                "bases are masked not trimmed (i.e. replaced "
                                "by N rather than removed) (default: 35)")
    adapter_trimming.add_option('--mask-short-adapter-reads',action="store",
                                dest="mask_short_adapter_reads",default=22,
                                help="minimum length of unmasked bases that "
                                "a read can be after adapter trimming; reads "
                                "with fewer ACGT bases will be completely "
                                "masked with Ns (default: 22)")
    p.add_option_group(adapter_trimming)
    # Advanced options
    advanced = optparse.OptionGroup(p,'Advanced options')
    advanced.add_option('--platform',action="store",
                        dest="platform",default=None,
                        help="Explicitly specify platform; only use this if "
                        "the platform can't be read from the instrument name")
    p.add_option_group(advanced)

    options,args = p.parse_args()
    if not (2 <= len(args) <= 3):
        p.error("input is an input directory, output directory and an "
                "optional sample sheet")
    # Acquire bcl2fastq software
    bcl2fastq = available_bcl2fastq_versions(paths=(options.bcl2fastq_path,))
    if not bcl2fastq:
        logging.error("No bcl2fastq software found")
        return 1
    else:
        # Use the first executable that was found
        bcl2fastq_exe = bcl2fastq[0]
    # Determine bcl2fastq version
    # (info is indexed as: 0 = executable path, 1 = package name,
    # 2 = version string)
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq_exe)
    if bcl2fastq_info[0] is None:
        logging.error("No bcl2fastq software found")
        return 1
    print("Using conversion software from %s" % os.path.dirname(
        bcl2fastq_info[0]))
    # Return with error code if no version detected
    bcl2fastq_package = bcl2fastq_info[1]
    bcl2fastq_version = bcl2fastq_info[2]
    if bcl2fastq_version is None:
        logging.error("Cannot determine bcl2fastq software version")
        return 1
    print("Package: %s" % bcl2fastq_package)
    print("Version: %s" % bcl2fastq_version)
    # Match the full version string against the known major.minor
    # prefixes (e.g. '2.17.1.14' matches '2.17')
    known_version = None
    for version in BCL2FASTQ_VERSIONS:
        if bcl2fastq_version.startswith("%s." % version):
            known_version = version
            break
    if known_version is None:
        # Unimplemented version
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    # Locate run directory (and strip any trailing slash)
    illumina_run_dir = os.path.abspath(args[0].rstrip(os.sep))
    if not os.path.isdir(illumina_run_dir):
        logging.error("%s: doesn't exist or is not a directory" %
                      illumina_run_dir)
        sys.exit(1)
    illumina_run = IlluminaData.IlluminaRun(illumina_run_dir,
                                            options.platform)
    # Output directory
    output_dir = os.path.abspath(args[1].rstrip(os.sep))
    # Sample sheet
    if len(args) == 3:
        sample_sheet = os.path.abspath(args[2])
    else:
        sample_sheet = illumina_run.sample_sheet_csv
    # Bases mask (for reporting): the explicitly supplied value if
    # there is one, otherwise the mask from RunInfo.xml. Note that
    # only 'options.bases_mask' (which may be None) is handed to the
    # conversion functions below
    if options.bases_mask is not None:
        bases_mask = options.bases_mask
    else:
        bases_mask = IlluminaData.IlluminaRunInfo(
            illumina_run.runinfo_xml).bases_mask
    # Report settings
    print("Illumina run directory  : %s" % illumina_run.run_dir)
    print("Basecalls directory     : %s" % illumina_run.basecalls_dir)
    print("Platform                : %s" % illumina_run.platform)
    print("Bcl file extension      : %s" % illumina_run.bcl_extension)
    print("SampleSheet.csv file    : %s" % sample_sheet)
    print("Output dir              : %s" % output_dir)
    print("Nmismatches             : %s" % options.nmismatches)
    print("Bases mask              : %s" % bases_mask)
    print("Nprocessors             : %s" % options.nprocessors)
    print("Ignore missing bcl      : %s" % options.ignore_missing_bcl)
    if known_version == '1.8':
        print("Ignore missing stats    : %s" % options.ignore_missing_stats)
        print("Ignore missing control  : %s" % options.ignore_missing_control)
    elif known_version in ('2.17','2.20',):
        print("No lane splitting       : %s" % options.no_lane_splitting)
        print("Min trimmed read length : %s" %
              options.minimum_trimmed_read_length)
        print("Mask short adapter reads: %s" %
              options.mask_short_adapter_reads)
    # Run bclToFastq conversion based on the version
    if known_version in ('1.8',):
        # 1.8.* pipeline
        status = run_bcl2fastq_1_8(
            illumina_run.basecalls_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            nprocessors=options.nprocessors,
            force=True,
            ignore_missing_bcl=options.ignore_missing_bcl,
            ignore_missing_stats=options.ignore_missing_stats,
            ignore_missing_control=options.ignore_missing_control
        )
    elif known_version in ('2.17',):
        # bcl2fastq 2.17.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4,nprocessors)
            writing_threads = min(4,nprocessors)
            # 20% of the requested processors, but always at least
            # one thread (previously the second argument to max()
            # was 'nprocessors', which made the 20% a no-op)
            demultiplexing_threads = max(int(float(nprocessors)*0.2),1)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r)       : %d" % loading_threads)
            print("Demultiplexing (-d): %d" % demultiplexing_threads)
            print("Processing (-p)    : %d" % processing_threads)
            print("Writing (-w)       : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            demultiplexing_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_17(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            demultiplexing_threads=demultiplexing_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    elif known_version in ('2.20',):
        # bcl2fastq 2.20.*
        # (no demultiplexing thread count is passed for this version)
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4,nprocessors)
            writing_threads = min(4,nprocessors)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r)       : %d" % loading_threads)
            print("Processing (-p)    : %d" % processing_threads)
            print("Writing (-w)       : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_20(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    else:
        # Version is listed in BCL2FASTQ_VERSIONS but has no
        # conversion branch above: fail cleanly rather than hitting
        # an unbound 'status' below
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    print("bclToFastq returncode: %s" % status)
    if status != 0:
        logging.error("bclToFastq failure")
    return status
 p.add_option('-N','--nprocessors',action="store",dest="cores",default=1,type='int',
              help="spread work across multiple processors/cores (default is 1)")
 options,args = p.parse_args()
 # Check arguments
 if not args and options.counts_file_in is None:
     p.error("Need to supply at least one input Fastq file, a bclToFastq output "
             "directory, or a counts file from a previous run (if using -c)")
 if options.report_file is not None:
     print "Writing report to %s" % options.report_file
     fp = open(options.report_file,'w')
 else:
     fp = sys.stdout
 # Handle input sample sheet
 if options.sample_sheet is not None:
     print "Loading sample sheet data from %s" % options.sample_sheet
     sample_sheet = IlluminaData.get_casava_sample_sheet(options.sample_sheet)
 # Process according to inputs
 if options.counts_file_in:
     # Use counts from a previously generated file
     counts_file = options.counts_file_in
     print "Loading counts from %s" % counts_file
     counts = dict()
     for line in open(counts_file,'r'):
         seq = line.split('\t')[1]
         count = int(line.split('\t')[2])
         counts[seq] = count
     report(counts,nseqs=options.n,cutoff=options.cutoff,fp=fp)
     # Match barcodes to index sequences in sample sheet
     if options.sample_sheet:
         if options.lanes is not None:
             lanes = [int(lane) for lane in options.lanes.split(',')]
                   "when creating links")
    p.add_argument("--merge-replicates",action="store_true",
                   dest="merge_replicates",default=False,
                   help="create merged fastq files for each set of "
                   "replicates detected")
    p.add_argument('illumina_data_dir',
                   help="top-level directory containing the 'Unaligned' "
                   "directory with the fastq.gz files")
    # Parse command line
    args = p.parse_args()

    # Get data directory name
    illumina_analysis_dir = os.path.abspath(args.illumina_data_dir)

    # Populate Illumina data object
    illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir,
                                              unaligned_dir=args.unaligned_dir)

    # Assign experiment types
    for expt in args.expt_type:
        name,type_ = expt.split(':')
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        create_analysis_dir(project,
                            top_dir=illumina_analysis_dir,
                            merge_replicates=args.merge_replicates,
                            keep_names=args.keep_names,
                            dry_run=args.dry_run)

Example #32
0
    def __init__(self, unaligned_dir=None):
        """
        Create a new AnalyseBarcodes pipeline instance

        Defines the pipeline parameters, loads the data from the
        bcl2fastq output directory and then wires up the tasks:
        directory setup, one barcode counting task per project
        (plus one for the undetermined reads, if present), listing
        of the resulting counts files, and the final report.

        Arguments:
          unaligned_dir (str): path to the directory
            with outputs from bcl2fastq

        Raises:
          OSError: if 'unaligned_dir' doesn't exist
        """
        # Initialise the pipeline superclass
        Pipeline.__init__(self, name="Analyse Barcodes")

        # Define parameters
        self.add_param('barcode_analysis_dir', type=str)
        self.add_param('counts_dir', type=str)
        self.add_param('title', type=str)
        self.add_param('lanes', type=list)
        self.add_param('sample_sheet', type=str)
        self.add_param('bases_mask', type=str)
        self.add_param('mismatches', type=int)
        self.add_param('cutoff', type=float)
        self.add_param('force', type=bool, value=False)

        # Load data from bcl2fastq output
        if not os.path.exists(unaligned_dir):
            raise OSError("'%s': not found" % unaligned_dir)
        # Normalise the path before splitting it, so that a trailing
        # separator (e.g. ".../Unaligned/") doesn't result in an
        # empty directory name and the wrong parent directory
        unaligned_dir = os.path.abspath(unaligned_dir)
        analysis_dir = os.path.dirname(unaligned_dir)
        unaligned_dir = os.path.basename(unaligned_dir)
        illumina_data = IlluminaData.IlluminaData(analysis_dir,
                                                  unaligned_dir=unaligned_dir)

        # Example Fastq file used for determining mismatches in
        # absence of bases mask
        # NOTE(review): 'example_fastq' isn't referenced again in this
        # method; it's kept because evaluating it presumably fails
        # early when the first project has no R1 Fastqs -- confirm
        # before removing
        example_fastq = illumina_data.projects[0].samples[0].fastq_subset(
            read_number=1, full_path=True)[0]

        ####################
        # Build the pipeline
        ####################

        # Setup barcode analysis and counts directories
        setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
            "Setup barcode analysis directory",
            self.params.barcode_analysis_dir,
            self.params.counts_dir,
            force=self.params.force)
        self.add_task(setup_barcode_analysis_dir)

        # Generate counts for Fastqs in each project
        # (each counting task requires the directory setup task)
        count_tasks = []
        for project in illumina_data.projects:
            count_barcodes = CountBarcodes("Count barcodes in '%s'" %
                                           project.name,
                                           project,
                                           self.params.counts_dir,
                                           lanes=self.params.lanes)
            self.add_task(count_barcodes,
                          requires=(setup_barcode_analysis_dir, ))
            count_tasks.append(count_barcodes)

        # Generate counts for undetermined Fastqs
        if illumina_data.undetermined is not None:
            count_barcodes = CountBarcodes("Count barcodes in 'undetermined'",
                                           illumina_data.undetermined,
                                           self.params.counts_dir,
                                           lanes=self.params.lanes,
                                           use_project_name="undetermined")
            self.add_task(count_barcodes,
                          requires=(setup_barcode_analysis_dir, ))
            count_tasks.append(count_barcodes)

        # List the counts files
        # (requires all the counting tasks)
        list_counts_files = ListBarcodeCountFiles(
            "Fetch the barcode counts files", self.params.counts_dir)
        self.add_task(list_counts_files, requires=count_tasks)

        # Analyse counts and report the results
        report_barcodes = ReportBarcodeAnalysis(
            "Report barcode analysis",
            list_counts_files.output.counts_files,
            self.params.barcode_analysis_dir,
            sample_sheet=self.params.sample_sheet,
            lanes=self.params.lanes,
            mismatches=self.params.mismatches,
            cutoff=self.params.cutoff,
            title=self.params.title)
        self.add_task(report_barcodes, requires=(list_counts_files, ))

        # Add final outputs to the pipeline
        self.add_output('report_file', report_barcodes.output.report_file)
        self.add_output('xls_file', report_barcodes.output.xls_file)
        self.add_output('html_file', report_barcodes.output.html_file)
Example #33
0
                               "if required)")
 p.add_option_group(deprecated_options)
 # Process command line
 options,args = p.parse_args()
 if len(args) != 1:
     p.error("input is a single SampleSheet.csv file")
 if options.miseq:
     logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets "
                     "are now converted automatically")
 # Get input sample sheet file
 samplesheet = args[0]
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the data as CSV
 data = IlluminaData.get_casava_sample_sheet(samplesheet)
 # Remove lanes
 if options.lanes is not None:
     lanes = parse_lane_expression(options.lanes)
     print "Keeping lanes %s, removing the rest" % ','.join([str(x) for x in lanes])
     new_data = IlluminaData.CasavaSampleSheet()
     for line in data:
         if line['Lane'] in lanes:
             print "Keeping %s" % line
             new_data.append(tabdata="%s" % line)
     data = new_data
 # Update the SampleID and SampleProject fields
 for sample_id in options.sample_id:
     lanes,name = parse_name_expression(sample_id)
     for line in data:
         if line['Lane'] in lanes: