Beispiel #1
0
def create_analysis_dir(project,
                        top_dir=None,
                        merge_replicates=False,
                        keep_names=False,
                        dry_run=False):
    """Create and populate analysis directory for an IlluminaProject

    Creates a new directory and populates either with links to FASTQ
    files, or with 'merged' FASTQ files created by concatenating
    multiple FASTQs for each sample (which can happen for multiplexed
    runs where samples are split across multiple lanes).

    The directory is named after 'project.full_name' and is created
    under 'top_dir'; an empty 'ScriptCode' subdirectory is also made.
    Directories which already exist are reported and left alone. A
    link whose target name already exists is reported via
    logging.error and skipped; processing then carries on with the
    remaining files.

    Arguments:
      project   : populated IlluminaProject object (the 'full_name',
                  'name' and 'samples' attributes are used; each
                  sample must supply 'name', 'dirn' and 'fastq')
      top_dir   : parent directory to create analysis subdirectory
                  under. Defaults to cwd if not explicitly specified
      merge_replicates: if True then creates a single FASTQ file for
                  each sample by merging multiple FASTQs together
      keep_names: if True then links to FASTQ files will have the same
                  names as the original files; by default links use the
                  shortest unique name
      dry_run   : if True then report what would be done but don't
                  actually perform any action (no directories, links
                  or merged files are created)

    Returns:
      Name of the project directory.

    """
    # Honour the documented default: os.path.join() can't handle None
    if top_dir is None:
        top_dir = os.getcwd()
    project_dir = os.path.join(top_dir,project.full_name)
    print("Creating analysis directory for project '%s'..." %
          project.full_name)
    # Check for & create directory
    if os.path.exists(project_dir):
        print("-> %s already exists" % project_dir)
    else:
        print("Making analysis directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(project_dir,mode=0o775)
    # Make an empty ScriptCode directory
    scriptcode_dir = os.path.join(project_dir,"ScriptCode")
    if os.path.exists(scriptcode_dir):
        print("'ScriptCode' directory %s already exists" % scriptcode_dir)
    else:
        print("Making 'ScriptCode' directory for %s" % project.name)
        if not dry_run:
            bcf_utils.mkdir(scriptcode_dir,mode=0o775)
    if not merge_replicates:
        # Check for & create links to fastq files
        for sample in project.samples:
            fastq_names = IlluminaData.get_unique_fastq_names(sample.fastq)
            for fastq in sample.fastq:
                fastq_file = os.path.join(sample.dirn,fastq)
                # Link name is either the original file name or the
                # shortened unique name
                if keep_names:
                    link_name = fastq
                else:
                    link_name = fastq_names[fastq]
                fastq_ln = os.path.join(project_dir,link_name)
                if os.path.exists(fastq_ln):
                    # Never clobber something that is already there
                    logging.error("Failed to link to %s: %s already exists",
                                  fastq_file,os.path.basename(fastq_ln))
                    continue
                print("Linking to %s" % fastq)
                if not dry_run:
                    bcf_utils.mklink(fastq_file,fastq_ln,relative=True)
    else:
        # Merge files for replicates within each sample
        for sample in project.samples:
            # Gather replicates to be merged: files sharing sample name,
            # barcode and read number are grouped under a single name
            replicates = {}
            for fastq in sample.fastq:
                fastq_data = IlluminaData.IlluminaFastq(fastq)
                name = "%s_%s_R%d" % (fastq_data.sample_name,
                                      fastq_data.barcode_sequence,
                                      fastq_data.read_number)
                replicates.setdefault(name,[]).append(
                    os.path.join(sample.dirn,fastq))
            # Sort each group into order (once, after gathering is done)
            for name in replicates:
                replicates[name].sort()
            # Report detected replicates
            print("Sample %s" % sample.name)
            for name in replicates:
                print("\tReplicate '%s'" % name)
                for fastq in replicates[name]:
                    print("\t\t%s" % fastq)
            # Do the merge (skipped for a dry run, which only reports)
            if dry_run:
                continue
            for name in replicates:
                merged_fastq = os.path.join(project_dir,name+'.fastq')
                bcf_utils.concatenate_fastq_files(merged_fastq,
                                                  replicates[name])
    # Return directory name
    return project_dir
Beispiel #2
0
                               "if required)")
 p.add_option_group(deprecated_options)
 # Process command line
 options,args = p.parse_args()
 if len(args) != 1:
     p.error("input is a single SampleSheet.csv file")
 if options.miseq:
     logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets "
                     "are now converted automatically")
 # Get input sample sheet file
 samplesheet = args[0]
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the data as CSV
 data = IlluminaData.get_casava_sample_sheet(samplesheet)
 # Remove lanes
 if options.lanes is not None:
     lanes = parse_lane_expression(options.lanes)
     print "Keeping lanes %s, removing the rest" % ','.join([str(x) for x in lanes])
     new_data = IlluminaData.CasavaSampleSheet()
     for line in data:
         if line['Lane'] in lanes:
             print "Keeping %s" % line
             new_data.append(tabdata="%s" % line)
     data = new_data
 # Update the SampleID and SampleProject fields
 for sample_id in options.sample_id:
     lanes,name = parse_name_expression(sample_id)
     for line in data:
         if line['Lane'] in lanes:
Beispiel #3
0
                 "'NY_ChIP-seq'. Use multiple --expt=... to set the types for different "
                 "projects")
    p.add_option("--keep-names",action="store_true",dest="keep_names",default=False,
                 help="preserve the full names of the source fastq files when creating links")
    p.add_option("--merge-replicates",action="store_true",dest="merge_replicates",default=False,
                 help="create merged fastq files for each set of replicates detected")
    # Parse command line
    options,args = p.parse_args()

    # Get data directory name
    if len(args) != 1:
        p.error("expected one argument (location of Illumina analysis dir)")
    illumina_analysis_dir = os.path.abspath(args[0])

    # Populate Illumina data object
    illumina_data = IlluminaData.IlluminaData(illumina_analysis_dir,
                                              unaligned_dir=options.unaligned_dir)

    # Assign experiment types
    for expt in options.expt_type:
        name,type_ = expt.split(':')
        illumina_data.get_project(name).expt_type = type_

    # Create and populate per-project directory structure
    for project in illumina_data.projects:
        create_analysis_dir(project,
                            top_dir=illumina_analysis_dir,
                            merge_replicates=options.merge_replicates,
                            keep_names=options.keep_names,
                            dry_run=options.dry_run)

                               "if required)")
 p.add_option_group(deprecated_options)
 # Process command line
 options,args = p.parse_args()
 if len(args) != 1:
     p.error("input is a single SampleSheet.csv file")
 if options.miseq:
     logging.warning("--miseq option no longer necessary; MiSEQ-style sample sheets "
                     "are now converted automatically")
 # Get input sample sheet file
 samplesheet = args[0]
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the data as CSV
 data = IlluminaData.get_casava_sample_sheet(samplesheet)
 # Update the SampleID and SampleProject fields
 for sample_id in options.sample_id:
     lanes,name = parse_name_expression(sample_id)
     for line in data:
         if line['Lane'] in lanes:
             print "Setting SampleID for lane %d: '%s'" % (line['Lane'],name)
             line['SampleID'] = name
 # Update the SampleProject field
 for sample_project in options.sample_project:
     lanes,name = parse_name_expression(sample_project)
     for line in data:
         if line['Lane'] in lanes:
             print "Setting SampleProject for lane %d: '%s'" % (line['Lane'],name)
             line['SampleProject'] = name
 # Fix spaces
 p.add_option('--ignore-warnings',action="store_true",dest="ignore_warnings",default=False,
              help="ignore warnings about spaces and duplicated sampleID/sampleProject "
              "combinations when writing new samplesheet.csv file")
 # Process command line
 options,args = p.parse_args()
 if len(args) != 1:
     p.error("input is a single SampleSheet.csv file")
 # Get input sample sheet file
 samplesheet = args[0]
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the data as CSV
 if options.miseq:
     # Input sample sheet is from MiSEQ
     data = IlluminaData.convert_miseq_samplesheet_to_casava(samplesheet)
 else:
     # Standard CASAVA sample sheet
     data = IlluminaData.CasavaSampleSheet(samplesheet)
 # Update the SampleID and SampleProject fields
 for sample_id in options.sample_id:
     lanes,name = parse_name_expression(sample_id)
     for line in data:
         if line['Lane'] in lanes:
             print "Setting SampleID for lane %d: '%s'" % (line['Lane'],name)
             line['SampleID'] = name
 # Update the SampleProject field
 for sample_project in options.sample_project:
     lanes,name = parse_name_expression(sample_project)
     for line in data:
         if line['Lane'] in lanes:
Beispiel #6
0
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode assigned to the same lane as the
    input file, plus another for 'unbinned' reads. Output file names
    are relative, so the files are written to the current working
    directory. Each read goes to the file for the first barcode it
    matches, or to the 'unbinned' file if it matches none.

    If no barcodes are assigned to the input file's lane then nothing
    is written. If any output file already exists then the program
    exits with status 1 (via sys.exit) rather than overwriting it.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (the original notes
        say this can be gzipped; presumably handled by
        FASTQFile.FastqIterator - confirm)
      barcodes: list of dictionaries, one per barcode, each with the
        keys 'name' (used in the output file name), 'index' (barcode
        sequence), 'lane' (lane number) and 'matcher' (object whose
        'match(sequence, nmismatches)' method tests a read's index
        sequence)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Output file handles keyed by barcode index (plus 'unbinned')
    output_files = {}
    # Every handle opened here, so that all of them can be closed again
    # whatever happens (including the 'unbinned' file, and on early exit)
    opened = []
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'], barcode['index'], info.lane_number,
                info.read_number, info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            fp = open(output_file_name, 'w')
            opened.append(fp)
            output_files[barcode['index']] = fp
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number, info.read_number, info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        unbinned = open(unbinned_file_name, 'w')
        opened.append(unbinned)
        output_files['unbinned'] = unbinned
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            this_barcode = read.seqid.index_sequence
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    # First matching barcode wins
                    output_files[barcode['index']].write(str(read) + '\n')
                    break
            else:
                # Put in unbinned if no match
                output_files['unbinned'].write(str(read) + '\n')
    finally:
        # Close files
        for fp in opened:
            fp.close()
    # NOTE(review): the figure reported here is the total number of reads
    # processed (matched plus unbinned), not only those that matched
    print("\tMatched %d reads for %s" % (nreads, os.path.basename(fastq_file)))
0
    # Set up barcode data
    barcodes = []
    for barcode_info in options.barcode_info:
        name, barcode, lane = barcode_info.split(':')
        print "Assigning barcode '%s' in lane %s to %s" % (barcode, lane, name)
        barcodes.append({
            'name': name,
            'index': barcode,
            'matcher': BarcodeMatcher(barcode),
            'lane': int(lane)
        })

    # Read from sample sheet (if supplied)
    if options.sample_sheet is not None:
        print "Reading data from sample sheet %s" % options.sample_sheet
        sample_sheet = IlluminaData.CasavaSampleSheet(options.sample_sheet)
        for line in sample_sheet:
            name = line['SampleID']
            barcode = line['Index'].rstrip('N').rstrip('-').rstrip('N')
            lane = line['Lane']
            print "Assigning barcode '%s' in lane %s to %s" % (barcode, lane,
                                                               name)
            barcodes.append({
                'name': name,
                'index': barcode,
                'matcher': BarcodeMatcher(barcode),
                'lane': int(lane)
            })
    if len(barcodes) < 1:
        p.error("need at least one --barcode and/or --samplesheet assignment")
                else:
                    n_fastqs = len(sample.fastq)
                    if n_fastqs == 1:
                        print "\t%s" % sample.name
                    else:
                        print "\t%s (%d fastqs)" % (sample.name,n_fastqs)
                # Print fastq names
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary: