def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """
    Split reads from paired end data

    For each Fastq file pair in 'fastq_pairs', check reads against
    the index sequences in the BarcodeMatcher 'matcher' and write
    each read pair to the output files for the assigned barcode.
    Read pairs for which 'matcher.match' returns None are written
    to the 'undetermined' R1/R2 files.

    Arguments:
      matcher (BarcodeMatcher): barcoder matcher instance
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if an R1 read has no index sequence, or if the
        index sequences of the R1 and R2 reads in a pair differ.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One R1/R2 pair of output files per barcode, plus a pair for
    # reads that can't be assigned to any barcode
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # Filter reads
    # NB 'nread' is a running total across all the input pairs
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        # NOTE: zip stops at the end of the shorter input, so surplus
        # reads in the longer Fastq of a pair are not processed
        for read1, read2 in zip(FASTQFile.FastqIterator(fq_r1),
                                FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fq_r1, nread))
            if seq != read2.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
Example #2
0
def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Loops over all the reads in the FASTQ and prints to stdout:

    - the total number of reads
    - the number of reads for each read length
    - the number of reads for each index sequence (reads where
      the index sequence is None are not counted here)

    Arguments:
      fastq_file: FASTQ file, passed to FASTQFile.FastqIterator

    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Tag name distribution
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = \
                index_sequences.get(index_seq, 0) + 1
    # Finished
    # NB single-argument print(...) calls give the same output under
    # both Python 2 and Python 3
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in read_lengths:
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in index_sequences:
        print("\t%s: %d" % (seq, index_sequences[seq]))
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file. Reads for which 'matcher.match' returns None
    are written to the 'undetermined' file.

    Logs an error and exits the program with status 1 if a read
    without an index sequence is encountered.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, passed to OutputFiles as the
        base directory for the output Fastqs

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output file per barcode, plus one for unassigned reads
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads
    # NB 'nread' is a running total across all the input Fastqs
    nread = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
def count_barcodes_for_file(fastq):
    """Count the index sequences across a single Fastq file

    Arguments:
      fastq: Fastq file to read barcodes from

    Returns:
      'counts' dictionary where counts[SEQ] holds the number of
      times index sequence SEQ occurs.

    Raises:
      ValueError: if a read has no index sequence.

    """
    counts = dict()
    print("Reading in data from %s" % fastq)
    for read in FASTQFile.FastqIterator(fastq):
        seq = read.seqid.index_sequence
        if not seq:
            raise ValueError("No index sequence for read! %s" % read)
        # Start from zero for a novel sequence, otherwise increment
        # the existing count
        counts[seq] = counts.get(seq, 0) + 1
    # Return the counts dictionary
    return counts
def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """Split reads from paired end data

    For each fastq file pair in 'fastq_pairs', check reads against
    the index sequences in the BarcodeMatcher 'matcher' and write to
    an appropriate file. Read pairs for which 'matcher.match'
    returns None are written to the 'undetermined' R1/R2 files.

    Logs an error and exits the program with status 1 if an R1 read
    without an index sequence is encountered.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, passed to OutputFiles as the
        base directory for the output Fastqs

    Raises:
      Exception: if the index sequences of the R1 and R2 reads
        in a pair differ.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One R1/R2 pair of output files per barcode, plus a pair for
    # reads that can't be assigned to any barcode
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # itertools.izip only exists under Python 2; under Python 3 the
    # builtin zip is already lazy, so fall back to that
    lazy_zip = getattr(itertools, 'izip', zip)
    # Filter reads
    # NB 'nread' is a running total across all the input pairs
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        for read1, read2 in lazy_zip(FASTQFile.FastqIterator(fq_r1),
                                     FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            if seq != read2.seqid.index_sequence:
                raise Exception(
                    "Index sequence mismatch between R1 and R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
Example #6
0
def edit_instrument_name(fastq_file, new_instrument_name):
    """Edit the instrument name for all records in FASTQ file

    Loop over all records in a supplied FASTQ file, update the sequence
    identifier (i.e. first line in each record) by changing the
    instrument name, and write the updated records to stdout.

    Arguments:
      fastq_file: FASTQ file, passed to FASTQFile.FastqIterator
      new_instrument_name: instrument name to set on each record;
        if this is empty or None then the records are echoed to
        stdout without being modified

    """
    # Loop over all reads in the FASTQ
    # Update the instrument name in the sequence identifier and echo to stdout
    for read in FASTQFile.FastqIterator(fastq_file):
        if new_instrument_name:
            # Modify the instrument name
            read.seqid.instrument_name = new_instrument_name
        # Echo updated read to stdout
        # (print(...) with one argument behaves the same under
        # Python 2 and Python 3)
        print(read)
Example #7
0
    def simple(fastq=None, fp=None):
        """
        Count the reads in a FASTQ file

        The counting is delegated to the FASTQFile.nreads
        function; both arguments are passed straight through.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Number of reads, as reported by FASTQFile.nreads

        """
        read_count = FASTQFile.nreads(fastq=fastq, fp=fp)
        return read_count
Example #8
0
    def load(self, fastq=None, fp=None):
        """Collect index sequence counts from FASTQ data

        Iterates over the reads from the input and updates the
        per-index-sequence tallies held in 'self._counts'.

        The input is handed to FASTQFile.FastqIterator, either as a
        file name (via the 'fastq' argument) or as a file-like object
        opened for reading (via the 'fp' argument).

        Arguments:
           fastq: name of the FASTQ file to iterate through
           fp: file-like object opened for reading

        """
        for record in FASTQFile.FastqIterator(fastq_file=fastq, fp=fp):
            index_seq = record.seqid.index_sequence
            if index_seq in self._counts:
                # Seen before: bump the existing tally
                self._counts[index_seq] += 1
            else:
                # First occurrence of this index sequence
                self._counts[index_seq] = 1
Example #9
0
    def fastqiterator(fastq=None, fp=None):
        """
        Count the reads in a FASTQ file

        Iterates over the file with the FASTQFile.FastqIterator
        class and tallies one for each read it yields.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Number of reads

        """
        return sum(1 for _ in FASTQFile.FastqIterator(fastq_file=fastq,
                                                      fp=fp))
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """
    Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file. Reads for which 'matcher.match' returns None
    are written to the 'undetermined' file.

    Arguments:
      matcher (BarcodeMatcher): barcoder matcher instance
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if a read has no index sequence.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output file per barcode, plus one for unassigned reads
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads
    # NB 'nread' is a running total across all the input Fastqs
    nread = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fastq, nread))
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
Example #11
0
    def reads_per_lane(fastq=None, fp=None):
        """
        Count the reads belonging to each lane in a FASTQ file

        Iterates over the file with the FASTQFile.FastqIterator
        class, tallying the reads separately for each flowcell
        lane found in the read headers.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Dictionary where keys are lane numbers (as integers)
            and values are number of reads in that lane.

        """
        lane_counts = {}
        for record in FASTQFile.FastqIterator(fastq_file=fastq, fp=fp):
            lane_number = int(record.seqid.flowcell_lane)
            # Lanes not seen before start from zero
            lane_counts[lane_number] = lane_counts.get(lane_number, 0) + 1
        return lane_counts
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":

    # Build the parser for the command line
    p = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog "+__version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    # Process the command line
    options,args = p.parse_args()
    # Exactly two positional arguments (the R1 and R2 Fastqs) are
    # required; p.error reports the problem and terminates
    if len(args) != 2:
        p.error("expected two arguments (R1 and R2 fastq files to compare)")
    fastq_file_r1,fastq_file_r2 = args
    # Exit status is 0 if the Fastqs form a pair, 1 otherwise
    if not FASTQFile.fastqs_are_pair(fastq_file_r1,fastq_file_r2):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
        
        
        
            
        
    
def demultiplex_fastq(fastq_file,barcodes,nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode associated with the same lane as
    the input FASTQ, plus another for 'unbinned' reads (i.e. reads
    which don't match any of the barcodes). The output files are
    written to the current directory. If none of the barcodes are
    associated with the lane then nothing is written.

    Exits the program with status 1 if any of the output files
    already exists.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcodes to use for demultiplexing; each
        one is a dictionary-like object with the keys 'name',
        'index', 'lane' and 'matcher'
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Set up output files
    output_files = {}
    # The 'finally' clause guarantees that every output file which
    # was opened (including the 'unbinned' one) gets closed, even if
    # we exit early or an error occurs while processing the reads
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'],output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name,'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name,'w')
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            # The read goes to the first barcode that it matches
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode,nmismatches):
                    output_files[barcode['index']].write(str(read)+'\n')
                    matched_read = True
                    break
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read)+'\n')
    finally:
        # Close files
        for output_file in output_files.values():
            output_file.close()
    # NB 'nreads' is the total number of reads processed (binned
    # and unbinned)
    print("\tMatched %d reads for %s" % (nreads,os.path.basename(fastq_file)))
Example #14
0
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":

    # Create command line parser
    p = argparse.ArgumentParser(
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    # The 'version' keyword of ArgumentParser is not accepted under
    # Python 3 (it raises TypeError), so register the version option
    # explicitly; -v/--version are the flags which the keyword
    # created under Python 2.7
    p.add_argument('-v','--version',action='version',
                   version="%(prog)s "+__version__)
    p.add_argument('fastq_file_r1',metavar="R1.fastq",
                   help="Fastq file with R1 reads")
    p.add_argument('fastq_file_r2',metavar="R2.fastq",
                   help="Fastq file with R2 reads to check against "
                   "R1 reads")
    # Parse command line
    args = p.parse_args()
    # Process the data
    # Exit status is 0 if the Fastqs form a pair, 1 otherwise
    if FASTQFile.fastqs_are_pair(args.fastq_file_r1,args.fastq_file_r2):
        sys.exit(0)
    else:
        logging.error("Not R1/R2 pair")
        sys.exit(1)
        
        
        
            
        
    
def batch_fastqs(fastqs, nbatches, basename="batched", out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files and then splits
    reads into smaller Fastqs using the external 'split'
    utility.

    The first Fastq in the list is used to decide whether
    the inputs are gzipped (in which case 'zcat' is used
    rather than 'cat' to concatenate them), and to get the
    read number used in the names of the output files.

    Arguments:
      fastqs (list): list of paths to one or more Fastq
        files to take reads from
      nbatches (int): number of batches to output reads
        into
      basename (str): optional basename to use for the
        output Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where
        the batched Fastqs will be written (default:
        current working directory)

    Returns:
      List of paths to the batched Fastq files. This can
      have fewer than 'nbatches' entries when the reads
      don't fill that many batches, e.g. when there are
      fewer reads than batches.

    Raises:
      ValueError: if 'nbatches' is less than 1.
      Exception: if the input Fastqs contain no reads, or
        if the batching command exits with non-zero status.
    """
    if nbatches < 1:
        raise ValueError("nbatches must be at least 1 (got %s)" % nbatches)
    if out_dir is None:
        # os.path.join can't handle None
        out_dir = os.getcwd()

    # Count the total number of reads
    print("Fetching read counts:")
    nreads = 0
    for fq in fastqs:
        n = FASTQFile.nreads(fq)
        print("%s:\t%d" % (os.path.basename(fq), n))
        nreads += n
    print("Total reads: %d" % nreads)
    if nreads == 0:
        # A batch size of zero is meaningless
        raise Exception("No reads found in input Fastqs")

    # Determine batch size
    # Integer division rounded up, so that 'nbatches' batches are
    # always enough to hold all the reads
    batch_size = (nreads + nbatches - 1) // nbatches
    print("Creating batches of %d reads" % batch_size)

    # Check if fastqs are compressed
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')

    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number

    # Build and run the batching command
    # Each read is 4 lines in the Fastq, hence the line count
    # given to 'split' is four times the batch size
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|', 'split', '-l', batch_size * 4, '-d', '-a', 3,
                       '--additional-suffix=%s' % suffix, '-',
                       os.path.join(out_dir, "%s.B" % basename))
    # The command includes a pipe, so it is put into a script
    # which is then run with bash
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)

    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)

    # Collect and return the batched Fastq names
    # 'split' only writes as many files as are needed to hold the
    # reads, which can be fewer than 'nbatches'
    nfiles = (nreads + batch_size - 1) // batch_size
    batched_fastqs = [
        os.path.join(out_dir, "%s.B%03d%s" % (basename, i, suffix))
        for i in range(nfiles)
    ]
    return batched_fastqs
Example #16
0
# Main program
if __name__ == "__main__":
    # Collect input fastq file name
    if len(sys.argv) < 2:
        print("Usage: %s fastq" % os.path.basename(sys.argv[0]))
        sys.exit()
    fastq = sys.argv[1]
    # Output file names
    fastq_out = fastq + ".paired"
    singles_header = fastq + ".single.header"
    pairs_header = fastq + ".pair.header"
    # Loop over file and collect read names
    headers = set()
    pairs = set()
    n = 1
    for read in FASTQFile.FastqIterator(fastq):
        seqid = str(read.seqid)
        if seqid in headers:
            # Part of a pair
            pairs.add(seqid)
        else:
            headers.add(seqid)
        n += 1
        if not (n % 1000000): print("%s" % n)
    # Loop again outputing only paired reads
    fp = io.open(fastq_out, 'wt')
    fp_singles = io.open(singles_header, 'wt')
    fp_pairs = io.open(pairs_header, 'wt')
    n = 1
    for read in FASTQFile.FastqIterator(fastq):
        seqid = str(read.seqid)
Example #17
0
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn, fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (
                            fastq, bcf_utils.format_file_size(fsize), nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn, fastq)
Example #18
0
# Absolute path of the parent of the directory holding the script
# being run (as given by sys.argv[0]); this is appended to the module
# search path ahead of the bcftbx import below -- presumably so that
# the package can be found when it sits alongside the script's
# directory (TODO confirm)
SHARE_DIR = os.path.abspath(
    os.path.normpath(os.path.join(os.path.dirname(sys.argv[0]), '..')))
sys.path.append(SHARE_DIR)
import bcftbx.FASTQFile as FASTQFile

#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":

    # Set up the command line parser and process the command line
    p = optparse.OptionParser(usage="%prog OPTIONS R1.fastq R2.fastq",
                              version="%prog " + __version__,
                              description="Check that read headers for "
                              "R1 and R2 fastq files are in agreement, "
                              "and that the files form an R1/2 pair.")
    options, args = p.parse_args()
    # The R1 and R2 Fastqs must be supplied as the only two
    # positional arguments; p.error reports the problem and terminates
    if len(args) != 2:
        p.error("expected two arguments (R1 and R2 fastq files to compare)")
    fastq_file_r1, fastq_file_r2 = args
    # Report the outcome via the exit status: 0 for a valid R1/R2
    # pair, 1 (plus a logged error) otherwise
    if not FASTQFile.fastqs_are_pair(fastq_file_r1, fastq_file_r2):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq: