def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """
    Split reads from paired end data

    For each Fastq file pair in 'fastq_pairs', check reads against
    the index sequences in the BarcodeMatcher 'matcher' and write
    each read pair to the output files for the assigned barcode
    (or to the 'undetermined' files if the matcher returns None).

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if an R1 read has no index sequence, or if the
        index sequences of paired R1 and R2 reads differ.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One R1/R2 output pair per known barcode, plus a pair for
    # reads which can't be assigned
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # Filter reads
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        # NOTE(review): zip stops at the shorter input, so if R1 and
        # R2 hold different numbers of reads the surplus reads are
        # silently ignored - confirm whether this should be an error
        for read1, read2 in zip(FASTQFile.FastqIterator(fq_r1),
                                FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fq_r1, nread))
            if seq != read2.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
예제 #2
0
def count_barcodes_for_file(fastq):
    """Count the index sequences across a single Fastq file

    Arguments:
      fastq: Fastq file to read barcodes from

    Returns:
      'counts' dictionary where counts[SEQ] holds the number of
      times index sequence SEQ occurs.

    Raises:
      ValueError: if a read has no index sequence.

    """
    counts = dict()
    print("Reading in data from %s" % fastq)
    for read in FASTQFile.FastqIterator(fastq):
        seq = read.seqid.index_sequence
        if not seq:
            raise ValueError("No index sequence for read! %s" % read)
        # Start novel sequences at zero, then count this occurrence
        counts[seq] = counts.get(seq, 0) + 1
    # Return the counts dictionary
    return counts
예제 #3
0
def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Loops over all reads in the file and prints the total number of
    reads, the distribution of read lengths, and the number of
    occurrences of each index sequence to stdout.

    Arguments:
      fastq_file: FASTQ file to generate the stats for

    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution (reads where the index
        # sequence is None are not counted)
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = index_sequences.get(index_seq, 0) + 1
    # Finished
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in read_lengths:
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in index_sequences:
        print("\t%s: %d" % (seq, index_sequences[seq]))
예제 #4
0
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file. Reads which the matcher doesn't assign to a
    barcode are written to the 'undetermined' file.

    If a read has no index sequence then an error is logged and the
    program exits via 'sys.exit(1)'.

    Arguments:
      matcher: BarcodeMatcher instance, providing the 'sequences'
        attribute and 'match' method
      fastqs: iterable of Fastq files to split
      base_name: optional, prefix for the output Fastq file names
        (joined to the rest of the name with a '.')
      output_dir: optional, passed to OutputFiles as 'base_dir'

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output file per known barcode, plus one for reads which
    # can't be assigned
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads
    nread = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
예제 #5
0
def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """Split reads from paired end data

    For each fastq file pair in 'fastq_pairs', check reads against
    the index sequences in the BarcodeMatcher 'matcher' and write to
    an appropriate pair of files. Read pairs which the matcher
    doesn't assign to a barcode are written to the 'undetermined'
    files.

    If an R1 read has no index sequence then an error is logged and
    the program exits via 'sys.exit(1)'.

    Arguments:
      matcher: BarcodeMatcher instance, providing the 'sequences'
        attribute and 'match' method
      fastq_pairs: iterable of (R1, R2) Fastq file pairs to split
      base_name: optional, prefix for the output Fastq file names
        (joined to the rest of the name with a '.')
      output_dir: optional, passed to OutputFiles as 'base_dir'

    Raises:
      Exception: if the index sequences of paired R1 and R2 reads
        differ.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # 'itertools.izip' only exists on Python 2; on Python 3 the
    # built-in 'zip' is already lazy, so fall back to that
    iter_pairs = getattr(itertools, 'izip', zip)
    # Filter reads
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        for read1, read2 in iter_pairs(FASTQFile.FastqIterator(fq_r1),
                                       FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            if seq != read2.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
예제 #6
0
def edit_instrument_name(fastq_file, new_instrument_name):
    """Edit the instrument name for all records in FASTQ file

    Loop over all records in a supplied FASTQ file, update the sequence
    identifier (i.e. first line in each record) by changing the
    instrument name, and write the updated records to stdout.

    Arguments:
      fastq_file: FASTQ file to read the records from
      new_instrument_name: instrument name to set on each record; if
        this is empty or None then records are echoed without the
        instrument name being modified

    """
    # Loop over all reads in the FASTQ
    # Update the instrument name in the sequence identifier and echo to stdout
    for read in FASTQFile.FastqIterator(fastq_file):
        if new_instrument_name:
            # Modify the instrument name
            read.seqid.instrument_name = new_instrument_name
        # Echo updated read to stdout
        print(read)
예제 #7
0
    def load(self, fastq=None, fp=None):
        """Tally the index sequences found in FASTQ data

        The input is handed straight to FASTQFile.FastqIterator, and
        can be given either as a file name (via the 'fastq' argument)
        or as a file-like object opened for line reading (via the
        'fp' argument).

        For every read the count stored for its index sequence in
        the internal counts table is increased by one; index
        sequences not seen before start with a count of one.

        Arguments:
           fastq: name of the FASTQ file to iterate through
           fp: file-like object opened for reading

        """
        reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
        for record in reads:
            index_seq = record.seqid.index_sequence
            if index_seq in self._counts:
                # Already seen
                self._counts[index_seq] += 1
            else:
                # First occurrence
                self._counts[index_seq] = 1
예제 #8
0
    def fastqiterator(fastq=None, fp=None):
        """
        Count the reads in a FASTQ file

        The reads are enumerated using the FASTQFile.FastqIterator
        class, and one is added to the total for each read that it
        yields.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Number of reads (zero if no reads are yielded)

        """
        reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
        return sum(1 for _ in reads)
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """
    Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file. Reads which the matcher doesn't assign to a
    barcode are written to the 'undetermined' file.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if a read has no index sequence.

    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output file per known barcode, plus one for reads which
    # can't be assigned
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads
    nread = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fastq, nread))
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            # Pass the values as logging arguments so the message is
            # only formatted when debug output is actually enabled
            logging.debug("Assigned read #%d to %s", nread, assigned_index)
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
예제 #10
0
    def reads_per_lane(fastq=None, fp=None):
        """
        Count the reads belonging to each lane of a FASTQ file

        The reads are enumerated using the FASTQFile.FastqIterator
        class, and each one is tallied against the flowcell lane
        taken from its sequence identifier.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Dictionary where keys are lane numbers (as integers)
            and values are number of reads in that lane.

        """
        lane_counts = {}
        for record in FASTQFile.FastqIterator(fastq_file=fastq, fp=fp):
            # Lane is converted to an integer before use as a key
            lane_number = int(record.seqid.flowcell_lane)
            lane_counts[lane_number] = lane_counts.get(lane_number, 0) + 1
        return lane_counts
예제 #11
0
# Main program
#
# Takes a single Fastq file name from the command line and makes two
# passes over the file: the first pass collects the read headers that
# occur more than once, the second pass is set up to write output based
# on the names built below.
# NOTE(review): the listing stops partway through the second loop, so
# how the three output files are populated and closed is not visible
# here - confirm against the complete script
if __name__ == "__main__":
    # Collect input fastq file name
    if len(sys.argv) < 2:
        print("Usage: %s fastq" % os.path.basename(sys.argv[0]))
        sys.exit()
    fastq = sys.argv[1]
    # Output file names (all derived from the input file name)
    fastq_out = fastq + ".paired"
    singles_header = fastq + ".single.header"
    pairs_header = fastq + ".pair.header"
    # Loop over file and collect read names
    # 'headers' holds every sequence identifier seen so far; 'pairs'
    # holds those identifiers which have been seen more than once
    headers = set()
    pairs = set()
    # Counter used for progress reporting; it starts at 1 and is
    # incremented after each read, so the first report is printed
    # once 999999 reads have been handled
    n = 1
    for read in FASTQFile.FastqIterator(fastq):
        seqid = str(read.seqid)
        if seqid in headers:
            # Part of a pair
            pairs.add(seqid)
        else:
            headers.add(seqid)
        n += 1
        if not (n % 1000000): print("%s" % n)
    # Loop again outputting only paired reads
    fp = io.open(fastq_out, 'wt')
    fp_singles = io.open(singles_header, 'wt')
    fp_pairs = io.open(pairs_header, 'wt')
    n = 1
    for read in FASTQFile.FastqIterator(fastq):
        seqid = str(read.seqid)
def demultiplex_fastq(fastq_file,barcodes,nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode associated with the lane of the
    input FASTQ, plus another for 'unbinned' reads. The output file
    names carry no directory component. If no barcode is associated
    with the lane then nothing is written.

    If any of the output files already exists then a message is
    printed and the program exits via 'sys.exit(1)'.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode definitions to use for demultiplexing;
        each entry is looked up by the keys 'lane', 'name', 'index'
        and 'matcher', where 'matcher' provides a
        'match(sequence, nmismatches)' method
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Set up output files
    output_files = {}
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'],output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name,'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name,'w')
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            # Assign the read to the first barcode that matches
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode,nmismatches):
                    output_files[barcode['index']].write(str(read)+'\n')
                    matched_read = True
                    break
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read)+'\n')
    finally:
        # Close every file that was opened (including 'unbinned'),
        # whether processing completed, exited early or failed
        for output_file in output_files.values():
            output_file.close()
    # NOTE(review): nreads counts every read processed, including
    # those written to the 'unbinned' file
    print("\tMatched %d reads for %s" % (nreads,os.path.basename(fastq_file)))