예제 #1
0
class ICell8FastqIterator(Iterator):
    """
    Class for iterating over an iCell8 R1/R2 FASTQ-pair

    The iterator returns a set of ICell8ReadPair
    instances, for example:

    >>> for pair in ICell8FastqIterator(fq1,fq2):
    >>>   print("-- R1: %s" % pair.r1)
    >>>   print("   R2: %s" % pair.r2)
    """
    def __init__(self,fqr1,fqr2):
        """
        Create a new ICell8FastqIterator instance

        Arguments:
          fqr1 (str): path to the R1 FASTQ file
          fqr2 (str): path to the R2 FASTQ
        """
        # Number of read pairs requested so far (used in diagnostics)
        self._read_count = 0
        self._fqr1 = FastqIterator(fqr1)
        self._fqr2 = FastqIterator(fqr2)
    def next(self):
        """
        Return the next R1/R2 read pair

        Fetches one read from each of the underlying R1 and R2
        iterators and wraps them in an ICell8ReadPair instance.
        Anything raised by the underlying iterators (including
        the signal that they are exhausted) propagates unchanged.

        Returns:
          ICell8ReadPair: the next pair of reads.

        Raises:
          Exception: any exception raised while building the
            ICell8ReadPair is re-raised after the offending
            reads have been reported.
        """
        self._read_count += 1
        r1 = self._fqr1.next()
        r2 = self._fqr2.next()
        try:
            return ICell8ReadPair(r1,r2)
        except Exception as ex:
            # Single-argument print(...) produces the same output
            # whether print is a statement or a function
            print("Failed to create read pair:")
            print("-- Read pair number: %d" % self._read_count)
            print("-- Read 1:\n%s" % r1)
            print("-- Read 2:\n%s" % r2)
            logging.critical("Failed to create read pair: %s" % ex)
            # Bare 'raise' keeps the original traceback intact
            raise
예제 #2
0
    def from_fastq(self, fastq):
        """
        Get statistics from a FASTQ file

        Generates and stores per-base quality statistics from a
        FASTQ file. For each base position one value is appended
        to each of the 'mean', 'median', 'q25', 'q75', 'p10' and
        'p90' lists of this instance.

        Quality characters are decoded as Phred+33. Any quality
        character is accepted, and reads may differ in length: the
        statistics for a position are calculated over the reads
        which actually cover that position.

        The median, quartile and percentile values are taken at
        simple index positions in the sorted scores (there is no
        interpolation between neighbouring values).

        Arguments:
          fastq (str): path to a FASTQ file (can be gzipped)

        """
        # Count how often each quality character occurs at each
        # base position; the table grows on demand so that longer
        # reads and unexpected quality characters are handled
        quality_per_base = []
        for read in FastqIterator(fastq):
            quality = read.quality
            while len(quality_per_base) < len(quality):
                quality_per_base.append({})
            for pos, q in enumerate(quality):
                counts = quality_per_base[pos]
                counts[q] = counts.get(q, 0) + 1

        # For each base position determine stats
        for counts in quality_per_base:
            # Expand the counts into a sorted string of scores
            scores = ''.join([q * counts[q] for q in sorted(counts)])
            # Number of reads covering this position (at least one,
            # otherwise the position wouldn't be in the table)
            nscores = len(scores)
            # Median etc positions (floor division keeps these as
            # integer indices)
            median_pos = nscores // 2
            q25_pos = median_pos // 2
            q75_pos = median_pos + q25_pos
            # For percentiles see http://stackoverflow.com/a/2753343/579925
            # FIXME 10th/90th percentiles is a fudge here
            p10_pos = nscores // 10
            # With fewer than 10 scores p10_pos is zero, so clamp to
            # the last valid index
            p90_pos = min(nscores - p10_pos, nscores - 1)
            # Get the mean (scores are Phred+33 encoded)
            self.mean.append(
                float(sum([(ord(q) - 33) for q in scores])) / nscores)
            # Get the median etc
            self.median.append(ord(scores[median_pos]) - 33)
            self.q25.append(ord(scores[q25_pos]) - 33)
            self.q75.append(ord(scores[q75_pos]) - 33)
            self.p10.append(ord(scores[p10_pos]) - 33)
            self.p90.append(ord(scores[p90_pos]) - 33)
    def __init__(self,fqr1,fqr2):
        """
        Set up iteration over an R1/R2 FASTQ pair

        A FastqIterator is created for each of the two files
        and the count of reads handled so far is reset to zero.

        Arguments:
          fqr1 (str): path to the R1 FASTQ file
          fqr2 (str): path to the R2 FASTQ file
        """
        # No reads have been handled yet
        self._read_count = 0
        # One reader per file, R1 first
        r1_reader = FastqIterator(fqr1)
        self._fqr1 = r1_reader
        r2_reader = FastqIterator(fqr2)
        self._fqr2 = r2_reader
예제 #4
0
def collect_fastq_stats(fastq):
    """
    Get barcode and distinct UMI counts for Fastq file

    Each read in the Fastq is wrapped in an ICell8Read1
    instance, from which the barcode and UMI are taken.

    Arguments:
      fastq (str): path to Fastq file

    Returns:
      Tuple: tuple consisting of (fastq,counts,umis)
        where 'fastq' is the path to the input Fastq
        file, 'counts' is a dictionary with barcodes
        as keys and read counts as values, and 'umis'
        is a dictionary with barcodes as keys and
        sets of UMIs as values.
    """
    reads_per_barcode = {}
    umis_per_barcode = {}
    for record in FastqIterator(fastq):
        read1 = ICell8Read1(record)
        # Tally the read against its barcode
        bc = read1.barcode
        reads_per_barcode[bc] = reads_per_barcode.get(bc, 0) + 1
        # Record the UMI (a set discards repeated values)
        umi = read1.umi
        umis_per_barcode.setdefault(bc, set()).add(umi)
    return (fastq, reads_per_barcode, umis_per_barcode)
def pair_fastqs(fastqs):
    """
    Automagically pair up FASTQ files

    The header of the first read in each file is compared
    against the headers of the files seen so far which are
    still waiting for a mate; when two headers are a pair
    the files are recorded together as an R1/R2 tuple.

    Arguments:
      fastqs (list): list of paths to FASTQ files which
        will be paired.

    Returns:
      Tuple: pair of lists of the form (paired,unpaired),
        where `paired` is a list of tuples consisting of
        FASTQ R1/R2 pairs and `unpaired` is a list of
        FASTQs which couldn't be paired. Both lists are
        sorted and hold absolute paths.
    """
    matched = []
    unreadable = []
    # Files still waiting for a mate, keyed by path with the
    # sequence id of their first read as the value
    waiting = {}
    abs_paths = [os.path.abspath(path) for path in fastqs]
    for this_fq in abs_paths:
        # Sequence id from the first read (stays None if the
        # file yields no reads)
        this_id = None
        with get_fastq_file_handle(this_fq) as handle:
            for record in FastqIterator(fp=handle):
                this_id = record.seqid
                break
        if this_id is None:
            logging.debug("'Bad' file: %s" % this_fq)
            unreadable.append(this_fq)
            continue
        for other_fq in waiting:
            if not this_id.is_pair_of(waiting[other_fq]):
                continue
            # Found the mate: always store as (R1,R2)
            if this_id.pair_id == '1':
                match = (this_fq, other_fq)
            else:
                match = (other_fq, this_fq)
            matched.append(match)
            logging.debug("*** Paired: %s\n" "          : %s" % match)
            # The mate is no longer waiting
            del waiting[other_fq]
            break
        else:
            # No mate found (yet), keep for later files
            logging.debug("Unpaired: %s" % this_fq)
            waiting[this_fq] = this_id
    # Order the pairs by their R1 file
    matched.sort(key=lambda pair: pair[0])
    # Anything still waiting plus the unreadable files
    leftover = sorted(list(waiting) + unreadable)
    return (matched, leftover)
    def collect_fastq_stats(self,fastq):
        """
        Get barcode and distinct UMI counts for Fastq file

        This method can be called directly, but is
        also invoked implicitly if its parent instance
        is called.

        Start, progress and completion messages are printed
        to stdout; the per-read progress messages are only
        printed when the instance's '_verbose' attribute is
        set.

        Arguments:
          fastq (str): path to Fastq file

        Returns:
          Tuple: tuple consisting of (fastq,counts,umis)
            where 'fastq' is the path to the input Fastq
            file, 'counts' is a dictionary with barcodes
            as keys and read counts as values, and 'umis'
            is a dictionary with barcodes as keys and
            sets of UMIs as values.

        Raises:
          Exception: if anything fails while collecting the
            statistics; the message names the Fastq file and
            the original error.
        """
        print("collect_fastq_stats: started: %s" % fastq)
        try:
            # Total number of reads, used to report progress
            n = FastqReadCounter.zcat_wc(fastq)
            print("%s: processing %d read%s" % (
                os.path.basename(fastq),
                n,('s' if n != 1 else '')))
            counts = {}
            umis = {}
            progress = ProgressChecker(percent=5,total=n)
            for i,r in enumerate(FastqIterator(fastq),start=1):
                r = ICell8Read1(r)
                barcode = r.barcode
                try:
                    counts[barcode] += 1
                except KeyError:
                    counts[barcode] = 1
                umi = r.umi
                try:
                    umis[barcode].add(umi)
                except KeyError:
                    umis[barcode] = set((umi,))
                if self._verbose:
                    if progress.check(i):
                        print("%s: %s: processed %d reads (%.1f%%)" %
                              (time.strftime("%Y%m%d.%H%M%S"),
                               os.path.basename(fastq),
                               i,progress.percent(i)))
        except Exception as ex:
            print("collect_fastq_stats: caught exception: '%s'" % ex)
            # Interpolate the values into the message (passing them
            # as a second argument would leave the '%s' placeholders
            # unformatted)
            raise Exception("collect_fastq_stats: %s: caught exception "
                            "'%s'" % (fastq,ex))
        print("collect_fastq_stats: returning: %s" % fastq)
        return (fastq,counts,umis)
예제 #7
0
def get_read_number(fastq):
    """
    Get the read number (1 or 2) from a Fastq file

    Only the first read in the file is examined.

    Arguments:
      fastq (str): path to a Fastq file

    Returns:
      Integer: read number (1 or 2) extracted from the first read.

    Raises:
      ValueError: if the Fastq file contains no reads.
    """
    for r in FastqIterator(fastq):
        # The first read is all that's needed
        return int(r.seqid.pair_id)
    # Only reached when the iterator yielded nothing
    raise ValueError("%s: no reads found, unable to get read number"
                     % fastq)
예제 #8
0
def count_barcodes(fastqs):
    """
    Count the barcodes from multiple fastqs

    For every read in every Fastq the index sequence and
    flowcell lane are taken from the read header and passed
    to a BarcodeCounter instance. The name of each Fastq is
    printed to stdout as it is processed.

    Arguments:
      fastqs (list): list of paths to Fastq files

    Returns:
      BarcodeCounter: counter holding the barcode counts
        from all the Fastqs.

    """
    # Single-argument print(...) produces the same output whether
    # print is a statement or a function
    print("Reading in %s fastq%s" % (len(fastqs),
                                     ('' if len(fastqs) == 1 else 's')))
    counts = BarcodeCounter()
    for fq in fastqs:
        print("%s" % os.path.basename(fq))
        for r in FastqIterator(fq):
            seq = r.seqid.index_sequence
            lane = int(r.seqid.flowcell_lane)
            counts.count_barcode(seq, lane)
    return counts
예제 #9
0
    def fastqiterator(fastq=None, fp=None):
        """
        Count the reads in a FASTQ file

        The reads are counted by stepping through the file
        with the FastqIterator class.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Number of reads

        """
        # Each record yielded by the iterator adds one to the total
        return sum(1 for _ in FastqIterator(fastq_file=fastq, fp=fp))
def assign_barcodes_single_end(fastq_in, fastq_out, n=5):
    """
    Extract inline barcodes and assign to Fastq read headers

    Strips the first n bases from each read of the input
    FASTQ file and assigns it to the index sequence for that
    read in the output file.

    If the supplied output file name ends with '.gz' then it
    will be gzipped.

    The output file is always closed, even if reading the
    input or writing the output fails part way through.

    Arguments:
      fastq_in (str): input FASTQ file (can be gzipped)
      fastq_out (str): output FASTQ file (will be gzipped if
        ending with '.gz')
      n (integer): number of bases to extract and assign as
        index sequence (default: 5)

    Returns:
      Integer: number of reads processed.

    """
    # NOTE(review): the gzipped output is opened in binary mode but
    # is written with text strings; confirm this is what's wanted
    # for the Python version in use
    if fastq_out.endswith('.gz'):
        fp = gzip.GzipFile(filename=fastq_out, mode='wb')
    else:
        fp = open(fastq_out, 'w')
    print("Processing reads from %s" % fastq_in)
    nread = 0
    try:
        for read in FastqIterator(fastq_in):
            # Extract new barcode sequence
            barcode = read.sequence[:n]
            # Truncate sequence and quality accordingly
            sequence = read.sequence[n:]
            quality = read.quality[n:]
            # Assign new values and write to output
            read.seqid.index_sequence = barcode
            read.sequence = sequence
            read.quality = quality
            fp.write("%s\n" % read)
            nread += 1
    finally:
        # Don't leak the output handle if something goes wrong
        fp.close()
    print("Finished (%d reads processed)" % nread)
    return nread
예제 #11
0
    def reads_per_lane(fastq=None, fp=None):
        """
        Count the reads in each lane of a FASTQ file

        The file is stepped through with the FastqIterator
        class and each read is tallied against the flowcell
        lane given in its header.

        Arguments:
          fastq: fastq(.gz) file
          fp: open file descriptor for fastq file

        Returns:
          Dictionary where keys are lane numbers (as integers)
            and values are number of reads in that lane.

        """
        lane_totals = {}
        for record in FastqIterator(fastq_file=fastq, fp=fp):
            lane_no = int(record.seqid.flowcell_lane)
            # Lanes not seen before start from zero
            lane_totals[lane_no] = lane_totals.get(lane_no, 0) + 1
        return lane_totals
예제 #12
0
    ndatabases_mammalian = len(nohits_mammalian)
    ndatabases_contaminants = len(nohits_contaminants)
    print "'nohits' tags: '%s' and '%s'" % (nohits_mammalian,
                                            nohits_contaminants)
    # Output filtered FASTQ pair
    fqr1_out = os.path.basename(strip_ext(fqr1,'.fastq')) \
               + '.filtered.fastq'
    fqr2_out = os.path.basename(strip_ext(fqr2,'.fastq')) \
               + '.filtered.fastq'
    output_fqs = OutputFiles(base_dir=out_dir)
    output_fqs.open('fqr1', fqr1_out)
    output_fqs.open('fqr2', fqr2_out)

    # Filter the iCell8 read pairs against the tagged reads
    for pair, pref, contam in izip(ICell8FastqIterator(fqr1, fqr2),
                                   FastqIterator(mammalian_tagged_fq),
                                   FastqIterator(contaminants_tagged_fq)):
        # Get the tags
        pref_tag = extract_fastq_screen_tag(pref)
        contam_tag = extract_fastq_screen_tag(contam)
        # Check number of databases are consistent
        if len(pref_tag) != ndatabases_mammalian:
            logging.critical("Mismatch in mammalian tag: "
                             "len('%s') != len('%s')" %
                             (pref_tag, nohits_mammalian))
            sys.exit(1)
        if len(contam_tag) != ndatabases_contaminants:
            logging.critical("Mismatch in contaminant tag: "
                             "len('%s') != len('%s')" %
                             (contam_tag, nohits_contaminants))
            sys.exit(1)
def get_sequence_lengths(fastq, outfile=None, show_progress=False, limit=None):
    """
    Get sequence lengths and masking statistics for Fastq

    Returns a dictionary with the following keys:

    - fastq: the Fastq file that metrics were calculated
      from
    - nreads: total number of reads processed
    - nreads_masked: number of reads that are completely
      masked (i.e. consist only of 'N's)
    - nreads_padded: number of partially masked reads
      (i.e. reads with 'N's at either end)
    - frac_reads_masked: the processed reads which are
      masked, expressed as a percentage
    - frac_reads_padded: the processed reads which are
      padded, expressed as a percentage
    - min_length: minimum read length
    - max_length: maximum read length
    - mean_length: mean read length
    - median_length: median read length (the lower of the
      two middle values when the number of reads is even)
    - seq_lengths_dist: distribution of lengths for all
      reads
    - seq_lengths_masked_dist: distribution of lengths
      for masked reads
    - seq_lengths_padded_dist: distribution of lengths
      for padded reads

    The distributions are each themselves dictionaries
    where the keys are read lengths and the values are
    the number of reads with the matching length; note
    that only lengths with a non-zero number of reads
    are included (zeroes are implied for all other
    lengths).

    If no reads are processed then the minimum, maximum,
    mean and median lengths are all set to None and the
    two percentages are set to zero.

    Arguments:
      fastq (str): path to Fastq file
      outfile (str): optional, path to output JSON file
      show_progress (bool): if True then print message
        to stdout every 100000 reads indicating progress
        (default: operate silently)
      limit (int): if set then only process this number
        of reads from the head of the Fastq and return
        stats based on these (default: process all reads
        in the file)

    Returns:
      Dictionary: containing the metrics for the Fastq.
    """
    nreads = 0
    nreads_masked = 0
    nreads_padded = 0
    sequence_length = dict()
    reads_all_n = dict()
    reads_padded = dict()
    if show_progress:
        print("\n%s" % fastq)
    for r in FastqIterator(fastq):
        # Stop before counting a read which won't be processed, so
        # that 'nreads' always matches the number of reads in the
        # distributions
        if limit and nreads >= limit:
            logging.info("Stopping at limit: %d" % limit)
            break
        nreads += 1
        if show_progress and nreads % 100000 == 0:
            print("...%d reads" % nreads)
        seqlen = r.seqlen
        sequence_length[seqlen] = sequence_length.get(seqlen, 0) + 1
        # Length once any 'N's at the ends have been removed
        seqlen_no_ns = len(r.sequence.strip('N'))
        if seqlen_no_ns == 0:
            # Nothing but 'N's
            nreads_masked += 1
            reads_all_n[seqlen] = reads_all_n.get(seqlen, 0) + 1
        elif seqlen_no_ns < seqlen:
            # Some 'N's at the ends
            nreads_padded += 1
            reads_padded[seqlen] = reads_padded.get(seqlen, 0) + 1
    # Get statistics
    if nreads:
        seqlens = sorted(sequence_length)
        min_len = seqlens[0]
        max_len = seqlens[-1]
        mean_len = float(sum([l*sequence_length[l] for l in seqlens]))\
                   /float(nreads)
        # Rank (counting from 1) of the median read when the reads
        # are ordered by length: the middle read for odd counts, the
        # lower of the two middle reads for even counts
        median_read = (nreads + 1) // 2
        median_len = None
        read_count = 0
        for l in seqlens:
            read_count += sequence_length[l]
            if read_count >= median_read:
                median_len = l
                break
        frac_reads_masked = float(nreads_masked) / nreads * 100.0
        frac_reads_padded = float(nreads_padded) / nreads * 100.0
    else:
        # No reads: there are no lengths to summarise
        min_len = None
        max_len = None
        mean_len = None
        median_len = None
        frac_reads_masked = 0.0
        frac_reads_padded = 0.0
    # Build dictionary for output
    stats = dict(fastq=fastq,
                 nreads=nreads,
                 nreads_masked=nreads_masked,
                 nreads_padded=nreads_padded,
                 frac_reads_masked=frac_reads_masked,
                 frac_reads_padded=frac_reads_padded,
                 min_length=min_len,
                 max_length=max_len,
                 mean_length=mean_len,
                 median_length=median_len,
                 seq_lengths_dist=sequence_length,
                 seq_lengths_masked_dist=reads_all_n,
                 seq_lengths_padded_dist=reads_padded)
    # Output to file
    if outfile:
        with open(outfile, 'wt') as fp:
            json.dump(stats, fp, sort_keys=True, indent=4)
    # Return stats
    return stats