class ICell8FastqIterator(Iterator):
    """
    Class for iterating over an iCell8 R1/R2 FASTQ-pair

    The iterator yields ICell8ReadPair instances, for example:

    >>> for pair in ICell8FastqIterator(fq1,fq2):
    >>>     print "-- R1: %s" % pair.r1
    >>>     print "   R2: %s" % pair.r2
    """
    def __init__(self,fqr1,fqr2):
        """
        Create a new ICell8FastqIterator instance

        Arguments:
          fqr1 (str): path to the R1 FASTQ file
          fqr2 (str): path to the R2 FASTQ file
        """
        self._read_count = 0
        self._fqr1 = FastqIterator(fqr1)
        self._fqr2 = FastqIterator(fqr2)
    def next(self):
        self._read_count += 1
        r1 = self._fqr1.next()
        r2 = self._fqr2.next()
        try:
            return ICell8ReadPair(r1,r2)
        except Exception as ex:
            print "Failed to create read pair:"
            print "-- Read pair number: %d" % self._read_count
            print "-- Read 1:\n%s" % r1
            print "-- Read 2:\n%s" % r2
            logging.critical("Failed to create read pair: %s" % ex)
            raise ex
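# Illustrative usage sketch for ICell8FastqIterator (hypothetical, not
# part of the class above): iterate over an R1/R2 pair and count the
# read pairs. The FASTQ paths are placeholders.
def count_icell8_pairs(fq1, fq2):
    """Count read pairs in an iCell8 R1/R2 FASTQ pair."""
    npairs = 0
    for pair in ICell8FastqIterator(fq1, fq2):
        npairs += 1
    return npairs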
def from_fastq(self, fastq):
    """
    Get statistics from a FASTQ file

    Generates and stores statistics from a FASTQ file.

    Arguments:
      fastq (str): path to a FASTQ file (can be gzipped)
    """
    # Initialise one counter dict per base position, using the
    # first read to get the sequence length
    quality_per_base = []
    for read in FastqIterator(fastq):
        for i in xrange(read.seqlen):
            quality_per_base.append({})
            for j in xrange(ord('!'), ord('I') + 1):
                quality_per_base[i][chr(j)] = 0
        break
    # Iterate through the FASTQ file and count quality scores
    nreads = 0
    for read in FastqIterator(fastq):
        nreads += 1
        for pos, q in enumerate(read.quality):
            quality_per_base[pos][q] += 1
    # Median etc positions
    # FIXME these are not correct if the list has an odd number of values!
    median_pos = nreads / 2
    q25_pos = median_pos / 2
    q75_pos = median_pos + q25_pos
    # For percentiles see http://stackoverflow.com/a/2753343/579925
    # FIXME 10th/90th percentiles is a fudge here
    p10_pos = nreads / 10
    p90_pos = nreads - p10_pos
    # For each base position determine stats
    for pos, counts in enumerate(quality_per_base):
        # Expand the counts to a sorted string of quality scores
        scores = ''
        for q in counts:
            scores += q * counts[q]
        scores = ''.join(sorted(scores))
        # Get the mean (scores are Phred+33 encoded)
        self.mean.append(
            float(sum([(ord(q) - 33) for q in scores])) / nreads)
        # Get the median, quartiles and percentiles
        self.median.append(ord(scores[median_pos]) - 33)
        self.q25.append(ord(scores[q25_pos]) - 33)
        self.q75.append(ord(scores[q75_pos]) - 33)
        self.p10.append(ord(scores[p10_pos]) - 33)
        self.p90.append(ord(scores[p90_pos]) - 33)
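# Standalone sketch of the counts-to-percentile step performed per
# base position in from_fastq above; percentile_score is a
# hypothetical helper (not in the module) and inherits the same
# position-arithmetic caveats flagged by the FIXMEs.
def percentile_score(counts, nreads, frac):
    """Return the Phred score at fraction 'frac' through the reads.

    'counts' maps quality characters to occurrences at one position.
    """
    # Expand counts to a string of quality characters in sorted order
    scores = ''.join(sorted([q * counts[q] for q in counts]))
    # Scores are Phred+33 encoded
    return ord(scores[int(nreads * frac)]) - 33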
def collect_fastq_stats(fastq):
    """
    Get barcode and distinct UMI counts for Fastq file

    Used by Icell8Stats to collect counts for each file supplied.

    Arguments:
      fastq (str): path to Fastq file

    Returns:
      Tuple: tuple consisting of (fastq,counts,umis) where 'fastq'
        is the path to the input Fastq file, 'counts' is a
        dictionary with barcodes as keys and read counts as values,
        and 'umis' is a dictionary with barcodes as keys and sets
        of UMIs as values.
    """
    counts = {}
    umis = {}
    for r in FastqIterator(fastq):
        r = ICell8Read1(r)
        barcode = r.barcode
        try:
            counts[barcode] += 1
        except KeyError:
            counts[barcode] = 1
        umi = r.umi
        try:
            umis[barcode].add(umi)
        except KeyError:
            umis[barcode] = set((umi,))
    return (fastq, counts, umis)
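# Hypothetical aggregation sketch (names here are illustrative only):
# combine the per-file outputs of collect_fastq_stats above into
# overall per-barcode read counts and distinct UMI sets.
def combine_fastq_stats(fastqs):
    all_counts = {}
    all_umis = {}
    for fq in fastqs:
        fastq, counts, umis = collect_fastq_stats(fq)
        for barcode in counts:
            all_counts[barcode] = all_counts.get(barcode, 0) \
                                  + counts[barcode]
            all_umis.setdefault(barcode, set()).update(umis[barcode])
    return (all_counts, all_umis)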
def pair_fastqs(fastqs):
    """
    Automagically pair up FASTQ files

    Given a list of FASTQ files, generate a list of R1/R2 pairs
    by examining the header for the first read in each file.

    Arguments:
      fastqs (list): list of paths to FASTQ files which will be
        paired.

    Returns:
      Tuple: pair of lists of the form (paired,unpaired), where
        `paired` is a list of tuples consisting of FASTQ R1/R2
        pairs and `unpaired` is a list of FASTQs which couldn't
        be paired.
    """
    fq_pairs = []
    seq_ids = {}
    bad_files = []
    for fq in [os.path.abspath(fq) for fq in fastqs]:
        # Get header from first read
        seq_id = None
        with get_fastq_file_handle(fq) as fp:
            for r in FastqIterator(fp=fp):
                seq_id = r.seqid
                break
        if seq_id is None:
            logging.debug("'Bad' file: %s" % fq)
            bad_files.append(fq)
            continue
        fq_pair = None
        for fq1 in seq_ids:
            if seq_id.is_pair_of(seq_ids[fq1]):
                # Found a pair
                if seq_id.pair_id == '1':
                    fq_pair = (fq, fq1)
                else:
                    fq_pair = (fq1, fq)
                fq_pairs.append(fq_pair)
                logging.debug("*** Paired: %s\n"
                              "          : %s" % fq_pair)
                # Remove paired fastq
                del(seq_ids[fq1])
                break
        if fq_pair is None:
            # Unable to pair, store for now
            logging.debug("Unpaired: %s" % fq)
            seq_ids[fq] = seq_id
    # Sort pairs into order
    fq_pairs = sorted(fq_pairs, key=lambda x: x[0])
    unpaired = sorted(list(seq_ids.keys()) + bad_files)
    # Return paired and unpaired fastqs
    return (fq_pairs, unpaired)
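# Illustrative call to pair_fastqs above; the file names are
# placeholders.
paired, unpaired = pair_fastqs(["sample1_R1.fastq.gz",
                                "sample1_R2.fastq.gz",
                                "orphan.fastq.gz"])
for fq_r1, fq_r2 in paired:
    print("Paired: %s with %s" % (fq_r1, fq_r2))
for fq in unpaired:
    print("Unpaired: %s" % fq)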
def collect_fastq_stats(self,fastq):
    """
    Get barcode and distinct UMI counts for Fastq file

    This method can be called directly, but is also
    invoked implicitly if its parent instance is called.

    Arguments:
      fastq (str): path to Fastq file

    Returns:
      Tuple: tuple consisting of (fastq,counts,umis) where
        'fastq' is the path to the input Fastq file, 'counts'
        is a dictionary with barcodes as keys and read counts
        as values, and 'umis' is a dictionary with barcodes
        as keys and sets of UMIs as values.
    """
    print("collect_fastq_stats: started: %s" % fastq)
    try:
        n = FastqReadCounter.zcat_wc(fastq)
        print("%s: processing %d read%s" % (os.path.basename(fastq),
                                            n,
                                            ('s' if n != 1 else '')))
        counts = {}
        umis = {}
        progress = ProgressChecker(percent=5,total=n)
        for i,r in enumerate(FastqIterator(fastq),start=1):
            r = ICell8Read1(r)
            barcode = r.barcode
            try:
                counts[barcode] += 1
            except KeyError:
                counts[barcode] = 1
            umi = r.umi
            try:
                umis[barcode].add(umi)
            except KeyError:
                umis[barcode] = set((umi,))
            if self._verbose:
                if progress.check(i):
                    print("%s: %s: processed %d reads (%.1f%%)" %
                          (time.strftime("%Y%m%d.%H%M%S"),
                           os.path.basename(fastq),
                           i,progress.percent(i)))
    except Exception as ex:
        print("collect_fastq_stats: caught exception: '%s'" % ex)
        raise Exception("collect_fastq_stats: %s: caught exception "
                        "'%s'" % (fastq,ex))
    print("collect_fastq_stats: returning: %s" % fastq)
    return (fastq,counts,umis)
def get_read_number(fastq):
    """
    Get the read number (1 or 2) from a Fastq file

    Arguments:
      fastq (str): path to a Fastq file

    Returns:
      Integer: read number (1 or 2) extracted from the first read.
    """
    for r in FastqIterator(fastq):
        seq_id = r.seqid
        break
    return int(seq_id.pair_id)
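# Example usage of get_read_number (the file name is a placeholder):
# distinguish R1 from R2 FASTQs without relying on their names.
read_number = get_read_number("mystery.fastq.gz")
print("This is an R%d FASTQ" % read_number)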
def count_barcodes(fastqs):
    """
    Count the barcodes from multiple fastqs
    """
    print "Reading in %s fastq%s" % (len(fastqs),
                                     ('' if len(fastqs) == 1
                                      else 's'))
    counts = BarcodeCounter()
    for fq in fastqs:
        print "%s" % os.path.basename(fq)
        for r in FastqIterator(fq):
            seq = r.seqid.index_sequence
            lane = int(r.seqid.flowcell_lane)
            counts.count_barcode(seq, lane)
    return counts
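# Illustrative invocation of count_barcodes above (the file names are
# placeholders); the returned BarcodeCounter instance holds the
# barcode counts split by lane.
counts = count_barcodes(["Sample1_S1_L001_R1_001.fastq.gz",
                         "Sample1_S1_L002_R1_001.fastq.gz"])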
def fastqiterator(fastq=None, fp=None):
    """
    Return number of reads in a FASTQ file

    Uses the FASTQFile.FastqIterator class to do the counting.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Number of reads.
    """
    nreads = 0
    for r in FastqIterator(fastq_file=fastq, fp=fp):
        nreads += 1
    return nreads
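# Illustrative usage of the fastqiterator counter above; either a
# path or an open file handle can be supplied (the path here is a
# placeholder).
n = fastqiterator(fastq="example.fastq.gz")
print("%d reads" % n)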
def assign_barcodes_single_end(fastq_in, fastq_out, n=5):
    """
    Extract inline barcodes and assign to Fastq read headers

    Strips the first n bases from each read of the input
    FASTQ file and assigns them to the index sequence for
    that read in the output file.

    If the supplied output file name ends with '.gz' then
    it will be gzipped.

    Arguments:
      fastq_in (str): input FASTQ file (can be gzipped)
      fastq_out (str): output FASTQ file (will be gzipped if
        ending with '.gz')
      n (integer): number of bases to extract and assign as
        index sequence (default: 5)

    Returns:
      Integer: number of reads processed.
    """
    if fastq_out.endswith('.gz'):
        fp = gzip.GzipFile(filename=fastq_out, mode='wb')
    else:
        fp = open(fastq_out, 'w')
    print("Processing reads from %s" % fastq_in)
    nread = 0
    for read in FastqIterator(fastq_in):
        # Extract new barcode sequence
        barcode = read.sequence[:n]
        # Truncate sequence and quality accordingly
        sequence = read.sequence[n:]
        quality = read.quality[n:]
        # Assign new values and write to output
        read.seqid.index_sequence = barcode
        read.sequence = sequence
        read.quality = quality
        fp.write("%s\n" % read)
        nread += 1
    fp.close()
    print("Finished (%d reads processed)" % nread)
    return nread
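# Illustrative invocation of assign_barcodes_single_end above: move
# the first 6 bases of each read into the index sequence (the paths
# are placeholders).
nreads = assign_barcodes_single_end("in.fastq.gz",
                                    "out.barcoded.fastq.gz",
                                    n=6)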
def reads_per_lane(fastq=None, fp=None):
    """
    Return counts of reads in each lane of FASTQ file

    Uses the FASTQFile.FastqIterator class to do the counting,
    with counts split by lane.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Dictionary where keys are lane numbers (as integers) and
        values are number of reads in that lane.
    """
    nreads = {}
    for r in FastqIterator(fastq_file=fastq, fp=fp):
        lane = int(r.seqid.flowcell_lane)
        try:
            nreads[lane] += 1
        except KeyError:
            nreads[lane] = 1
    return nreads
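# Illustrative usage of reads_per_lane above (the path is a
# placeholder): report the per-lane read counts in lane order.
for lane, n in sorted(reads_per_lane(fastq="example.fastq.gz").items()):
    print("Lane %d: %d reads" % (lane, n))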
ndatabases_mammalian = len(nohits_mammalian)
ndatabases_contaminants = len(nohits_contaminants)
print "'nohits' tags: '%s' and '%s'" % (nohits_mammalian,
                                        nohits_contaminants)
# Output filtered FASTQ pair
fqr1_out = os.path.basename(strip_ext(fqr1,'.fastq')) \
           + '.filtered.fastq'
fqr2_out = os.path.basename(strip_ext(fqr2,'.fastq')) \
           + '.filtered.fastq'
output_fqs = OutputFiles(base_dir=out_dir)
output_fqs.open('fqr1', fqr1_out)
output_fqs.open('fqr2', fqr2_out)
# Filter the iCell8 read pairs against the tagged reads
for pair, pref, contam in izip(ICell8FastqIterator(fqr1, fqr2),
                               FastqIterator(mammalian_tagged_fq),
                               FastqIterator(contaminants_tagged_fq)):
    # Get the tags
    pref_tag = extract_fastq_screen_tag(pref)
    contam_tag = extract_fastq_screen_tag(contam)
    # Check the number of databases is consistent
    if len(pref_tag) != ndatabases_mammalian:
        logging.critical("Mismatch in mammalian tag: "
                         "len('%s') != len('%s')" % (pref_tag,
                                                     nohits_mammalian))
        sys.exit(1)
    if len(contam_tag) != ndatabases_contaminants:
        logging.critical("Mismatch in contaminant tag: "
                         "len('%s') != len('%s')" % (contam_tag,
                                                     nohits_contaminants))
        sys.exit(1)
def get_sequence_lengths(fastq, outfile=None, show_progress=False,
                         limit=None):
    """
    Get sequence lengths and masking statistics for Fastq

    Returns a dictionary with the following keys:

    - fastq: the Fastq file that metrics were calculated from
    - nreads: total number of reads processed
    - nreads_masked: number of reads that are completely masked
      (i.e. consist only of 'N's)
    - nreads_padded: number of partially masked reads (i.e.
      contain trailing 'N's)
    - frac_reads_masked: percentage of the processed reads which
      are masked
    - frac_reads_padded: percentage of the processed reads which
      are padded
    - min_length: minimum read length
    - max_length: maximum read length
    - mean_length: mean read length
    - median_length: median read length
    - seq_lengths_dist: distribution of lengths for all reads
    - seq_lengths_masked_dist: distribution of lengths for masked
      reads
    - seq_lengths_padded_dist: distribution of lengths for padded
      reads

    The distributions are each themselves dictionaries where the
    keys are read lengths and the values are the number of reads
    with the matching length; note that only lengths with a
    non-zero number of reads are included (zeroes are implied for
    all other lengths).

    Arguments:
      fastq (str): path to Fastq file
      outfile (str): optional, path to output JSON file
      show_progress (bool): if True then print a message to
        stdout every 100000 reads indicating progress (default:
        operate silently)
      limit (int): if set then only process this number of reads
        from the head of the Fastq and return stats based on
        these (default: process all reads in the file)

    Returns:
      Dictionary: containing the metrics for the Fastq.
    """
    nreads = 0
    nreads_masked = 0
    nreads_padded = 0
    sequence_length = dict()
    reads_all_n = dict()
    reads_padded = dict()
    if show_progress:
        print("\n%s" % fastq)
    for r in FastqIterator(fastq):
        nreads += 1
        if show_progress and nreads % 100000 == 0:
            print("...%d reads" % nreads)
        if limit and nreads == limit:
            logging.info("Stopping at limit: %d" % limit)
            break
        seqlen = r.seqlen
        try:
            sequence_length[seqlen] += 1
        except KeyError:
            sequence_length[seqlen] = 1
        seqlen_no_ns = len(r.sequence.strip('N'))
        if seqlen_no_ns == 0:
            # Completely masked read
            nreads_masked += 1
            try:
                reads_all_n[seqlen] += 1
            except KeyError:
                reads_all_n[seqlen] = 1
        elif seqlen_no_ns < seqlen:
            # Partially masked ('padded') read
            nreads_padded += 1
            try:
                reads_padded[seqlen] += 1
            except KeyError:
                reads_padded[seqlen] = 1
    # Get statistics
    seqlens = sorted(sequence_length.keys())
    min_len = min(seqlens)
    max_len = max(seqlens)
    mean_len = float(sum([l*sequence_length[l] for l in seqlens]))\
               /float(nreads)
    median_read = nreads // 2
    median_len = None
    read_count = 0
    for l in seqlens:
        read_count += sequence_length[l]
        if read_count >= median_read:
            median_len = l
            break
    frac_reads_masked = float(nreads_masked)/nreads*100.0
    frac_reads_padded = float(nreads_padded)/nreads*100.0
    # Build dictionary for output
    stats = dict(fastq=fastq,
                 nreads=nreads,
                 nreads_masked=nreads_masked,
                 nreads_padded=nreads_padded,
                 frac_reads_masked=frac_reads_masked,
                 frac_reads_padded=frac_reads_padded,
                 min_length=min_len,
                 max_length=max_len,
                 mean_length=mean_len,
                 median_length=median_len,
                 seq_lengths_dist=sequence_length,
                 seq_lengths_masked_dist=reads_all_n,
                 seq_lengths_padded_dist=reads_padded)
    # Output to file
    if outfile:
        with open(outfile, 'wt') as fp:
            json.dump(stats, fp, sort_keys=True, indent=4)
    # Return stats
    return stats
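# Illustrative call to get_sequence_lengths above, also writing the
# metrics out as JSON (the paths are placeholders).
stats = get_sequence_lengths("example.fastq.gz",
                             outfile="example_seqlens.json",
                             show_progress=True)
print("Mean length: %.1f" % stats['mean_length'])
print("%.1f%% masked" % stats['frac_reads_masked'])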