class FastqIO:
    """
    Reader that iterates over read pairs of two paired-end fastq files.

    Attributes
    ----------
    r1: file handle of R1 fastq file
    r2: file handle of R2 fastq file
    r1_phred: the cumulative phred score at each position in R1;
              list (initially of length 500); recorded while reading R1
    r2_phred: the cumulative phred score at each position in R2;
              list (initially of length 500); recorded while reading R2
    r1_length: the length distribution of R1; dict of read length and count;
               recorded while reading R1
    r2_length: the length distribution of R2; dict of read length and count;
               recorded while reading R2
    qc: true/false if recording read information of R1 and R2
    """

    def __init__(self, fastq_r1, fastq_r2, quality_check=False):
        """
        the __init__ method

        Arguments
        ----------
        fastq_r1: the path to the R1 fastq file
        fastq_r2: the path to the R2 fastq file
        quality_check: true/false if recording read information of R1 and R2
        """
        self.r1 = FileIO(fastq_r1, "r")
        self.r2 = FileIO(fastq_r2, "r")
        # 500 bp, long enough for most NGSs; grown on demand for longer reads
        self.r1_phred = [0] * 500
        self.r2_phred = [0] * 500
        self.r1_length = {}
        self.r2_length = {}
        self.qc = quality_check

    def load(self, fastq_r1, fastq_r2):
        """
        open new R1 and R2 fastq files

        The previously opened handles (if any) are closed first. The
        accumulated quality statistics are left untouched.

        Arguments
        ----------
        fastq_r1: the path to the R1 fastq file
        fastq_r2: the path to the R2 fastq file
        """
        if self.r1:
            self.r1.close()
        self.r1 = FileIO(fastq_r1, "r")
        if self.r2:
            self.r2.close()
        # was opened with mode "r2", which is not a valid file mode
        self.r2 = FileIO(fastq_r2, "r")

    def __iter__(self):
        return self

    def __quality__(self, phred, length):
        """
        compute the average base quality for each position in a list

        Arguments
        ----------
        phred: a list of cumulative phred scores
        length: a dict of read length and count

        Returns:
        ----------
        a list (same length as phred) of the average base quality for each
        position; positions covered by no read are reported as 0
        """
        n_pos = len(phred)
        # number of reads that contribute a base at each position:
        # a read of length L covers positions 0 .. L-1
        covered = [0] * n_pos
        for read_len, count in length.items():
            for j in range(min(read_len, n_pos)):
                covered[j] += count
        # guard per position so uncovered positions do not divide by zero
        return [
            phred[j] / float(covered[j]) if covered[j] else 0
            for j in range(n_pos)
        ]

    def quality(self):
        """
        compute the average base quality for each position in R1 and R2

        uses __quality__(...) on the read quality and length information
        recorded while iterating with quality_check enabled.

        Returns:
        ----------
        a tuple of
        1. a list of the average base quality for each position in R1
        2. a list of the average base quality for each position in R2
        """
        r1_quality = self.__quality__(self.r1_phred, self.r1_length)
        r2_quality = self.__quality__(self.r2_phred, self.r2_length)
        return (r1_quality, r2_quality)

    def _next_nonblank(self, handle, count):
        """
        read the next non-empty line from handle

        Returns the line (line ending stripped) and the updated line counter;
        raises StopIteration at end of file.
        """
        while True:
            line = handle.readline()
            if not line:
                raise StopIteration
            line = line.rstrip("\r\n")
            count += 1
            if line:
                return line, count

    def _accumulate(self, totals, scores):
        """
        add per-base scores to the cumulative list, growing it when a read
        is longer than the current list
        """
        for i, q in enumerate(scores):
            if i >= len(totals):
                totals.extend([0] * (i + 1 - len(totals)))
            totals[i] += q

    def next(self):
        """
        the next function for iterating each read pair in the fastq files provided

        Reads in R1 and R2 should be in pair and in the same order. Empty
        lines are skipped. Iteration stops as soon as either file is exhausted.

        Returns:
        ----------
        a tuple of
        1. R1: a tuple of read name, read sequence, read quality
        2. R2: a tuple of read name, read sequence, read quality
        when quality_check is enabled each tuple additionally carries the
        cumulative phred list of the respective read

        Raises:
        ----------
        ValueError: a name line does not start with "@", an info line does
                    not start with "+", or sequence and quality lengths differ
        StopIteration: no more reads
        """
        state = 0
        # line counters are relative to the current record (reset per call)
        n_r1 = 0
        n_r2 = 0
        while True:
            line_r1, n_r1 = self._next_nonblank(self.r1, n_r1)
            line_r2, n_r2 = self._next_nonblank(self.r2, n_r2)
            if state == 0:
                # read name line
                if line_r1[0] != "@":
                    raise ValueError("Error: R1 read name %s @ line %d" %
                                     (line_r1, n_r1))
                if line_r2[0] != "@":
                    raise ValueError("Error: R2 read name %s @ line %d" %
                                     (line_r2, n_r2))
                name_r1 = line_r1
                name_r2 = line_r2
                state = 1
            elif state == 1:
                # read line
                read_r1 = line_r1
                read_r2 = line_r2
                state = 2
            elif state == 2:
                # info line
                if line_r1[0] != "+":
                    raise ValueError("Error: R1 info %s @ line %d" %
                                     (line_r1, n_r1))
                if line_r2[0] != "+":
                    raise ValueError("Error: R2 info %s @ line %d" %
                                     (line_r2, n_r2))
                state = 3
            elif state == 3:
                # quality line
                state = 0
                if len(read_r1) != len(line_r1):
                    raise ValueError(
                        "Error: R1 read length (%d) and quality length (%d) do not match"
                        % (len(read_r1), len(line_r1)))
                if len(read_r2) != len(line_r2):
                    raise ValueError(
                        "Error: R2 read length (%d) and quality length (%d) do not match"
                        % (len(read_r2), len(line_r2)))
                quality_r1 = line_r1
                quality_r2 = line_r2
                if self.qc:
                    # obsolete
                    # compute cumulative quality in R1 and R2
                    # (phred(...) is defined elsewhere in this file; presumably
                    # it decodes a quality string into per-base scores)
                    self._accumulate(self.r1_phred, phred(quality_r1))
                    self._accumulate(self.r2_phred, phred(quality_r2))
                    # R1 and R2 length distribution; .get avoids the KeyError
                    # the first time a read length is seen
                    len_r1 = len(quality_r1)
                    len_r2 = len(quality_r2)
                    self.r1_length[len_r1] = self.r1_length.get(len_r1, 0) + 1
                    self.r2_length[len_r2] = self.r2_length.get(len_r2, 0) + 1
                    return ((name_r1, read_r1, quality_r1, self.r1_phred),
                            (name_r2, read_r2, quality_r2, self.r2_phred))
                return ((name_r1, read_r1, quality_r1),
                        (name_r2, read_r2, quality_r2))

    # Python 3 iterator protocol; next() is kept for Python 2 callers
    __next__ = next
def _build_amplicon_tracks(probe_file, alignment_summary, mtdna_len, mtdna_offset):
    """
    parse the probe file into per-position amplicon annotation lists

    Only probes whose chromosome column equals "chrM" are used. A probe with a
    negative start is split into two segments (the tail of the mtDNA and the
    segment starting at position 1).

    Returns
    ----------
    a tuple of six lists indexed by mtdna position
    (amp_info, amp_cov, amp_r1_pos, amp_r2_pos, amp_r1_probe, amp_r2_probe)
    """
    size = mtdna_len + mtdna_offset + 1
    amp_info = [[] for _ in range(size)]  # amplicons covering each position
    amp_cov = [0] * size  # amplicon sequencing depth at each position
    # mtdna_len acts as the "not covered" sentinel for the min(...) updates
    amp_r1_pos = [mtdna_len] * size  # relative position at read 1
    amp_r2_pos = [mtdna_len] * size  # relative position at read 2
    amp_r1_probe = [mtdna_len] * size  # relative position to the r1 probe
    amp_r2_probe = [mtdna_len] * size  # relative position to the r2 probe

    with open(probe_file, "r") as fh:
        for line in fh:
            line = line.rstrip("\r\n")
            if not line:
                continue
            name, chrom, start, end, s1, _s2, r1_probe, r2_probe, blen = line.split("\t")
            start = int(start)
            end = int(end)
            # length of the probes at the start (p1) and end (p2) of the amplicon
            if s1 == "+":
                p1 = len(r1_probe.strip())
                p2 = len(r2_probe.strip())
            else:
                p1 = len(r2_probe.strip())
                p2 = len(r1_probe.strip())
            # barcode length
            blen = int(blen)
            if chrom != "chrM":
                continue
            # number of amplicons in the QC+ bam file
            cov = alignment_summary.get(name, 0)
            if start < 0:
                # split the amplicon into halves in the D-loop region
                segments = [(mtdna_len + start, mtdna_len, p1, 0),
                            (1, end, 0, p2)]
            else:
                segments = [(start, end, p1, p2)]
            label = "%s(%s)" % (name, s1)
            for seg_start, seg_end, seg_p1, seg_p2 in segments:
                inner_first = seg_start + seg_p1
                inner_stop = seg_end - seg_p2 + 1
                for i in range(inner_first, inner_stop):
                    amp_info[i].append(label)
                    amp_cov[i] += cov
                if s1 == "+":
                    for i in range(seg_p1):
                        # position in R1 probe
                        amp_r1_probe[i + seg_start] = seg_p1 - i
                    for i in range(inner_first, inner_stop):
                        # position in R1
                        amp_r1_pos[i] = min(amp_r1_pos[i], i - seg_start)
                        # position in R2
                        amp_r2_pos[i] = min(amp_r2_pos[i], seg_end + 1 - i + blen)
                    for i in range(seg_p2):
                        # position in R2 probe
                        amp_r2_probe[seg_end - i] = seg_p2 - i
                else:
                    for i in range(seg_p1):
                        amp_r2_probe[i + seg_start] = seg_p1 - i
                    for i in range(inner_first, inner_stop):
                        amp_r2_pos[i] = min(amp_r2_pos[i], i - seg_start + blen)
                        amp_r1_pos[i] = min(amp_r1_pos[i], seg_end + 1 - i)
                    for i in range(seg_p2):
                        amp_r1_probe[seg_end - i] = seg_p2 - i
    return amp_info, amp_cov, amp_r1_pos, amp_r2_pos, amp_r1_probe, amp_r2_probe


def generate_pileup_file(sample_name, outpath, gzip_pileup, alignment_file,
                         alignment_summary, probe_file, mtdna_refseq, qual_min,
                         mtdna_offset):
    """
    a function wrapper to generate the pileup file from the alignment file using samtools
    mtdna positions are corrected to those of rCRS

    Arguments
    ----------
    sample_name: the name of the sample
    outpath: the path to store the output files
    gzip_pileup: compress the output file
    alignment_file: the processed alignment file returned from filter_alignment_file(...)
    alignment_summary: a dict of read summary returned from filter_alignment_file(...)
    probe_file: the path to the probe file
    mtdna_refseq: the mtdna reference sequence
    qual_min: the minimum quality score to output (used in samtools mpileup -Q )
    mtdna_offset: the position offset used to parse mtdna sites.

    Returns
    ----------
    None

    Outputs
    ----------
    ${output}/${sample_name}.mtdna.consensus.adj.pileup(.gz): the resulting pileup file
    ${output}/${sample_name}.coverage: the read coverage information for all mtdna sites (tsv file)
    """
    mtdna_len = 16569

    # parse the probe file for amplicon information at each mtdna position
    (amp_info, amp_cov, amp_r1_pos, amp_r2_pos,
     amp_r1_probe, amp_r2_probe) = _build_amplicon_tracks(
         probe_file, alignment_summary, mtdna_len, mtdna_offset)

    def annotation(p):
        # amplicon columns of the coverage table for position p
        return [amp_cov[p], amp_r1_pos[p], amp_r2_pos[p], amp_r1_probe[p],
                amp_r2_probe[p], "|".join(amp_info[p])]

    if alignment_file.endswith(".bam"):
        alignment_file = alignment_file[:-4]

    # pileup reads using samtools (the bam file is used as is, not re-sorted)
    # mapq >= 20 & baseq >= qual_min
    # NOTE(review): the command is assembled as a single string; whether
    # pipe_output runs it through a shell is not visible here -- confirm that
    # the paths passed in are trusted.
    pf = pipe_output("%s mpileup -q 20 -Q %d -B -d 500000 -f %s %s.bam" %
                     (samtools, qual_min, mtdna_refseq, alignment_file))

    out_coverage = None
    out_pileup = None
    try:
        # summarize site coverage
        out_coverage = open(os.path.join(outpath, sample_name + ".coverage"), "w")
        head = ["chr", "pos", "pos.adj", "ref", "depth",
                "Q0", "Q1", "Q2", "Q3", "Q4", "amps",
                "amp.r1.pos", "amp.r2.pos", "amp.r1.probe", "amp.r2.probe",
                "amp.info"]
        out_coverage.write("\t".join(head) + "\n")

        # trim and move the shifted reads to the correct rCRS positions
        out_name = os.path.join(outpath, sample_name + ".mtdna.consensus.adj.pileup")
        if gzip_pileup:
            out_name += ".gz"
        out_pileup = FileIO(out_name, "w", compresslevel=3)

        # temporarily store amplicons mapped to the end of the shifted mtDNA
        # (the last mtdna_offset bps), keyed by their rCRS position
        tmp_line = {}
        # iterate reads in the pileup file generated
        for line in pf.stdout:
            line = line.rstrip("\r\n")
            if not line:
                continue
            chrom, pos, ref, depth, r, q = line.split("\t")
            # group quals into <10, 10-20, 20-30, 30-40, >=40
            qual = [0, 0, 0, 0, 0]
            for score in phred(q):
                # floor division keeps the bin an int on Python 2 and 3
                qual[min(int(score) // 10, 4)] += 1
            pos = int(pos)
            depth = int(depth)
            pos_adj = pos - mtdna_offset
            if pos_adj > 0:
                pending = tmp_line.get(pos_adj)
                if pending:
                    chrom1, ref1, depth1, qual1, r1, q1 = pending
                    assert ref1 == ref, "the reference allele does not match at position %d" % pos_adj
                    # pileup reads if they aligned to the same positions
                    depth = int(depth) + int(depth1)
                    # concatenate reads and read qualities
                    r += r1
                    q += q1
                    # sum up quality stats
                    qual = [a + b for a, b in zip(qual, qual1)]
                    # delete temp records for the position
                    del tmp_line[pos_adj]
                # output coverage and quality stats
                out_coverage.write("\t".join(map(
                    str, [chrom, pos, pos_adj, ref, depth] + qual + annotation(pos_adj))) + "\n")
                # output reads
                out_pileup.write("\t".join(
                    [chrom, str(pos_adj), ref, str(depth), r, q]) + "\n")
            else:
                # temporarily store reads aligned to the last mtdna_offset bps
                tmp_line[mtdna_len + pos_adj] = [chrom, ref, depth, qual, r, q]

        # output reads aligned to the last mtdna_offset bps that were not
        # merged with a later pileup line
        for pos_adj in sorted(tmp_line):
            chrom, ref, depth, qual, r, q = tmp_line[pos_adj]
            out_coverage.write("\t".join(map(
                str, [chrom, pos_adj - mtdna_len, pos_adj, ref, depth] + qual + annotation(pos_adj))) + "\n")
            out_pileup.write("\t".join(
                [chrom, str(pos_adj), ref, str(depth), r, q]) + "\n")
    finally:
        # close file handles even when parsing fails part-way
        pf.stdout.close()
        if out_pileup is not None:
            out_pileup.close()
        if out_coverage is not None:
            out_coverage.close()
class MTScan:
    """
    This is a class for processing a mpileup file

    Attributes
    ----------
    fh: file handle
    fh_to_close: true/false, whether fh was opened by this object
    sample: sample index in the mpileup file to extract (list)
    name: names of the samples (list)

    QC filters to call heteroplasmies
    arguments used in MTSite.callAllele(...)
    base_quality: the minimum base quality
    min_reads_rate: the minimum rate of bases with BAQ >= base_quality
    min_depth: the minimum read depth
    min_depth_fwd: the minimum read depth on the forward strand
    min_depth_rev: the minimum read depth on the reverse strand
    min_minor_depth: the minimum read depth of the minor allele
    min_minor_depth_fwd: the minimum read depth of the minor allele on the forward strand
    min_minor_depth_rev: the minimum read depth of the minor allele on the reverse strand
    min_het_freq: the minimum fraction of the minor allele
    """

    def __init__(self, pileup_file, sample=0, name="s", base_quality=20,
                 min_reads_rate=0.5, min_depth=10, min_depth_fwd=1,
                 min_depth_rev=1, min_minor_depth=1, min_minor_depth_fwd=1,
                 min_minor_depth_rev=1, min_het_freq=0.01):
        """
        the __init__ method

        pileup_file may be a path (opened here and closed on deletion) or an
        already opened file handle (left open). A single name/sample is
        wrapped into a list; sample=None selects one index per name.

        Arguments
        ----------
        see class Attributes
        """
        # set first so __del__ is safe even if opening the file fails
        self.fh_to_close = False
        if isinstance(pileup_file, str):
            self.fh = FileIO(pileup_file, "r")
            self.fh_to_close = True
        else:
            self.fh = pileup_file
        if isinstance(name, str):
            name = [name]
        if sample is None:
            sample = list(range(len(name)))
        elif isinstance(sample, int):
            sample = [sample]
        assert len(sample) == len(
            name), "Sample and Name should be of same length."
        self.sample = sample
        self.name = name
        self.base_quality = base_quality
        self.min_reads_rate = min_reads_rate
        self.min_depth = min_depth
        self.min_depth_fwd = min_depth_fwd
        self.min_depth_rev = min_depth_rev
        self.min_minor_depth = min_minor_depth
        self.min_minor_depth_fwd = min_minor_depth_fwd
        self.min_minor_depth_rev = min_minor_depth_rev
        self.min_het_freq = min_het_freq

    def __del__(self):
        # close the file handle, but only when it was opened by this object;
        # getattr keeps this safe on partially initialized instances
        if getattr(self, "fh_to_close", False):
            self.fh.close()

    def _call_site(self, line):
        """
        build an MTSite from one readMpileup record and call its alleles

        Returns a tuple of (sample index, position, MTSite).
        """
        idx, name, chrom, pos, ref, depth, allele_count, ins_count, del_count, rq = line
        # new site information
        site = MTSite(chrom, pos, depth, ref, allele_count, ins_count,
                      del_count, rq)
        # determine variant alleles
        site.callAllele(self.min_depth, self.min_depth_fwd,
                        self.min_depth_rev, self.min_minor_depth,
                        self.min_minor_depth_fwd, self.min_minor_depth_rev,
                        self.min_het_freq)
        return idx, pos, site

    def allSites(self, end=16570):
        """
        call mtDNA variants at all sites

        Positions without any pileup record are reported with a list of None.

        Arguments
        ----------
        end: the end position (exclusive)

        Returns:
        ----------
        an iterator of tuples containing
        1. lists of MTSite for samples indicated
        2. int position
        3. is a variant (true/false)
        """
        n_samples = len(self.sample)
        cur = 1
        is_var = False
        var = [None] * n_samples
        # iterate all pileup lines
        for line in readMpileup(self.fh, self.sample, self.name,
                                self.base_quality, self.min_reads_rate):
            idx, pos, site = self._call_site(line)
            if pos > cur:
                # a new position starts: emit what was collected for cur
                yield var, cur, is_var
                # reset var
                var = [None] * n_samples
                is_var = False
                # output empty lines for the positions skipped in the pileup
                for p in range(cur + 1, pos):
                    yield var, p, is_var
                # move cur to pos
                cur = pos
            # determine variant status
            if site.is_heteroplasmy or site.is_substitution:
                is_var = True
            # temporarily store this site
            var[idx] = site
        yield var, cur, is_var
        var = [None] * n_samples
        is_var = False
        # output empty lines for the remaining sites
        for p in range(cur + 1, end):
            yield var, p, is_var

    def varSites(self):
        """
        call mtDNA variants at only variant sites
        same as allSites(...) but only return sites where variants are found

        Returns:
        ----------
        an iterator of tuples containing
        1. lists of MTSite for samples indicated
        2. int position
        """
        n_samples = len(self.sample)
        cur = -1  # -1 marks "no position seen yet"
        is_var = False
        var = [None] * n_samples
        for line in readMpileup(self.fh, self.sample, self.name,
                                self.base_quality, self.min_reads_rate):
            idx, pos, site = self._call_site(line)
            if cur == -1:
                cur = pos
            if pos != cur:
                if is_var:
                    # return when it is a variant
                    yield var, cur
                var = [None] * n_samples
                is_var = False
                cur = pos
            # determine variant status
            if site.is_heteroplasmy or site.is_substitution:
                is_var = True
            # temporarily store this site
            var[idx] = site
        if is_var:
            yield var, cur

    def reset(self):
        # set the current position in the file handle to the beginning of the file
        self.fh.seek(0)