예제 #1
0
class FastqIO:
    """
    Paired-end fastq reader that iterates over R1/R2 read pairs

    Attributes
    ----------
    r1: file handle of R1 fastq file
    r2: file handle of R2 fastq file
    r1_phred: the cumulative phred score at each position in R1; list that starts
        with INITIAL_POSITIONS entries and grows if a longer read is seen; recorded while reading R1
    r2_phred: same as r1_phred, for R2
    r1_length: the length distribution of R1; dict of read length and count; recorded while reading R1
    r2_length: the length distribution of R2; dict of read length and count; recorded while reading R2
    qc: true/false if recording read information of R1 and R2

    """

    #number of positions tracked initially (500 bp, long enough for most NGSs)
    INITIAL_POSITIONS = 500

    def __init__(self, fastq_r1, fastq_r2, quality_check=False):
        """
        open the R1 and R2 fastq files and reset the quality statistics

        Arguments
        ----------
        fastq_r1: the path to the R1 fastq file
        fastq_r2: the path to the R2 fastq file
        quality_check: true/false if recording read information of R1 and R2

        """
        self.r1 = FileIO(fastq_r1, "r")
        self.r2 = FileIO(fastq_r2, "r")
        self.r1_phred = [0] * self.INITIAL_POSITIONS
        self.r2_phred = [0] * self.INITIAL_POSITIONS
        self.r1_length = {}
        self.r2_length = {}
        self.qc = quality_check

    def load(self, fastq_r1, fastq_r2):
        """
        close the current handles (if any) and open new R1 and R2 fastq files

        the accumulated quality statistics are left untouched

        Arguments
        ----------
        fastq_r1: the path to the R1 fastq file
        fastq_r2: the path to the R2 fastq file

        """
        if self.r1:
            self.r1.close()
        self.r1 = FileIO(fastq_r1, "r")
        if self.r2:
            self.r2.close()
        #both files are opened for reading ("r2" was not a valid mode)
        self.r2 = FileIO(fastq_r2, "r")

    def __iter__(self):
        return self

    def __quality__(self, phred, length):
        """
        compute the average base quality for each position in a list

        Arguments
        ----------
        phred: a list of cumulative phred scores
        length: a dict of read length and count

        Returns:
        ----------
        a list (same length as phred) of the average base quality at each
        position; positions not covered by any read are reported as 0.0

        """
        n_pos = len(phred)
        #number of reads that are long enough to cover each position
        covered = [0] * n_pos
        for read_len, count in length.items():
            for pos in range(min(read_len, n_pos)):
                covered[pos] += count
        return [
            phred[pos] / float(covered[pos]) if covered[pos] else 0.0
            for pos in range(n_pos)
        ]

    def quality(self):
        """
        compute the average base quality for each position in R1 and R2

        uses __quality__(...) on the statistics recorded while reading
        (only populated when qc is true)

        Returns:
        ----------
        a tuple of
        1. a list of the average base quality for each position in R1
        2. a list of the average base quality for each position in R2

        """
        r1_quality = self.__quality__(self.r1_phred, self.r1_length)
        r2_quality = self.__quality__(self.r2_phred, self.r2_length)
        return (r1_quality, r2_quality)

    @staticmethod
    def _read_nonblank(handle):
        """
        read the next non-empty line from handle

        Returns:
        ----------
        a tuple of the line (without trailing CR/LF) and the number of lines consumed

        Raises StopIteration at the end of the file.
        """
        consumed = 0
        while True:
            line = handle.readline()
            if not line:
                raise StopIteration
            consumed += 1
            line = line.rstrip("\r\n")
            if line:
                return line, consumed

    @staticmethod
    def _record_quality(phred_sum, length_count, quality):
        """
        add one read's quality string to the cumulative statistics (in place)
        """
        #grow the table for reads longer than seen so far
        #(assumes phred(...) yields one score per quality character - TODO confirm)
        missing = len(quality) - len(phred_sum)
        if missing > 0:
            phred_sum.extend([0] * missing)
        for i, q in enumerate(phred(quality)):
            phred_sum[i] += q
        length_count[len(quality)] = length_count.get(len(quality), 0) + 1

    def next(self):
        """
        the next function for iterating each read pair in the fastq files provided

        Reads in R1 and R2 should be in pair and in the same order. Blank lines
        are skipped. Line numbers in error messages are counted from the start
        of the current record, not from the start of the file.

        Returns:
        ----------
        a tuple of
        1. R1: a tuple of read name, read sequence, read quality
        2. R2: a tuple of read name, read sequence, read quality
        when qc is true each tuple carries a 4th element: the cumulative
        phred list of that read (r1_phred / r2_phred)

        Raises:
        ----------
        StopIteration: when either file is exhausted
        ValueError: when a name/info line is malformed or the sequence and
                    quality lengths differ

        """
        n_r1 = 0
        n_r2 = 0
        fields_r1 = []
        fields_r2 = []
        #a record is 4 non-blank lines: name, sequence, info, quality
        for field in range(4):
            line_r1, used = self._read_nonblank(self.r1)
            n_r1 += used
            line_r2, used = self._read_nonblank(self.r2)
            n_r2 += used
            if field == 0:
                #read name line
                if line_r1[0] != "@":
                    raise ValueError("Error: R1 read name %s @ line %d" %
                                     (line_r1, n_r1))
                if line_r2[0] != "@":
                    raise ValueError("Error: R2 read name %s @ line %d" %
                                     (line_r2, n_r2))
            elif field == 2:
                #info line
                if line_r1[0] != "+":
                    raise ValueError("Error: R1 info %s @ line %d" %
                                     (line_r1, n_r1))
                if line_r2[0] != "+":
                    raise ValueError("Error: R2 info %s @ line %d" %
                                     (line_r2, n_r2))
            fields_r1.append(line_r1)
            fields_r2.append(line_r2)

        name_r1, read_r1, _, quality_r1 = fields_r1
        name_r2, read_r2, _, quality_r2 = fields_r2
        if len(read_r1) != len(quality_r1):
            raise ValueError(
                "Error: R1 read length (%d) and quality length (%d) do not match"
                % (len(read_r1), len(quality_r1)))
        if len(read_r2) != len(quality_r2):
            raise ValueError(
                "Error: R2 read length (%d) and quality length (%d) do not match"
                % (len(read_r2), len(quality_r2)))

        if self.qc:
            #cumulative quality and length distribution in R1 and R2
            self._record_quality(self.r1_phred, self.r1_length, quality_r1)
            self._record_quality(self.r2_phred, self.r2_length, quality_r2)
            return ((name_r1, read_r1, quality_r1, self.r1_phred),
                    (name_r2, read_r2, quality_r2, self.r2_phred))
        return ((name_r1, read_r1, quality_r1),
                (name_r2, read_r2, quality_r2))

    #Python 3 iterator protocol; next() is kept for Python 2 callers
    __next__ = next
예제 #2
0
파일: pileup.py 프로젝트: mtstamp/stamp
def _load_amplicon_tables(probe_file, alignment_summary, mtdna_len, mtdna_offset):
	"""
	parse the probe file into per-position amplicon annotation tables
	
	Arguments
	----------
	probe_file: the path to the probe file (tab separated; 9 columns:
		name, chr, start, end, strand 1, strand 2, r1 probe, r2 probe, barcode length)
	alignment_summary: a dict mapping amplicon name to its read count
	mtdna_len: the length of the mtdna reference
	mtdna_offset: the position offset used to parse mtdna sites
	
	Returns
	----------
	a tuple of six lists indexed by mtdna position
	(amp_info, amp_cov, amp_r1_pos, amp_r2_pos, amp_r1_probe, amp_r2_probe)
	
	"""
	size = mtdna_len + mtdna_offset + 1
	amp_info = [[] for _ in range(size)] #amplicon covered at each position
	amp_cov = [0] * size #amplicons sequencing depth at each position
	amp_r1_pos = [mtdna_len] * size #relative position at read 1
	amp_r2_pos = [mtdna_len] * size #relative position at read 2
	amp_r1_probe = [mtdna_len] * size #relative position to the r1 probe
	amp_r2_probe = [mtdna_len] * size #relative position to the r2 probe
	
	with open(probe_file, "r") as fh:
		for line in fh:
			line = line.rstrip("\r\n")
			if not line:
				continue
			name, chrom, start, end, s1, s2, r1_probe, r2_probe, blen = line.split("\t")
			start = int(start)
			end = int(end)
			#length of the probes at the start (p1) and end (p2) of the amplicon
			if s1 == "+":
				p1 = len(r1_probe.strip())
				p2 = len(r2_probe.strip())
			else:
				p1 = len(r2_probe.strip())
				p2 = len(r1_probe.strip())
			#barcode length
			blen = int(blen)
			if chrom != "chrM":
				continue
			#number of amplicons in the QC+ bam file
			cov = alignment_summary.get(name, 0)
			if start < 0:
				#split the amplicon into halves in the D-loop region
				segments = [(mtdna_len + start, mtdna_len, p1, 0), (1, end, 0, p2)]
			else:
				segments = [(start, end, p1, p2)]
			for seg_start, seg_end, left, right in segments:
				#positions between the two probes
				body = range(seg_start + left, seg_end - right + 1)
				for i in body:
					amp_info[i].append("%s(%s)" % (name, s1))
					amp_cov[i] += cov
				if s1 == "+":
					for i in range(left):
						amp_r1_probe[i + seg_start] = left - i #position in R1 probe
					for i in body:
						amp_r1_pos[i] = min(amp_r1_pos[i], i - seg_start) #position in R1
						amp_r2_pos[i] = min(amp_r2_pos[i], seg_end + 1 - i + blen) #position in R2
					for i in range(right):
						amp_r2_probe[seg_end - i] = right - i #position in R2 probe
				else:
					for i in range(left):
						amp_r2_probe[i + seg_start] = left - i
					for i in body:
						amp_r2_pos[i] = min(amp_r2_pos[i], i - seg_start + blen)
						amp_r1_pos[i] = min(amp_r1_pos[i], seg_end + 1 - i)
					for i in range(right):
						amp_r1_probe[seg_end - i] = right - i
	return amp_info, amp_cov, amp_r1_pos, amp_r2_pos, amp_r1_probe, amp_r2_probe


def generate_pileup_file(sample_name, outpath, gzip_pileup, alignment_file, alignment_summary, probe_file, mtdna_refseq, qual_min, mtdna_offset):
	"""
	a function wrapper to generate the pileup file from the alignment file using samtools
	mtdna positions are corrected to those of rCRS
	
	Arguments
	----------
	sample_name: the name of the sample
	outpath: the path to store the output files
	gzip_pileup: compress the output file
	alignment_file: the processed alignment file returned from filter_alignment_file(...)
	alignment_summary: a dict of read summary returned from filter_alignment_file(...)
	probe_file: the path to the probe file
	mtdna_refseq: the mtdna reference sequence
	qual_min: the minimum quality score to output (used in samtools mpileup -Q )
	mtdna_offset: the position offset used to parse mtdna sites.
	
	Returns
	----------
	None
	
	Outputs
	----------
	${output}/${sample_name}.mtdna.consensus.adj.pileup(.gz): the resulting pileup file
	${output}/${sample_name}.coverage: the read coverage information for all mtdna sites (tsv file)  
	
	"""
	mtdna_len = 16569
	
	#amplicon information for each position in mtdna
	amp_info, amp_cov, amp_r1_pos, amp_r2_pos, amp_r1_probe, amp_r2_probe = _load_amplicon_tables(
		probe_file, alignment_summary, mtdna_len, mtdna_offset)
	
	def coverage_row(chrom, pos, pos_adj, ref, depth, qual):
		#one line of the .coverage table for the rCRS position pos_adj
		fields = [chrom, pos, pos_adj, ref, depth] + qual + [
			amp_cov[pos_adj], amp_r1_pos[pos_adj], amp_r2_pos[pos_adj],
			amp_r1_probe[pos_adj], amp_r2_probe[pos_adj], "|".join(amp_info[pos_adj])]
		return "\t".join(map(str, fields)) + "\n"
	
	if alignment_file.endswith(".bam"):
		alignment_file = alignment_file[:-4]
	
	#pileup reads using samtools
	#mapq >= 20 & baseq >= qual_min
	#NOTE(review): the sort step was disabled in the original code; presumably the bam is already position-sorted - confirm
	#NOTE(review): paths are interpolated unquoted into the command string; how pipe_output runs it is not visible here
	pf = pipe_output("%s mpileup -q 20 -Q %d -B -d 500000 -f %s %s.bam" % (samtools, qual_min, mtdna_refseq, alignment_file))
	
	out_coverage = None
	out_pileup = None
	try:
		#summarize site coverage
		out_coverage = open(os.path.join(outpath, sample_name + ".coverage"), "w")
		head = ["chr", "pos", "pos.adj", "ref", "depth", "Q0", "Q1", "Q2", "Q3", "Q4", "amps", "amp.r1.pos","amp.r2.pos","amp.r1.probe","amp.r2.probe","amp.info"]
		out_coverage.write("\t".join(head) + "\n")
		
		#trim and move the shifted reads to the correct rCRS positions
		out_name = os.path.join(outpath, sample_name + ".mtdna.consensus.adj.pileup")
		if gzip_pileup:
			out_name += ".gz"
		out_pileup = FileIO(out_name, "w", compresslevel=3)
		
		#temporarily store lines mapped to the first mtdna_offset bps of the shifted
		#reference, keyed by the rCRS position they belong to
		tmp_line = {}
		
		#iterate reads in the pileup file generated
		for line in pf.stdout:
			line = line.rstrip("\r\n")
			if not line:
				continue
			chrom, pos, ref, depth, r, q = line.split("\t")
			#group quals into <10, 10-19, 20-29, 30-39, >=40
			qual = [0, 0, 0, 0, 0]
			for score in phred(q):
				#floor division keeps the bin an int on both Python 2 and 3
				qual[min(int(score) // 10, 4)] += 1
			pos = int(pos)
			depth = int(depth)
			pos_adj = pos - mtdna_offset
			if pos_adj > 0:
				pending = tmp_line.pop(pos_adj, None)
				if pending:
					chrom1, ref1, depth1, qual1, r1, q1 = pending
					assert ref1 == ref, "the reference allele does not match at position %d" % pos_adj
					#pileup reads if they aligned to the same positions
					depth += int(depth1)
					#concatenate reads and read qualities
					r += r1
					q += q1
					#sum up quality stats
					qual = [a + b for a, b in zip(qual, qual1)]
				#output coverage and quality stats
				out_coverage.write(coverage_row(chrom, pos, pos_adj, ref, depth, qual))
				#output reads
				out_pileup.write("\t".join([chrom, str(pos_adj), ref, str(depth), r, q]) + "\n")
			else:
				#wrap around: this position belongs to the end of rCRS
				tmp_line[mtdna_len + pos_adj] = [chrom, ref, depth, qual, r, q]
		
		#output the stored lines that were never merged with a later line
		#NOTE(review): the "pos" column here is pos_adj - mtdna_len, not the original
		#pileup position used for the other rows - kept as in the original, confirm intent
		for pos_adj in sorted(tmp_line):
			chrom, ref, depth, qual, r, q = tmp_line[pos_adj]
			out_coverage.write(coverage_row(chrom, pos_adj - mtdna_len, pos_adj, ref, depth, qual))
			out_pileup.write("\t".join([chrom, str(pos_adj), ref, str(depth), r, q]) + "\n")
	finally:
		#close file handles even when parsing fails half-way
		pf.stdout.close()
		if out_pileup is not None:
			out_pileup.close()
		if out_coverage is not None:
			out_coverage.close()
예제 #3
0
파일: scan.py 프로젝트: mtstamp/stamp
class MTScan:
    """
    This is a class for processing a mpileup file

    Attributes
    ----------
    fh: file handle
    fh_to_close: true/false; true when fh was opened by this object
    sample: sample index in the mpileup file to extract (list)
    name: names of the samples (list)

    QC filters to call heteroplasmies
    base_quality and min_reads_rate are passed to readMpileup(...);
    the remaining ones are passed to MTSite.callAllele(...)
    base_quality: the minimum base quality
    min_reads_rate: the minimum rate of bases with BAQ >= base_quality
    min_depth: the minimum read depth
    min_depth_fwd: the minimum read depth on the forward strand
    min_depth_rev: the minimum read depth on the reverse strand
    min_minor_depth: the minimum read depth of the minor allele
    min_minor_depth_fwd: the minimum read depth of the minor allele on the forward strand
    min_minor_depth_rev: the minimum read depth of the minor allele on the reverse strand
    min_het_freq: the minimum fraction of the minor allele

    """
    def __init__(self,
                 pileup_file,
                 sample=0,
                 name="s",
                 base_quality=20,
                 min_reads_rate=0.5,
                 min_depth=10,
                 min_depth_fwd=1,
                 min_depth_rev=1,
                 min_minor_depth=1,
                 min_minor_depth_fwd=1,
                 min_minor_depth_rev=1,
                 min_het_freq=0.01):
        """
        the __init__ method

        Arguments
        ----------
        pileup_file: a path (opened and later closed by this object) or an
                     already open file handle (left open)
        sample: a sample index, a list of indices, or None for all names
        name: a sample name or a list of names (same length as sample)
        others: see class Attributes
        """
        if isinstance(pileup_file, str):
            self.fh = FileIO(pileup_file, "r")
            self.fh_to_close = True
        else:
            self.fh = pileup_file
            self.fh_to_close = False
        if isinstance(name, str):
            name = [name]
        if sample is None:
            sample = range(len(name))
        elif isinstance(sample, int):
            sample = [sample]
        assert len(sample) == len(
            name), "Sample and Name should be of same length."
        self.sample = sample
        self.name = name
        self.base_quality = base_quality
        self.min_reads_rate = min_reads_rate
        self.min_depth = min_depth
        self.min_depth_fwd = min_depth_fwd
        self.min_depth_rev = min_depth_rev
        self.min_minor_depth = min_minor_depth
        self.min_minor_depth_fwd = min_minor_depth_fwd
        self.min_minor_depth_rev = min_minor_depth_rev
        self.min_het_freq = min_het_freq

    def __del__(self):
        #close the file handle if this object opened it
        #fh_to_close does not exist when __init__ failed before setting it
        if getattr(self, "fh_to_close", False):
            self.fh.close()

    def _records(self):
        #parsed pileup records for the selected samples
        return readMpileup(self.fh, self.sample, self.name,
                           self.base_quality, self.min_reads_rate)

    def _blank(self):
        #one empty slot per selected sample
        return [None] * len(self.sample)

    def _call_site(self, chrom, pos, depth, ref, allele_count, ins_count,
                   del_count, rq):
        #build the MTSite for one record and determine its variant alleles
        site = MTSite(chrom, pos, depth, ref, allele_count, ins_count,
                      del_count, rq)
        site.callAllele(self.min_depth, self.min_depth_fwd,
                        self.min_depth_rev, self.min_minor_depth,
                        self.min_minor_depth_fwd, self.min_minor_depth_rev,
                        self.min_het_freq)
        return site

    def allSites(self, end=16570):
        """
        call mtDNA variants at all sites

        positions without any pileup record are reported with empty slots,
        starting at position 1 and stopping before end

        Arguments
        ----------
        end: the end position (exclusive)

        Returns:
        ----------
        an iterator of tuples containing
        1. lists of MTSite for samples indicated (None where a sample has no record)
        2. int position
        3. is a variant (true/false)
        """
        cur = 1
        is_var = False
        var = self._blank()
        #iterate all pileup lines
        for record in self._records():
            idx, name, chrom, pos, ref, depth, allele_count, ins_count, del_count, rq = record
            if pos > cur:
                #moved on to a new position: emit the finished one
                yield var, cur, is_var
                var = self._blank()
                is_var = False
                #output empty lines for the positions skipped in the pileup
                for p in range(cur + 1, pos):
                    yield var, p, is_var
                cur = pos
            site = self._call_site(chrom, pos, depth, ref, allele_count,
                                   ins_count, del_count, rq)
            #determine variant status
            if site.is_heteroplasmy or site.is_substitution:
                is_var = True
            #temporarily store this variant
            var[idx] = site
        yield var, cur, is_var
        #output empty lines for the remaining sites
        var = self._blank()
        for p in range(cur + 1, end):
            yield var, p, False

    def varSites(self):
        """
        call mtDNA variants at only variant sites
        same as allSites(...) but only return sites where variants are found
        (takes no arguments; positions are reported as they appear in the pileup)

        Returns:
        ----------
        an iterator of tuples containing
        1. lists of MTSite for samples indicated
        2. int position
        """
        cur = -1
        is_var = False
        var = self._blank()
        for record in self._records():
            idx, name, chrom, pos, ref, depth, allele_count, ins_count, del_count, rq = record
            if cur == -1:
                #first record
                cur = pos
            if pos != cur:
                if is_var:
                    #return when it is a variant
                    yield var, cur
                var = self._blank()
                is_var = False
                cur = pos
            site = self._call_site(chrom, pos, depth, ref, allele_count,
                                   ins_count, del_count, rq)
            #determine variant status
            if site.is_heteroplasmy or site.is_substitution:
                is_var = True
            #temporarily store this variant
            var[idx] = site
        if is_var:
            yield var, cur

    def reset(self):
        #set the current position in the file handle to the beginning of the file
        self.fh.seek(0)