def getQualityData(self, inputFname, read_sampling_rate=0.05, quality_score_format='Sanger'):
    """
    Collect Phred-score statistics from a random subset of the reads in a FASTQ file.

    The file is consumed as 4-line records: a header line starting with '@',
    the bases, a separator line (ignored) and the quality string. One
    random.random() draw is made per read, and the read is used when the
    draw is <= read_sampling_rate.

    Arguments:
        inputFname: path of the FASTQ file. A '.gz' suffix means it is
            gzip-compressed.
        read_sampling_rate: threshold the per-read random draw is compared to.
        quality_score_format: 'Illumina1.3' (or 'Illumina', the spelling
            readFilter() checks for) sends every quality character through
            utils.getPhredScoreOutOfSolexaScore(). Any other value is
            decoded as ord(character)-33.

    Returns:
        PassingData with these attributes
            quality_ls_per_position: list indexed by position in the read,
                each item is the list of Phred scores seen at that position.
            quality_ls: all Phred scores pooled in one list.
            no_of_bases_per_position: per-position count of bases that are not 'N'.
            diNuc2quality_ls: dinucleotide (neither base is 'N') -> list of
                the Phred scores of its first base.
            diNuc2count: dinucleotide -> number of occurrences.

    Raises:
        ValueError: the file ends in the middle of a 4-line record.

    2011-8-15
    """
    sys.stderr.write("Getting base quality data from %s ...\n"%(inputFname))
    quality_ls_per_position = []
    quality_ls = []
    no_of_bases_per_position = []
    diNuc2count = {}
    diNuc2quality_ls = {}

    # both spellings occur in this file; treat them as the same format
    useSolexaConversion = quality_score_format in ('Illumina1.3', 'Illumina')

    def toText(line):
        # gzip handles opened with 'rb' yield bytes under Python 3.
        # latin-1 maps every byte to the character with the same ordinal,
        # so ord() on a quality character is unaffected.
        if isinstance(line, str):
            return line
        return line.decode('latin-1')

    fname_suffix = os.path.splitext(inputFname)[1]
    if fname_suffix=='.gz':    #the input file is gzipped.
        import gzip
        inf = gzip.open(inputFname, 'rb')
    else:
        inf = open(inputFname, 'r')

    counter = 0
    real_counter = 0
    try:
        for line in inf:
            line = toText(line)
            if line[0]!='@':    #not the start of a new read
                continue
            counter += 1
            # draw before reading the rest of the record: one draw per read
            coin_toss = random.random()
            try:
                base_string = toText(next(inf)).strip()
                next(inf)    #separator line
                quality_string = toText(next(inf)).strip()
            except StopIteration:
                raise ValueError("Truncated FASTQ record (read no. %s) in %s."%(counter, inputFname))

            if coin_toss<=read_sampling_rate:
                real_counter += 1
                read_length = len(base_string)
                # extend the per-position lists to house a longer read
                while len(quality_ls_per_position)<read_length:
                    quality_ls_per_position.append([])
                    no_of_bases_per_position.append(0)

                for i in range(read_length):
                    base = base_string[i]
                    base_quality = quality_string[i]
                    if useSolexaConversion:
                        phredScore = utils.getPhredScoreOutOfSolexaScore(base_quality)
                    else:
                        phredScore = ord(base_quality)-33
                    quality_ls_per_position[i].append(phredScore)
                    quality_ls.append(phredScore)
                    if base!='N':
                        no_of_bases_per_position[i] += 1
                        if i<read_length-1:
                            nextBase = base_string[i+1]
                            if nextBase!='N':
                                diNuc = base + nextBase
                                if diNuc not in diNuc2quality_ls:
                                    diNuc2quality_ls[diNuc] = []
                                    diNuc2count[diNuc] = 0
                                diNuc2quality_ls[diNuc].append(phredScore)
                                diNuc2count[diNuc] += 1

            if counter%5000==0 and self.report:
                sys.stderr.write("%s%s\t%s"%('\x08'*80, real_counter, counter))
    finally:
        # close the handle even when parsing fails
        inf.close()

    sys.stderr.write("%s/%s reads selected. Done.\n"%(real_counter, counter))
    return PassingData(quality_ls_per_position=quality_ls_per_position, quality_ls=quality_ls, \
        no_of_bases_per_position=no_of_bases_per_position, diNuc2quality_ls=diNuc2quality_ls, \
        diNuc2count=diNuc2count)
def readFilter(self, readData, pmData=None):
    """
    Quality-trim and mask one read; return the surviving read, or None if it is rejected.

    0. decode the quality string into Phred scores and smooth them: the
       smoothed score of a base is the median of the raw scores within
       halfWindowSize bases on either side (window clipped at the read ends).
    1. head trimming: drop leading bases whose smoothed score is < minValidPhredScore.
    2. tail trimming: drop trailing bases whose smoothed score is < minValidPhredScore.
    3. determine if the read after trimming is still long enough (>=minFinalReadLength).
    4. convert every remaining base whose own (raw) Phred score is
       < minValidPhredScore to N.
    5. accept the read only if (number of bases converted in step 4)/(trimmed
       length) is <=maxNPercentage. Bases that were already 'N' in the input
       are not counted.

    Arguments:
        readData: object with attributes base_string and quality_string.
        pmData: object with attributes quality_score_format,
            minValidPhredScore, minFinalReadLength, maxNPercentage and,
            optionally, halfWindowSize (default 1). Despite the None default
            it is required; None fails with AttributeError.
            quality_score_format 'Illumina' (or 'Illumina1.3', the spelling
            getQualityData() checks for) sends every quality character
            through utils.getPhredScoreOutOfSolexaScore(). Any other value
            is decoded as ord(character)-33.

    Returns:
        PassingData(base_string=..., quality_string=...) holding the trimmed
        and masked bases and the matching slice of the original quality
        string, or None when the read fails step 3 or 5, or when no base
        has a smoothed score >= minValidPhredScore (this covers the empty
        read).

    2011-8-17
    """
    base_string = readData.base_string
    quality_string = readData.quality_string
    halfWindowSize = getattr(pmData, 'halfWindowSize', 1)
    read_length = len(base_string)

    # both spellings occur in this file; treat them as the same format
    useSolexaConversion = pmData.quality_score_format in ('Illumina', 'Illumina1.3')
    phred_score_ls = []
    for i in range(read_length):
        base_quality = quality_string[i]
        if useSolexaConversion:
            phredScore = utils.getPhredScoreOutOfSolexaScore(base_quality)
        else:
            phredScore = ord(base_quality)-33
        phred_score_ls.append(phredScore)

    # running median, window clipped at both ends of the read
    smooth_phred_score_ls = []
    for i in range(read_length):
        smooth_start_index = max(i-halfWindowSize, 0)
        smooth_stop_index = min(i+halfWindowSize+1, read_length)
        smooth_quality = numpy.median(phred_score_ls[smooth_start_index:smooth_stop_index])
        smooth_phred_score_ls.append(smooth_quality)

    minValidPhredScore = pmData.minValidPhredScore
    goodIndexList = [i for i in range(read_length) \
        if smooth_phred_score_ls[i]>=minValidPhredScore]
    if not goodIndexList:
        # nothing survives trimming; also avoids a division by zero below
        return None
    # keep everything from the first to the last base with a passing smoothed score
    keepStart = goodIndexList[0]
    keepStop = goodIndexList[-1]+1

    filtered_read_length = keepStop-keepStart
    if filtered_read_length<pmData.minFinalReadLength:
        return None

    filtered_base_ls = []
    no_of_Ns = 0
    for i in range(keepStart, keepStop):
        # masking looks at the raw score of the base, not the smoothed one
        if phred_score_ls[i]<minValidPhredScore:
            filtered_base_ls.append('N')
            no_of_Ns += 1
        else:
            filtered_base_ls.append(base_string[i])

    N_percentage = float(no_of_Ns)/filtered_read_length
    if N_percentage>pmData.maxNPercentage:
        return None
    return PassingData(base_string=''.join(filtered_base_ls), \
        quality_string=quality_string[keepStart:keepStop])