def get_fisher_pvalue(self, base):
    """Return the capped -log10 two-sided Fisher exact p-value for *base*.

    The 2x2 table passed to ``fisher`` is
    ``((tumor_ref, ctrl_ref), (tumor_base, ctrl_base))``, where the counts
    come from ``self.get_tumor_base_total`` / ``self.get_ctrl_base_total``
    for ``self.ref`` and for *base*.

    Returns:
        float: ``60.0`` when the p-value is below ``10**-60``, ``0.0`` when
        it is above ``1 - 10**-10``, otherwise ``-log10(p)``.
    """
    ref_row = (int(self.get_tumor_base_total(self.ref)),
               int(self.get_ctrl_base_total(self.ref)))
    base_row = (int(self.get_tumor_base_total(base)),
                int(self.get_ctrl_base_total(base)))
    _, p_value = fisher((ref_row, base_row), alternative='two-sided')

    # Guard clauses: clamp the two extremes before taking the logarithm.
    if p_value < 10 ** (-60):
        return float(60.0)
    if p_value > 1.0 - 10 ** (-10):
        return float(0.0)
    return -math.log(p_value, 10)
def get_fisher_pvalue(self, base):
    """Compute a -log10 score from a two-sided Fisher exact test.

    For the reference allele (``self.ref``) and then for *base*, the tumor
    and control totals are read through ``self.get_tumor_base_total`` and
    ``self.get_ctrl_base_total``; the two (tumor, control) rows form the
    2x2 table handed to ``fisher``.

    Returns:
        float: ``-log10(p)``, clamped to ``60.0`` for p-values below
        ``10**-60`` and to ``0.0`` for p-values above ``1 - 10**-10``.
    """
    contingency = tuple(
        (int(self.get_tumor_base_total(allele)),
         int(self.get_ctrl_base_total(allele)))
        for allele in (self.ref, base)
    )
    odds_ratio, fisher_pvalue = fisher(contingency, alternative='two-sided')

    smallest_reported = 10 ** (-60)
    largest_reported = 1.0 - 10 ** (-10)

    if fisher_pvalue < smallest_reported:
        score = float(60.0)
    elif fisher_pvalue > largest_reported:
        score = float(0.0)
    else:
        score = -math.log(fisher_pvalue, 10)
    return score
def filter(self, in_tumor_bam, in_normal_bam, output, in_mutation_file):
    """Re-align reads around each candidate mutation and write filtered rows.

    Each line of *in_mutation_file* is a tab-separated candidate whose first
    five fields are chromosome, start, end, ref and alt (annovar style; the
    start is converted to zero-based by subtracting 1).  For every candidate
    the reads covering the region are extracted, aligned with the command in
    ``self.blat_cmds`` against a two-sequence (ref/alt) FASTA, and the
    ref/alt/other read counts are appended to the line.

    Modes:
        * tumor and normal BAM given: counts for both samples plus a
          -log10 Fisher p-value column; a row is written when the tumor alt
          count is >= ``self.tumor_min_mismatch`` and the normal alt count
          is <= ``self.normal_max_mismatch`` (uncounted samples pass).
        * only tumor BAM given: tumor counts plus the 0.1 quantile, the
          posterior-mean style ratio and the 0.9 quantile of
          Beta(alt + 1, ref + 1); a row is written when the tumor alt count
          is >= ``self.tumor_min_mismatch`` (or was not counted).
        * no tumor BAM: an empty *output* file is produced.

    A sample whose read count in the region is >= ``self.max_depth`` is not
    re-aligned and its columns stay ``'---'``.

    Args:
        in_tumor_bam: path of the tumor BAM file (falsy to disable).
        in_normal_bam: path of the normal BAM file (falsy to disable).
        output: path of the result file; also the prefix of the temporary
            ``.tmp.refalt.fa`` / ``.tmp.fa`` / ``.tmp.psl`` files.
        in_mutation_file: path of the candidate mutation list.

    Raises:
        subprocess.CalledProcessError: when the blat command fails.  All
        file handles are closed and the temporary files are removed before
        the exception propagates.
    """
    ref_alt_fasta = output + ".tmp.refalt.fa"
    reads_fasta = output + ".tmp.fa"
    blat_psl = output + ".tmp.psl"

    def realign(samfile, chrom, start, end):
        # Extract the short reads around the candidate, align them to the
        # reference and alternative sequences, and summarize the alignment.
        self.extractRead(samfile, chrom, start, end, reads_fasta)
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call(
                self.blat_cmds + [ref_alt_fasta, reads_fasta, blat_psl],
                stdout=devnull, stderr=subprocess.STDOUT)
        return self.summarizeRefAlt(blat_psl)

    def parse_candidate(line):
        # annovar input file (not zero-based number)
        itemlist = line.split('\t')
        return (itemlist[0], int(itemlist[1]) - 1, int(itemlist[2]),
                itemlist[3], itemlist[4])

    def write_row(handle, line, values):
        handle.write("\t".join([line] + [str(v) for v in values]) + "\n")

    tumor_samfile = None
    normal_samfile = None
    try:
        with open(in_mutation_file, 'r') as srcfile, \
                open(output, 'w') as hResult:

            if in_tumor_bam and in_normal_bam:
                tumor_samfile = pysam.Samfile(in_tumor_bam, "rb")
                normal_samfile = pysam.Samfile(in_normal_bam, "rb")

                if self.header_flag:
                    header = srcfile.readline().rstrip('\n')
                    # NOTE(review): six header names are written here but
                    # each data row carries seven extra values (the Fisher
                    # column has no header) -- kept as in the original.
                    newheader = ("RefNum_tumor\tAltNum_tumor\tOtherNum_tumor"
                                 + "\tRefNum_normal\tAltNum_normal\tOtherNum_normal")
                    hResult.write(header + "\t" + newheader + "\n")

                for line in srcfile:
                    line = line.rstrip()
                    chrom, start, end, ref, alt = parse_candidate(line)

                    tumor_ref = tumor_alt = tumor_other = '---'
                    normal_ref = normal_alt = normal_other = '---'
                    log10_fisher_pvalue = '---'

                    self.makeTwoReference(chrom, start, end, ref, alt,
                                          ref_alt_fasta)

                    if tumor_samfile.count(chrom, start, end) < self.max_depth:
                        tumor_ref, tumor_alt, tumor_other = realign(
                            tumor_samfile, chrom, start, end)

                    if normal_samfile.count(chrom, start, end) < self.max_depth:
                        normal_ref, normal_alt, normal_other = realign(
                            normal_samfile, chrom, start, end)

                    if (tumor_ref != '---' and tumor_alt != '---'
                            and normal_ref != '---' and normal_alt != '---'):
                        odds_ratio, fisher_pvalue = fisher(
                            ((int(tumor_ref), int(normal_ref)),
                             (int(tumor_alt), int(normal_alt))),
                            alternative='two-sided')
                        log10_fisher_pvalue = '{0:.3f}'.format(
                            float(self.math_log_fisher_pvalue(fisher_pvalue)))

                    if ((tumor_alt == '---'
                         or tumor_alt >= self.tumor_min_mismatch)
                            and (normal_alt == '---'
                                 or normal_alt <= self.normal_max_mismatch)):
                        write_row(hResult, line,
                                  (tumor_ref, tumor_alt, tumor_other,
                                   normal_ref, normal_alt, normal_other,
                                   log10_fisher_pvalue))

            elif in_tumor_bam:
                tumor_samfile = pysam.Samfile(in_tumor_bam, "rb")

                if self.header_flag:
                    header = srcfile.readline().rstrip('\n')
                    newheader = ("RefNum_tumor\tAltNum_tumor\tOtherNum_tumor\t0.1\tratio\t0.9")
                    hResult.write(header + "\t" + newheader + "\n")

                for line in srcfile:
                    line = line.rstrip()
                    chrom, start, end, ref, alt = parse_candidate(line)

                    tumor_ref = tumor_alt = tumor_other = '---'
                    beta_01 = beta_mid = beta_09 = '---'

                    if tumor_samfile.count(chrom, start, end) < self.max_depth:
                        self.makeTwoReference(chrom, start, end, ref, alt,
                                              ref_alt_fasta)
                        tumor_ref, tumor_alt, tumor_other = realign(
                            tumor_samfile, chrom, start, end)

                        # betaincinv is the supported replacement for the
                        # removed scipy.special.btdtri (same arguments).
                        alt_count = int(tumor_alt)
                        ref_count = int(tumor_ref)
                        beta_01 = '{0:.3f}'.format(float(
                            scipy.special.betaincinv(
                                alt_count + 1, ref_count + 1, 0.1)))
                        beta_mid = '{0:.3f}'.format(
                            float(alt_count + 1)
                            / float(ref_count + alt_count + 2))
                        beta_09 = '{0:.3f}'.format(float(
                            scipy.special.betaincinv(
                                alt_count + 1, ref_count + 1, 0.9)))

                    if (tumor_alt == '---'
                            or tumor_alt >= self.tumor_min_mismatch):
                        write_row(hResult, line,
                                  (tumor_ref, tumor_alt, tumor_other,
                                   beta_01, beta_mid, beta_09))
    finally:
        # Release the BAM handles and remove the scratch files even when a
        # helper or the blat call raised.
        for samfile in (tumor_samfile, normal_samfile):
            if samfile is not None:
                samfile.close()
        for tmp_path in (ref_alt_fasta, reads_fasta, blat_psl):
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
def Pileup_out(mpileup, w, min_depth, min_variant_read, compare):
    """Parse one mpileup line into per-sample base/indel statistics.

    mpileup columns (tab separated)::

        0 chromosome
        1 1-based coordinate
        2 reference base
        3 the number of reads covering the site (1)
        4 read bases (1)
        5 base qualities (1)
        6 the number of reads covering the site (2)
        7 read bases (2)
        8 base qualities (2)

    The function reads the module-level names ``target`` (regex whose groups
    are: 1 = '+'/'-', 2 = indel length, 3 = nucleotides), ``remove_chr``
    (regex substituted away before counting; presumably the '^.' read-start
    marker -- confirm at its definition) and ``filter_quals`` (quality
    characters whose bases are excluded from the SNV counts).

    Args:
        mpileup: one mpileup line, as bytes or text.
        w: not used by this function.
        min_depth: minimum depth for a sample to be counted.
        min_variant_read: minimum number of non-reference observations in
            sample 1 required to keep the site.
        compare: when true, columns 6-8 are processed as a second sample
            and Fisher exact tests between the two samples are computed.

    Returns:
        ``None`` when the site is skipped (zero depth, too few variant
        reads, reference base not in 'ACGTN', or base/quality length
        mismatch); otherwise a list indexed by the ``const.POS_*``
        constants::

            const.POS_CHR = 0, const.POS_COORD = 1, const.POS_REF = 2,
            const.POS_DATA1 = 3, const.POS_DATA2 = 4,
            const.POS_FISHER_SNV = 5, const.POS_FISHER_INS = 6,
            const.POS_FISHER_DEL = 7, const.POS_COUNT = 8
    """
    #
    # Prepare mpileup data (bytes are decoded; text is used as-is)
    #
    if isinstance(mpileup, bytes) and not isinstance(mpileup, str):
        pileup_line = mpileup.decode()
    else:
        pileup_line = mpileup
    mp_list = pileup_line.strip('\n').split('\t')
    mp_list_len = len(mp_list)
    ref_base_U = mp_list[2].upper()
    coordinate = mp_list[0:3]

    #
    # skip if depth is 0
    #
    if mp_list[3] == '0' or (mp_list_len > 6 and mp_list[6] == '0'):
        return None

    ref_base_count = mp_list[4].count('.') + mp_list[4].count(',')
    ins_base_count = mp_list[4].count('+')
    del_base_count = mp_list[4].count('-')
    if (int(mp_list[3]) - ref_base_count + ins_base_count
            + del_base_count) < min_variant_read:
        return None

    if ref_base_U not in 'ACGTN':
        return None

    def new_sample():
        # Fresh per-sample record; 'indel' auto-creates nested levels.
        return {
            'mis_base': ref_base_U,
            'mis_rate': 0,
            'proper_read_depth': 0,
            'proper_read_depth_plus': 0,
            'proper_read_depth_minus': 0,
            'proper_read_depth_indel': 0,
            'proper_read_depth_indel_plus': 0,
            'proper_read_depth_indel_minus': 0,
            'indel': util.AutoVivification(),
        }

    data_pair = [
        mp_list[0],
        int(mp_list[1]),
        mp_list[2],
        new_sample(),
        new_sample(),
        1.0,
        'N:1.0',
        'N:1.0',
        0,
    ]

    #
    # Loop for 2 bam file case
    #
    if compare:
        data_pair[const.POS_COUNT] = 2
        input_list = [
            (const.POS_DATA1, mp_list[3], mp_list[4], mp_list[5]),
            (const.POS_DATA2, mp_list[6], mp_list[7], mp_list[8]),
        ]
    else:
        data_pair[const.POS_COUNT] = 1
        input_list = [
            (const.POS_DATA1, mp_list[3], mp_list[4], mp_list[5]),
        ]

    #
    # position id, depth, read bases, base qualities
    #
    for data_id, depth, read_bases, qual_list in input_list:
        sample = data_pair[data_id]
        indel = util.AutoVivification()

        #
        # Look for deletion/insertion, record it in 'indel' and cut the
        # indel token out of read_bases.  The matches come from the
        # original string, so 'deleted' tracks how much was already cut.
        #
        deleted = 0
        for m in target.finditer(read_bases):
            site = m.start()
            indel_type = m.group(1)
            num = m.group(2)
            bases = m.group(3)[0:int(num)]

            # lower-case nucleotides mean the minus strand
            if bases.islower():
                strand = ('-', '+')
            else:
                strand = ('+', '-')

            key = '\t'.join(coordinate + [bases.upper()])
            if indel_type in indel and key in indel[indel_type]:
                indel[indel_type][key][strand[0]] += 1
            else:
                indel[indel_type][key][strand[0]] = 1
                indel[indel_type][key][strand[1]] = 0

            read_bases = (read_bases[0:site - deleted]
                          + read_bases[site + int(num) + len(num) + 1 - deleted:])
            deleted += 1 + len(num) + int(num)

        #
        # Remove '^.' and '$'
        #
        read_bases = remove_chr.sub('', read_bases)
        read_bases = read_bases.replace('$', '')

        #
        # Error check
        #
        if len(read_bases) != len(qual_list):
            logging.error("mpileup data is not good: {0}, {1}".format(
                mpileup, read_bases))
            return None

        #
        # Count mismatch
        #
        mis_base_U = None
        if int(depth) >= min_depth:
            read_bases = read_bases.replace('.', ref_base_U)
            read_bases = read_bases.replace(',', ref_base_U.lower())

            base_num = {
                "total_A": 0, "total_C": 0, "total_G": 0, "total_T": 0,
                "total_N": 0,
                "A": 0, "C": 0, "G": 0, "T": 0, "N": 0,
                "a": 0, "c": 0, "g": 0, "t": 0, "n": 0,
            }

            #
            # Set data
            #
            sample['bases'] = read_bases
            sample['depth'] = int(depth)

            #
            # Count number
            #
            for nuc, qual in zip(read_bases, qual_list):
                if nuc in 'ATGCNacgtn':
                    sample['proper_read_depth_indel'] += 1
                if nuc in 'ATGCN':
                    sample['proper_read_depth_indel_plus'] += 1
                if nuc in 'acgtn':
                    sample['proper_read_depth_indel_minus'] += 1
                if nuc in 'ATGCNacgtn' and not (qual in filter_quals):
                    base_num[nuc] += 1
                    base_num['total_' + nuc.upper()] += 1
                if nuc in 'ATGCatgc' and not (qual in filter_quals):
                    sample['proper_read_depth'] += 1
                if nuc in 'ATGC' and not (qual in filter_quals):
                    sample['proper_read_depth_plus'] += 1
                if nuc in 'atgc' and not (qual in filter_quals):
                    sample['proper_read_depth_minus'] += 1

            #
            # InsDel
            # Beta distribution
            #
            for indel_type in ('+', '-'):
                if indel_type in indel:
                    for key in indel[indel_type].keys():
                        bases = key.split('\t')[3]
                        counts = indel[indel_type][key]
                        entry = sample['indel'][indel_type][bases]
                        entry['+'] = counts['+']
                        entry['-'] = counts['-']
                        indel_number = entry['both'] = (
                            counts['-'] + counts['+'])
                        depth_indel = sample['proper_read_depth_indel']
                        # betaincinv replaces the removed
                        # scipy.special.btdtri (same argument order).
                        entry['0.1'] = scipy.special.betaincinv(
                            indel_number + 1,
                            float(depth_indel) - indel_number + 1,
                            0.1)
                        entry['mid'] = (
                            (indel_number + 1) / (float(depth_indel) + 2))
                        entry['0.9'] = scipy.special.betaincinv(
                            indel_number + 1,
                            int(depth_indel) - indel_number + 1,
                            0.9)
                        entry['s_ratio'] = (
                            float(counts['+']) / entry['both'])

            #
            # skip if reference is 'N'
            #
            if (ref_base_U != 'N'
                    and int(sample['proper_read_depth']) >= min_depth):
                ref_num = base_num['total_' + ref_base_U]

                # The most frequent non-reference base is the mismatch base.
                mis_num = 0
                for nuc in ('A', 'C', 'G', 'T'):
                    sample[nuc] = base_num[nuc]
                    lower_nuc = nuc.lower()
                    sample[lower_nuc] = base_num[lower_nuc]
                    total_key = 'total_' + nuc
                    sample[total_key] = base_num[total_key]
                    if nuc != ref_base_U:
                        if base_num[total_key] > mis_num:
                            mis_num = base_num[total_key]
                            mis_base_U = nuc

                # Sample 2 is always evaluated on sample 1's mismatch base.
                if (data_id == const.POS_DATA2
                        and data_pair[const.POS_DATA1]['mis_base']):
                    mis_num = base_num[
                        'total_' + data_pair[const.POS_DATA1]['mis_base']]
                    mis_base_U = data_pair[const.POS_DATA1]['mis_base']

                #
                # Calculate ratio
                #
                sample['mis_rate'] = mis_num / float(
                    sample['proper_read_depth'])
                sample['mis_base'] = mis_base_U
                if mis_base_U and (base_num[mis_base_U]
                                   + base_num[mis_base_U.lower()]) > 0:
                    sample['s_ratio'] = float(base_num[mis_base_U]) / (
                        base_num[mis_base_U] + base_num[mis_base_U.lower()])

                #
                # Beta distribution for SNV
                #
                sample['0.1'] = scipy.special.betaincinv(
                    mis_num + 1, ref_num + 1, 0.1)
                sample['mid'] = (mis_num + 1) / float(ref_num + mis_num + 2)
                sample['0.9'] = scipy.special.betaincinv(
                    mis_num + 1, ref_num + 1, 0.9)
                sample['mis_num'] = mis_num

    data1 = data_pair[const.POS_DATA1]
    data2 = data_pair[const.POS_DATA2]

    #
    # Fisher
    #
    # SNV
    #
    if (data_pair[const.POS_COUNT] == 2
            and ref_base_U
            and data1['mis_base']
            and 'mid' in data1
            and 'mid' in data2
            and 'proper_read_depth' in data1
            and 'proper_read_depth' in data2):
        mis_key = 'total_' + data1['mis_base']
        ref_key = 'total_' + ref_base_U
        odds_ratio, fisher_pvalue = fisher(
            ((int(data1[ref_key]), int(data2[ref_key])),
             (int(data1[mis_key]), int(data2[mis_key]))),
            alternative='two-sided')
        data_pair[const.POS_FISHER_SNV] = math_log_fisher_pvalue(fisher_pvalue)

    #
    # INDEL
    #
    if data_pair[const.POS_COUNT] == 2 and 'indel' in data1:
        for indel_type in data1['indel']:
            for bases in data1['indel'][indel_type].keys():
                # Reset per indel so a skipped test can never report the
                # p-value computed for a previous indel.
                fisher_pvalue = None

                # Indel absent from sample 2: give it explicit zero counts.
                if not isinstance(
                        data2['indel'][indel_type][bases]['both'], int):
                    data2['indel'][indel_type][bases]['both'] = 0
                    data2['indel'][indel_type][bases]['+'] = 0
                    data2['indel'][indel_type][bases]['-'] = 0

                both1 = data1['indel'][indel_type][bases]['both']
                both2 = data2['indel'][indel_type][bases]['both']
                depth1 = data1['proper_read_depth_indel']
                depth2 = data2['proper_read_depth_indel']

                if depth2 >= both2 and depth1 >= both1:
                    odds_ratio, fisher_pvalue = fisher(
                        ((depth1 - both1, both1),
                         (depth2 - both2, both2)),
                        alternative='two-sided')

                if fisher_pvalue is None:
                    continue

                if indel_type == '+':
                    fisher_pos = const.POS_FISHER_INS
                elif indel_type == '-':
                    fisher_pos = const.POS_FISHER_DEL
                else:
                    continue

                formatted = bases + ':' + str(
                    math_log_fisher_pvalue(fisher_pvalue))
                if data_pair[fisher_pos] == 'N:1.0':
                    data_pair[fisher_pos] = formatted
                else:
                    data_pair[fisher_pos] += ',' + formatted

    return data_pair
def fsometests(sumotudict, ftable, iddict, sumalldict, otu_list, whattest="man-y"):
    """Compute one p-value per OTU with the selected statistical test.

    Args:
        sumotudict: ordered mapping ``otu -> [count_group1, count_group2]``.
        ftable: table object exposing ``get_value_by_ids(otu, sample_id)``.
        iddict: ordered mapping whose first two values are the sample-id
            lists of group 1 and group 2.
        sumalldict: ordered mapping whose first two values are the totals
            of group 1 and group 2.
        otu_list: OTU ids to test (used by every test except ``"chi2"``,
            which iterates over *sumotudict*).
        whattest: one of

            * ``"chi2"``    -- ``fisher`` on the 2x2 table when either
              group count is <= 5, otherwise ``chisq`` with
              ``lambda_="log-likelihood"``;
            * ``"ttest"``   -- ``ttest`` on per-sample values divided by
              the group total (a NaN p-value is reported as 1);
            * ``"man-y"``   -- ``man`` on the same values;
            * ``"kruscal"`` -- ``wilc`` on the same values.

    Returns:
        OrderedDict: ``otu -> p-value`` in iteration order.

    Raises:
        ValueError: if *whattest* is not one of the names above.

    Progress is written to ``sys.stdout`` on a single, carriage-return
    refreshed line.
    """
    # list() keeps indexing and re-use working on Python 3, where
    # dict.values() and map() return a view / one-shot iterator.
    group_totals = list(sumalldict.values())

    otherdict = OrderedDict()
    for otu, otu_sums in sumotudict.items():
        otherdict[otu] = list(map(operator.sub, group_totals, otu_sums))

    leno = len(otu_list)

    # Loop invariants: sample ids and totals of the two groups.
    group_ids = list(iddict.values())
    summm_1 = group_totals[0]
    summm_2 = group_totals[1]

    # otu -> [per-sample values / total of group 1, same for group 2]
    # NOTE(review): under Python 2 this is integer division if both the
    # table values and the totals are ints -- confirm they are floats.
    otuwsa_od = OrderedDict()
    for otu in otu_list:
        dpart1 = [ftable.get_value_by_ids(otu, sid) / summm_1
                  for sid in group_ids[0]]
        dpart2 = [ftable.get_value_by_ids(otu, sid) / summm_2
                  for sid in group_ids[1]]
        otuwsa_od[otu] = [dpart1, dpart2]

    def report(label, done):
        sys.stdout.write('\r')
        sys.stdout.write("{} {}/{}".format(label, done, leno))
        sys.stdout.flush()

    pdict = OrderedDict()

    if whattest == "chi2":
        for done, (otu, otu_sums) in enumerate(sumotudict.items(), 1):
            table = np.array([otu_sums, otherdict.get(otu)])
            # Small counts: exact test instead of the chi-square statistic.
            if otu_sums[1] <= 5 or otu_sums[0] <= 5:
                pdict[otu] = fisher(table)[1]
            else:
                pdict[otu] = chisq(table, lambda_="log-likelihood")[1]
            report("fisher", done)

    elif whattest in ("ttest", "man-y", "kruscal"):
        # Resolve the test function lazily so only the selected helper
        # has to exist in the module.
        if whattest == "ttest":
            test_func, label = ttest, "ttest"
        elif whattest == "man-y":
            test_func, label = man, "man-y"
        else:
            test_func, label = wilc, "kruskal"

        for done, otu in enumerate(otu_list, 1):
            group_a, group_b = otuwsa_od.get(otu)[0], otuwsa_od.get(otu)[1]
            p = test_func(group_a, group_b)[1]
            # NaN is the only value not equal to itself.
            if whattest == "ttest" and p != p:
                p = 1
            pdict[otu] = p
            report(label, done)

    else:
        raise ValueError(
            "unknown test {!r}; expected one of 'chi2', 'ttest', "
            "'man-y', 'kruscal'".format(whattest))

    sys.stdout.write('\r')
    return pdict
def Pileup_out(mpileup, w, min_depth, min_variant_read, compare):
    """Parse one mpileup line into per-sample base/indel statistics.

    mpileup columns (tab separated)::

        0 chromosome
        1 1-based coordinate
        2 reference base
        3 the number of reads covering the site (1)
        4 read bases (1)
        5 base qualities (1)
        6 the number of reads covering the site (2)
        7 read bases (2)
        8 base qualities (2)

    The function reads the module-level names ``target`` (regex whose groups
    are: 1 = '+'/'-', 2 = indel length, 3 = nucleotides), ``remove_chr``
    (regex substituted away before counting; presumably the '^.' read-start
    marker -- confirm at its definition) and ``filter_quals`` (quality
    characters whose bases are excluded from the SNV counts).

    Args:
        mpileup: one mpileup line, as bytes or text.
        w: not used by this function.
        min_depth: minimum depth for a sample to be counted.
        min_variant_read: minimum number of non-reference observations in
            sample 1 required to keep the site.
        compare: when true, columns 6-8 are processed as a second sample
            and Fisher exact tests between the two samples are computed.

    Returns:
        ``None`` when the site is skipped (zero depth, too few variant
        reads, reference base not in 'ACGTN', or base/quality length
        mismatch); otherwise a list indexed by the ``POS_*`` constants::

            POS_CHR = 0, POS_COORD = 1, POS_REF = 2,
            POS_DATA1 = 3, POS_DATA2 = 4,
            POS_FISHER_SNV = 5, POS_FISHER_INS = 6,
            POS_FISHER_DEL = 7, POS_COUNT = 8
    """
    #
    # Prepare mpileup data.  str.translate(None, chars) only exists on
    # Python 2; replace() removes the newlines on both major versions.
    #
    if isinstance(mpileup, bytes) and not isinstance(mpileup, str):
        pileup_line = mpileup.decode()
    else:
        pileup_line = mpileup
    mp_list = str(pileup_line.replace('\n', '')).split('\t')
    mp_list_len = len(mp_list)
    ref_base_U = mp_list[2].upper()
    coordinate = mp_list[0:3]

    #
    # skip if depth is 0
    #
    if mp_list[3] == '0' or (mp_list_len > 6 and mp_list[6] == '0'):
        return None

    ref_base_count = mp_list[4].count('.') + mp_list[4].count(',')
    ins_base_count = mp_list[4].count('+')
    del_base_count = mp_list[4].count('-')
    if (int(mp_list[3]) - ref_base_count + ins_base_count
            + del_base_count) < min_variant_read:
        return None

    if ref_base_U not in 'ACGTN':
        return None

    def new_sample():
        # Fresh per-sample record; 'indel' auto-creates nested levels.
        return {
            'mis_base': ref_base_U,
            'mis_rate': 0,
            'proper_read_depth': 0,
            'proper_read_depth_plus': 0,
            'proper_read_depth_minus': 0,
            'proper_read_depth_indel': 0,
            'proper_read_depth_indel_plus': 0,
            'proper_read_depth_indel_minus': 0,
            'indel': AutoVivification(),
        }

    data_pair = [
        mp_list[0],
        int(mp_list[1]),
        mp_list[2],
        new_sample(),
        new_sample(),
        1.0,
        'N:1.0',
        'N:1.0',
        0,
    ]

    #
    # Loop for 2 bam file case
    #
    if compare:
        data_pair[POS_COUNT] = 2
        input_list = [
            (POS_DATA1, mp_list[3], mp_list[4], mp_list[5]),
            (POS_DATA2, mp_list[6], mp_list[7], mp_list[8]),
        ]
    else:
        data_pair[POS_COUNT] = 1
        input_list = [
            (POS_DATA1, mp_list[3], mp_list[4], mp_list[5]),
        ]

    #
    # position id, depth, read bases, base qualities
    #
    for data_id, depth, read_bases, qual_list in input_list:
        sample = data_pair[data_id]
        indel = AutoVivification()

        #
        # Look for deletion/insertion, record it in 'indel' and cut the
        # indel token out of read_bases.  The matches come from the
        # original string, so 'deleted' tracks how much was already cut.
        #
        deleted = 0
        for m in target.finditer(read_bases):
            site = m.start()
            indel_type = m.group(1)
            num = m.group(2)
            bases = m.group(3)[0:int(num)]

            # lower-case nucleotides mean the minus strand
            if bases.islower():
                strand = ('-', '+')
            else:
                strand = ('+', '-')

            key = '\t'.join(coordinate + [bases.upper()])
            if indel_type in indel and key in indel[indel_type]:
                indel[indel_type][key][strand[0]] += 1
            else:
                indel[indel_type][key][strand[0]] = 1
                indel[indel_type][key][strand[1]] = 0

            read_bases = (read_bases[0:site - deleted]
                          + read_bases[site + int(num) + len(num) + 1 - deleted:])
            deleted += 1 + len(num) + int(num)

        #
        # Remove '^.' and '$'
        #
        read_bases = remove_chr.sub('', read_bases)
        read_bases = read_bases.replace('$', '')

        #
        # Error check
        #
        if len(read_bases) != len(qual_list):
            logging.error("mpileup data is not good: {0}, {1}".format(
                mpileup, read_bases))
            return None

        #
        # Count mismatch
        #
        mis_base_U = None
        if int(depth) >= min_depth:
            read_bases = read_bases.replace('.', ref_base_U)
            read_bases = read_bases.replace(',', ref_base_U.lower())

            base_num = {
                "total_A": 0, "total_C": 0, "total_G": 0, "total_T": 0,
                "total_N": 0,
                "A": 0, "C": 0, "G": 0, "T": 0, "N": 0,
                "a": 0, "c": 0, "g": 0, "t": 0, "n": 0,
            }

            #
            # Set data
            #
            sample['bases'] = read_bases
            sample['depth'] = int(depth)

            #
            # Count number
            #
            for nuc, qual in zip(read_bases, qual_list):
                if nuc in 'ATGCNacgtn':
                    sample['proper_read_depth_indel'] += 1
                if nuc in 'ATGCN':
                    sample['proper_read_depth_indel_plus'] += 1
                if nuc in 'acgtn':
                    sample['proper_read_depth_indel_minus'] += 1
                if nuc in 'ATGCNacgtn' and not (qual in filter_quals):
                    base_num[nuc] += 1
                    base_num['total_' + nuc.upper()] += 1
                if nuc in 'ATGCatgc' and not (qual in filter_quals):
                    sample['proper_read_depth'] += 1
                if nuc in 'ATGC' and not (qual in filter_quals):
                    sample['proper_read_depth_plus'] += 1
                if nuc in 'atgc' and not (qual in filter_quals):
                    sample['proper_read_depth_minus'] += 1

            #
            # InsDel
            # Beta distribution
            #
            for indel_type in ('+', '-'):
                if indel_type in indel:
                    for key in indel[indel_type].keys():
                        bases = key.split('\t')[3]
                        counts = indel[indel_type][key]
                        entry = sample['indel'][indel_type][bases]
                        entry['+'] = counts['+']
                        entry['-'] = counts['-']
                        indel_number = entry['both'] = (
                            counts['-'] + counts['+'])
                        depth_indel = sample['proper_read_depth_indel']
                        # betaincinv replaces the removed
                        # scipy.special.btdtri (same argument order).
                        entry['0.1'] = scipy.special.betaincinv(
                            indel_number + 1,
                            float(depth_indel) - indel_number + 1,
                            0.1)
                        entry['mid'] = (
                            (indel_number + 1) / (float(depth_indel) + 2))
                        entry['0.9'] = scipy.special.betaincinv(
                            indel_number + 1,
                            int(depth_indel) - indel_number + 1,
                            0.9)
                        entry['s_ratio'] = (
                            float(counts['+']) / entry['both'])

            #
            # skip if reference is 'N'
            #
            if (ref_base_U != 'N'
                    and int(sample['proper_read_depth']) >= min_depth):
                ref_num = base_num['total_' + ref_base_U]

                # The most frequent non-reference base is the mismatch base.
                mis_num = 0
                for nuc in ('A', 'C', 'G', 'T'):
                    sample[nuc] = base_num[nuc]
                    lower_nuc = nuc.lower()
                    sample[lower_nuc] = base_num[lower_nuc]
                    total_key = 'total_' + nuc
                    sample[total_key] = base_num[total_key]
                    if nuc != ref_base_U:
                        if base_num[total_key] > mis_num:
                            mis_num = base_num[total_key]
                            mis_base_U = nuc

                # Sample 2 is always evaluated on sample 1's mismatch base.
                if data_id == POS_DATA2 and data_pair[POS_DATA1]['mis_base']:
                    mis_num = base_num[
                        'total_' + data_pair[POS_DATA1]['mis_base']]
                    mis_base_U = data_pair[POS_DATA1]['mis_base']

                #
                # Calculate ratio
                #
                sample['mis_rate'] = mis_num / float(
                    sample['proper_read_depth'])
                sample['mis_base'] = mis_base_U
                if mis_base_U and (base_num[mis_base_U]
                                   + base_num[mis_base_U.lower()]) > 0:
                    sample['s_ratio'] = float(base_num[mis_base_U]) / (
                        base_num[mis_base_U] + base_num[mis_base_U.lower()])

                #
                # Beta distribution for SNV
                #
                sample['0.1'] = scipy.special.betaincinv(
                    mis_num + 1, ref_num + 1, 0.1)
                sample['mid'] = (mis_num + 1) / float(ref_num + mis_num + 2)
                sample['0.9'] = scipy.special.betaincinv(
                    mis_num + 1, ref_num + 1, 0.9)
                sample['mis_num'] = mis_num

    data1 = data_pair[POS_DATA1]
    data2 = data_pair[POS_DATA2]

    #
    # Fisher
    #
    # SNV
    #
    if (data_pair[POS_COUNT] == 2
            and ref_base_U
            and data1['mis_base']
            and 'mid' in data1
            and 'mid' in data2
            and 'proper_read_depth' in data1
            and 'proper_read_depth' in data2):
        mis_key = 'total_' + data1['mis_base']
        ref_key = 'total_' + ref_base_U
        odds_ratio, fisher_pvalue = fisher(
            ((int(data1[ref_key]), int(data2[ref_key])),
             (int(data1[mis_key]), int(data2[mis_key]))),
            alternative='two-sided')
        data_pair[POS_FISHER_SNV] = math_log_fisher_pvalue(fisher_pvalue)

    #
    # INDEL
    #
    if data_pair[POS_COUNT] == 2 and 'indel' in data1:
        for indel_type in data1['indel']:
            for bases in data1['indel'][indel_type].keys():
                # Reset per indel so a skipped test can never report the
                # p-value computed for a previous indel.
                fisher_pvalue = None

                # Indel absent from sample 2: give it explicit zero counts.
                if not isinstance(
                        data2['indel'][indel_type][bases]['both'], int):
                    data2['indel'][indel_type][bases]['both'] = 0
                    data2['indel'][indel_type][bases]['+'] = 0
                    data2['indel'][indel_type][bases]['-'] = 0

                both1 = data1['indel'][indel_type][bases]['both']
                both2 = data2['indel'][indel_type][bases]['both']
                depth1 = data1['proper_read_depth_indel']
                depth2 = data2['proper_read_depth_indel']

                if depth2 >= both2 and depth1 >= both1:
                    odds_ratio, fisher_pvalue = fisher(
                        ((depth1 - both1, both1),
                         (depth2 - both2, both2)),
                        alternative='two-sided')

                if fisher_pvalue is None:
                    continue

                if indel_type == '+':
                    fisher_pos = POS_FISHER_INS
                elif indel_type == '-':
                    fisher_pos = POS_FISHER_DEL
                else:
                    continue

                formatted = bases + ':' + str(
                    math_log_fisher_pvalue(fisher_pvalue))
                if data_pair[fisher_pos] == 'N:1.0':
                    data_pair[fisher_pos] = formatted
                else:
                    data_pair[fisher_pos] += ',' + formatted

    return data_pair
def fischers(db, antc, cons):
    """Return element 1 of ``fisher`` applied to the contingency table.

    The table is the first element of what
    ``raMetricas.__tbContingencia(db, antc, cons)`` returns; element 1 of
    the ``fisher`` result is presumably the p-value -- confirm against the
    ``fisher`` imported by this module.
    """
    contingency_result = raMetricas.__tbContingencia(db, antc, cons)
    fisher_result = fisher(contingency_result[0])
    return fisher_result[1]