def run_filter(args):
	"""Filter SNPs in an allele-count table and write passing lines to args.out.

	Input (args.ac_file) is tab-delimited; columns 0-3 are chr/pos/ref/alt and
	columns 4+ are per-pool count fields. Each count field is passed through
	apply_filter(); a line is kept only when no field fails. Counts of SNPs
	before and after filtering are reported on stderr.
	"""
	sz_utils.check_if_files_exist(args.ac_file)
	if args.out == sys.stdout:
		fOUT = sys.stdout
	else:
		sz_utils.make_dirs_if_necessary(args.out)
		fOUT = open(args.out, 'w')
	before, after = 0, 0  # number of SNPs before and after filtration
	with open(args.ac_file, 'r') as fAC:
		for line in fAC:
			tmp_line = line.strip().split("\t")
			before += 1
			fail = 0
			# test every count field; stop at the first failing one
			for count_field in tmp_line[4:]:
				fail = apply_filter(count_field, fail, args)
				if fail:
					break
			if not fail:
				fOUT.write(line)
				after += 1
	# close real files only; never close sys.stdout (the original did)
	if fOUT is not sys.stdout:
		fOUT.close()
	ColorText().info("Number of SNPs before filtering: %d\n" % (before), "stderr")
	ColorText().info("Number of SNPs after filtering: %d\n" % (after), "stderr")
def run_filter(args):
	"""Filter SNPs in an allele-count table; keep lines whose count fields all
	pass apply_filter(), and report before/after totals on stderr.

	NOTE(review): duplicate of the run_filter defined earlier in this file.
	"""
	sz_utils.check_if_files_exist(args.ac_file)
	fOUT = None
	if args.out == sys.stdout:
		fOUT = sys.stdout
	else:
		sz_utils.make_dirs_if_necessary(args.out)
		fOUT = open(args.out, 'w')
	before, after = 0, 0  # number of SNPs before and after filteration
	with open(args.ac_file, 'r') as fAC:
		for line in fAC:
			tmp_line = line.strip().split("\t")
			before += 1
			# columns 2/3 are ref/alt bases; parsed here but not used below
			ref_base = tmp_line[2]
			alt_base = tmp_line[3]
			fail = 0
			# apply_filter() is run on every count column (4+); first failure wins
			for i in range(len(tmp_line[4:])):
				fail = apply_filter(tmp_line[4:][i], fail, args)
				if fail:
					break
			if not fail:
				fOUT.write(line)
				after += 1
	fOUT.close()  # NOTE(review): this also closes sys.stdout when args.out is stdout
	ColorText().info("Number of SNPs before filtering: %d\n" %(before), "stderr")
	ColorText().info("Number of SNPs after filtering: %d\n" %(after), "stderr")
def run_overlap(args):
	''' getting SNPs identified from both pools '''
	sz_utils.check_if_files_exist(args.file_a, args.file_b)
	# index every line of file_a by its position column
	snp_a = collections.defaultdict(list)
	with open(args.file_a, 'r') as fA:
		for record in fA:
			fields = record.strip().split("\t")
			snp_a[int(fields[1])] = fields
	ColorText().info("[poolseq_tk]: %d SNPs parsed from %s\n"
					 % (len(snp_a), os.path.basename(args.file_a)), "stderr")
	sz_utils.make_dirs_if_necessary(args.out)
	num_overlapion = 0
	# emit file_a's columns followed by file_b's last four columns for
	# every position present in both files
	with open(args.out, 'w') as fOUT, open(args.file_b, 'r') as fB:
		for record in fB:
			fields = record.strip().split("\t")
			position = int(fields[1])
			if position in snp_a:
				num_overlapion += 1
				fOUT.write("%s\t%s\n" % ("\t".join(snp_a[position]),
										 "\t".join(fields[-4:])))
	ColorText().info("[poolseq_tk]: %d SNPs identified from both pools\n"
					 % (num_overlapion), "stderr")
def run_biallelic(args):
	"""Write biallelic SNPs from synced pileups to args.out.

	A site is kept when it has read coverage and non-ref/non-alt bases make
	up less than 5% of the covering reads. Counts of kept, multi-allelic,
	and zero-coverage sites are printed at the end.
	"""
	dPileups = syncPileups(args.pileups)
	sz_utils.make_dirs_if_necessary(args.out)
	nZeroCov = 0   # sites with no read coverage
	nSNPsKept = 0  # biallelic sites written out
	nMulti = 0     # sites rejected as multi-allelic (>=5% other bases)
	with open(args.out, 'w') as fOUT:  # with-block guarantees the file is closed
		# sorted(dPileups) == sorted(dPileups.iterkeys()) but also works on Python 3
		for k in sorted(dPileups):
			chrom, pos = k  # avoid shadowing the chr() builtin
			ref_base = dPileups[k][0]
			alt_base = dPileups[k][1]
			reads_bases = dPileups[k][2]
			if len(reads_bases) > 0:
				# count both strands: pileup encodes reverse-strand hits lower-case
				ref_count = (reads_bases.count(ref_base) +
							 reads_bases.count(ref_base.lower()))
				alt_count = (reads_bases.count(alt_base) +
							 reads_bases.count(alt_base.lower()))
				other_count = len(reads_bases) - ref_count - alt_count
				if float(other_count) / len(reads_bases) < 0.05:
					fOUT.write("%s\t%d\t%s\t%s\n" % (chrom, pos, ref_base, alt_base))
					nSNPsKept += 1
				else:
					nMulti += 1
			else:
				nZeroCov += 1
	# single-argument parenthesized print behaves identically on Python 2 and 3
	print(nSNPsKept)
	print(nMulti)
	print(nZeroCov)
def run_merge(args):
	''' combine allele counts across replicates

	Reads each allele-count file in args.acs, sums the colon-separated
	counts in column 4 position-wise, and writes the merged table to
	args.out (file or stdout).
	'''
	allele_counts = collections.defaultdict(list)
	data = collections.defaultdict(list)
	for ac_file in args.acs:
		sz_utils.check_if_files_exist(ac_file)
		ColorText().info("[poolseq_tk] reading and updating allele counts from %s ..." %(ac_file), "stderr")
		with open(ac_file) as fAC:
			for line in fAC:
				tmp_line = line.strip().split()
				pos = int(tmp_line[1])
				# remember chr/pos/ref/alt the first time a position is seen
				if pos not in data:
					data[pos] = tmp_line[0:4]
				# list comprehension (not map) so the result is a real list
				# on both Python 2 and 3; the original stored map objects
				# that a later zip() would silently exhaust on Python 3
				counts = [int(x) for x in tmp_line[4].split(':')]
				if pos not in allele_counts:
					allele_counts[pos] = counts
				else:
					# element-wise sum with the counts accumulated so far
					allele_counts[pos] = [a + b for a, b in
										  zip(allele_counts[pos], counts)]
		ColorText().info(" [done]\n", "stderr")
	# output to file
	if args.out == sys.stdout:
		fOUT = sys.stdout
	else:
		sz_utils.make_dirs_if_necessary(args.out)
		fOUT = open(args.out, 'w')
	ColorText().info("[poolseq_tk] outputting to %s ..." %(fOUT.name), "stderr")
	for pos in sorted(allele_counts):  # plain iteration: py2/py3 safe
		fOUT.write("%s\t%s\n" %("\t".join(data[pos]),
								":".join(str(c) for c in allele_counts[pos])))
	if fOUT is not sys.stdout:
		fOUT.close()  # original leaked the handle; close real files only
	ColorText().info(" [done]\n", "stderr")
def main():
	"""Command-line entry point: <snps> <mpileup1> <mpileup2> <out>.

	Ensures the output directory exists, then delegates to run_collapse.
	"""
	snp_file = sys.argv[1]
	pileup1 = sys.argv[2]
	pileup2 = sys.argv[3]
	out_file = sys.argv[4]
	sz_utils.make_dirs_if_necessary(out_file)
	run_collapse(snp_file, pileup1, pileup2, out_file)
def run_prepVCF(args):
	"""Convert a test-result table to VCF, optionally annotating Fst values
	and applying filters parsed from args.filters.

	Expected input columns: chr, pos, ref, alt, ..., pval (col 8),
	corrPval (col 10), per-pool count tables (cols 4:-3), ratio (last col).
	NOTE(review): this layout is inferred from the indices used here;
	confirm against the producer of args.infile.
	"""
	sz_utils.check_if_files_exist(args.infile)
	dfst = collections.defaultdict(list)
	if args.ifst:
		dfst = getFst(args.ifst, dfst)
	dfilters = getFilters(args.filters)
	sz_utils.make_dirs_if_necessary(args.out)
	fOUT = open(args.out, 'w')
	outVCFHeaders(args.samples, fOUT)
	with open(args.infile, "r") as fIN:
		for line in fIN:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			refBase = tmp_line[2]
			altBase = tmp_line[3]
			pval = float(tmp_line[8])
			corrPval = float(tmp_line[10])
			ratio = float(tmp_line[-1])
			# look up (and consume) the Fst recorded for this site, if any;
			# -1.0 marks "no Fst available"
			fst = -1.0
			if chr in dfst:
				for j in range(len(dfst[chr])):
					if pos == dfst[chr][j][0]:
						fst = float(dfst[chr][j][1])
						dfst[chr].pop(j)
						break
			if "ratio" in dfilters:
				# skip sites whose ratio falls outside the requested bound
				if ((dfilters["ratio"][0] == '<' and ratio >= dfilters["ratio"][1]) or
					(dfilters["ratio"][0] == '>' and ratio <= dfilters["ratio"][1])):
					continue
			if "pval" in dfilters:
				# fix later
				pass
			if "corrPval" in dfilters:
				# fix later
				pass
			fOUT.write("%s\t%s\t.\t%s\t%s\t.\t.\t" % (chr, pos, refBase, altBase))
			if fst == -1.0:
				fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f\t"
						   % (pval, corrPval, ratio))
			else:
				fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f;fst=%.5f\t"
						   % (pval, corrPval, ratio, fst))
			fOUT.write("GT:Table")
			# one genotype column per pool; ':' inside the count table would
			# clash with the VCF FORMAT separator, so swap it for '-'
			# (iterate the slice once instead of reslicing per index;
			# the original's unused poolIndex variable is dropped)
			for table in tmp_line[4:-3]:
				fOUT.write("\t./.:%s" % (table.replace(':', '-')))
			fOUT.write("\n")
	fOUT.close()
def run_prepVCF(args):
	"""Convert a test-result table into VCF records with optional Fst
	annotation and ratio-based filtering.

	NOTE(review): duplicate of the run_prepVCF defined earlier in this file.
	Column indices (8, 10, -1, 4:-3) encode the assumed input layout --
	pval, corrPval, ratio, per-pool count tables; confirm with the producer
	of args.infile.
	"""
	sz_utils.check_if_files_exist(args.infile)
	dfst = collections.defaultdict(list)
	if args.ifst:
		dfst = getFst(args.ifst, dfst)
	dfilters = getFilters(args.filters)
	sz_utils.make_dirs_if_necessary(args.out)
	fOUT = open(args.out, 'w')
	outVCFHeaders(args.samples, fOUT)
	with open(args.infile, "r") as fIN:
		for line in fIN:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			refBase = tmp_line[2]
			altBase = tmp_line[3]
			pval = float(tmp_line[8])
			corrPval = float(tmp_line[10])
			ratio = float(tmp_line[-1])
			# find the Fst recorded for this site, removing it once used;
			# -1.0 marks "no Fst available"
			fst = -1.0
			if chr in dfst:
				for j in range(len(dfst[chr])):
					if pos == dfst[chr][j][0]:
						fst = float(dfst[chr][j][1])
						dfst[chr].pop(j)
						break
			if "ratio" in dfilters:
				# drop the site when its ratio fails the requested bound
				if ((dfilters["ratio"][0] == '<' and ratio >= dfilters["ratio"][1]) or
					dfilters["ratio"][0] == '>' and ratio <= dfilters["ratio"][1]):
					continue
			if "pval" in dfilters:
				# fix later
				pass
			if "corrPval" in dfilters:
				# fix later
				pass
			fOUT.write("%s\t%s\t.\t%s\t%s\t.\t.\t" %(chr, pos, refBase, altBase))
			if fst == -1.0:
				fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f\t" %(pval, corrPval, ratio))
			else:
				fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f;fst=%.5f\t" %(pval, corrPval, ratio, fst))
			fOUT.write("GT:Table")
			# NOTE(review): poolIndex is assigned but never used
			poolIndex = 1
			# ':' in the per-pool table is replaced by '-' so it does not
			# clash with the VCF FORMAT separator
			for i in range(len(tmp_line[4:-3])):
				table = tmp_line[4:-3][i].replace(':', '-')
				fOUT.write("\t./.:%s" %(table))
			fOUT.write("\n")
	fOUT.close()
def run_view(args):
	"""Extract pileup lines at known SNP positions into a parsed table.

	For each pileup line whose (chr, pos) appears in the SNP list, the read
	bases are parsed and written out as chr/pos/ref/alt/bases; the program
	aborts if the pileup reference base disagrees with the SNP list.
	"""
	check_if_files_exist(args.ipileup)
	make_dirs_if_necessary(args.out)
	dSNPs = getSNPs(args.isnp)
	fOUT = open(args.out, 'w')
	with open(args.ipileup, 'r') as fIN:
		for line in fIN:
			fields = line.strip().split("\t")
			chrom = fields[0]
			site = int(fields[1])
			if (chrom, site) not in dSNPs:
				continue
			depth = int(fields[3])
			pileup_ref = fields[2].upper()
			snp_alt = dSNPs[chrom, site][1]
			if pileup_ref != dSNPs[chrom, site][0]:
				# pileup and SNP list disagree on the reference base: abort
				sys.stderr.write("reference base not consistent\n")
				sys.stderr.write(line)
				sys.exit()
			if depth > 0:
				parsed = parseReadsBases(pileup_ref, snp_alt, fields[4])
				fOUT.write("%s\t%d\t%s\t%s\t%s\n"
						   % (chrom, site, pileup_ref, snp_alt, parsed))
			# a historical (currently inactive) checkup rejected sites where
			# other alleles/indels exceeded 5% of reads; see the duplicate
			# definition of run_view in this file for the preserved draft
			# each SNP is consumed once it has been seen in the pileup
			del dSNPs[chrom, site]
	fOUT.close()
def run_view(args):
	"""Extract pileup lines at known SNP positions into a parsed 5-column table.

	NOTE(review): duplicate of the run_view defined earlier in this file.
	Aborts the program if the pileup reference base disagrees with the SNP list.
	"""
	check_if_files_exist(args.ipileup)
	make_dirs_if_necessary(args.out)
	dSNPs = getSNPs(args.isnp)
	fOUT = open(args.out, 'w')
	# nRemoved = 0
	with open(args.ipileup, 'r') as fIN:
		for line in fIN:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			if (chr, pos) in dSNPs:
				cov = int(tmp_line[3])
				ref_base = tmp_line[2].upper()
				alt_base = dSNPs[chr, pos][1]
				if ref_base == dSNPs[chr, pos][0]:
					if cov > 0:
						reads_bases = tmp_line[4]
						reads_bases_parsed = parseReadsBases(ref_base, alt_base, reads_bases)
						fOUT.write("%s\t%d\t%s\t%s\t%s\n" %(chr, pos, ref_base, alt_base,
															reads_bases_parsed))
#						reads_bases_parsed, nReadsBases, nRefBases, dMultiBases, dIndels = parseReadsBases(reads_bases, ref_base, alt_base)
						# the following is a checkup on other alleles
						# at this moment this checkup is inactive
						# number of alleles (SNPs, Indels) other than the alternative allele
#						nMultiBases = sum(dMultiBases.values()) + sum(dIndels.values())
#						if (nReadsBases == nRefBases or
#							(nMultiBases)/float(nReadsBases) <= 0.05):
#							out.write("%s\t%d\t%s\t%s\t%s\n" %(chr, pos, ref_base, alt_base,
#															   reads_bases_parsed))
#						else:
#							nRemoved += 1
#							print pos, ref_base, alt_base, reads_bases, reads_bases_parsed
#							print dMultiBases
#							print dIndels
#							print
				else:
					# pileup and SNP list disagree on the reference base: abort
					sys.stderr.write("reference base not consistent\n")
					sys.stderr.write(line)
					sys.exit()
				# each SNP is consumed once it has been seen in the pileup
				del dSNPs[chr, pos]
	fOUT.close()
def run_merge(args):
	''' combine allele counts across replicates

	NOTE(review): duplicate of the run_merge defined earlier in this file.
	Sums the colon-separated counts of column 4 position-wise across all
	files in args.acs; the output handle is never explicitly closed.
	'''
	allele_counts = collections.defaultdict(list)
	data = collections.defaultdict(list)
	for ac_file in args.acs:
		sz_utils.check_if_files_exist(ac_file)
		ColorText().info(
			"[poolseq_tk] reading and updating allele counts from %s ..." % (ac_file),
			"stderr")
		with open(ac_file) as fAC:
			for line in fAC:
				tmp_line = line.strip().split()
				pos = int(tmp_line[1])
				# remember chr/pos/ref/alt the first time a position is seen
				if not pos in data:
					data[pos] = tmp_line[0:4]
				# column 4 holds colon-separated counts; sum them element-wise
				# (Python-2 map() returns a list, which the zip below relies on)
				if not pos in allele_counts:
					allele_counts[pos] = map(int, tmp_line[4].split(':'))
				else:
					allele_counts[pos] = map(
						sum,
						zip(allele_counts[pos], map(int, tmp_line[4].split(':'))))
		ColorText().info(" [done]\n", "stderr")
	# output to file
	fOUT = None
	if args.out == sys.stdout:
		fOUT = sys.stdout
	else:
		sz_utils.make_dirs_if_necessary(args.out)
		fOUT = open(args.out, 'w')
	ColorText().info("[poolseq_tk] outputting to %s ..." % (fOUT.name), "stderr")
	for pos in sorted(allele_counts.iterkeys()):
		fOUT.write(
			"%s\t%s\n" % ("\t".join(data[pos]),
						  ":".join(map(str, allele_counts[pos]))))
	ColorText().info(" [done]\n", "stderr")
def run_overlap(args):
	''' getting SNPs identified from both pools

	NOTE(review): duplicate of the run_overlap defined earlier in this file.
	Keys file_a by position (column 1) only, ignoring the chromosome column --
	adequate when both inputs cover a single chromosome.
	'''
	sz_utils.check_if_files_exist(args.file_a, args.file_b)
	snp_a = collections.defaultdict(list)
	with open(args.file_a, 'r') as fA:
		for line in fA:
			tmp_line = line.strip().split("\t")
			snp_a[int(tmp_line[1])] = tmp_line
	ColorText().info("[poolseq_tk]: %d SNPs parsed from %s\n"
					 %(len(snp_a), os.path.basename(args.file_a)), "stderr")
	sz_utils.make_dirs_if_necessary(args.out)
	num_overlapion = 0
	# for positions present in both files, emit file_a's columns followed by
	# file_b's last four columns
	with open(args.out, 'w') as fOUT:
		with open(args.file_b, 'r') as fB:
			for line in fB:
				tmp_line = line.strip().split("\t")
				if int(tmp_line[1]) in snp_a:
					num_overlapion += 1
					fOUT.write("%s\t%s\n" %("\t".join(snp_a[int(tmp_line[1])]),
											"\t".join(tmp_line[-4:])))
	ColorText().info("[poolseq_tk]: %d SNPs identified from both pools\n"
					 %(num_overlapion), "stderr")
def run_collapse(args):
	'''
		Given two pileup files of the same region, like 2l+ and 2la,
		collapse the pileups at each corresponding SNP
		Some SNPs are not reported in one or the other pileup file.
		A full list of SNP positions are required
	'''
	m1_base = os.path.basename(args.m1)
	m2_base = os.path.basename(args.m2)
	# first, getting the full list of SNPs
	dSNPs = get_SNPs(args.snps)
	# second, reading each of the pileup files
	chr1, dM1 = read_mpileup(args.m1, args.offset1)
	chr2, dM2 = read_mpileup(args.m2, args.offset2)
	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m1_base, len(dM1)), "stderr")
	ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" %(m2_base, len(dM2)), "stderr")
	fOUT = None
	if args.out != sys.stdout:
		outdir = os.path.dirname(os.path.realpath(args.out))
		sz_utils.make_dirs_if_necessary(outdir)
		fOUT = open(args.out, 'w')
	else:
		fOUT = args.out
	ColorText().info("[poolseq_tk]: collapsing mpileups %s and %s ..."
					 %(m1_base, m2_base), "stderr")
	for pos in sorted(dSNPs.iterkeys()):
		reads_bases_collapsed = ""
		# SNP covered by both pileups: parse both directions and concatenate
		if pos in dM1 and pos in dM2:
			'''
				dSNPs[pos][0]: ref base of m1 pileup
				dSNPs[pos][1]: ref base of m2 pileup
				dM1[pos][0]: ref base of m1 pileup
				dM2[pos][1]: ref base of m2 pileup
			'''
			# NOTE(review): the last line of the note above likely means
			# dM2[pos][0], which is what the code below compares against
			if dSNPs[pos][0] == dM1[pos][0] and dSNPs[pos][1] == dM2[pos][0]:
				reads_bases_collapsed = parseReadsBases(dM1[pos][0], dM2[pos][0], dM1[pos][1])
				reads_bases_collapsed += parseReadsBases(dM2[pos][0], dM1[pos][0], dM2[pos][1])
				fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
						   %(chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
							 reads_bases_collapsed))
			else:
				# this should bark if the same sites having different states
				ColorText().error("SNP position: %d %s %s\t\tMpileup position: %d %s %s\n"
								  %(pos, dSNPs[pos][0], dSNPs[pos][1],
									pos, dM1[pos][0], dM2[pos][0]), "stderr")
		# SNPs missed in both pileup files: emit the site with no bases column
		elif pos not in dM1 and pos not in dM2:
			fOUT.write("%s/%s\t%d\t%s\t%s\n"
					   %(chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1]))
		# SNPs in m1 pileup file but not in m2
		elif pos in dM1 and pos not in dM2:
			reads_bases_collapsed = parseReadsBases(dM1[pos][0], dSNPs[pos][1], dM1[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
					   %(chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
						 reads_bases_collapsed))
		# SNPs in m2 pileup file but not in m1
		elif pos not in dM1 and pos in dM2:
			# NOTE(review): here the alt passed is dSNPs[pos][0], whereas the
			# m1-only branch passes dSNPs[pos][1] -- presumably intentional
			# (each pileup's "other" base is the opposite arrangement's ref);
			# confirm against parseReadsBases
			reads_bases_collapsed = parseReadsBases(dM2[pos][0], dSNPs[pos][0], dM2[pos][1])
			fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
					   %(chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
						 reads_bases_collapsed))
	ColorText().info(" [done]\n", "stderr")
	fOUT.close()  # NOTE(review): closes sys.stdout when args.out is stdout
def run_fisher(args):
	''' run Fisher's Exact test

	Dispatches per-SNP contingency tables to worker processes, merges the
	per-worker result files, adjusts p-values with R's p.adjust via rpy2,
	and writes .fisher.all / .fisher.fdrN / .fisher.fdrN.expect outputs.
	'''
	sz_utils.make_dirs_if_necessary(args.outp)
	sz_utils.check_if_files_exist(args.ac_file)
	tables = sz_utils._count2table(args.ac_file)[0]
	task_q = mp.JoinableQueue()
	result_q = mp.Queue()
	create_procs(args.nproc, task_q, result_q, args.outp)
	sz_utils._assign_tables(tables, task_q, args.nproc)
	try:
		task_q.join()
	except KeyboardInterrupt:
		ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
		sys.exit()
	else:
		pvals, odds_ratios, log10_pvals = {}, {}, {}
		# each worker leaves the path of one temporary result file on the
		# queue; merge them all, then delete the temporaries
		while args.nproc:
			file = result_q.get()
			with open(file, 'r') as fIN:
				for line in fIN:
					tmp_line = line.strip().split("\t")
					chr = tmp_line[0]
					pos = int(tmp_line[1])
					pval = float(tmp_line[2])
					odds_ratio = float(tmp_line[3])
					log10_pval = tmp_line[4]
					if (chr, pos) not in pvals:
						pvals[chr, pos] = pval
					if (chr, pos) not in odds_ratios:
						odds_ratios[chr, pos] = odds_ratio
					if (chr, pos) not in log10_pvals:
						log10_pvals[chr, pos] = log10_pval
			os.remove(file)
#			pvals_split, odds_ratios_split = result_q.get()
#			pvals.update(pvals_split)
#			odds_ratios.update(odds_ratios_split)
			args.nproc -= 1
		ColorText().info("[poolseq_tk]: Running Fisher's Exact tests successfully\n", "stderr")
		# correcting raw p-values and make QQ plots
		ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
						 % (args.adj_method, args.adj_cutoff * 100), "stderr")
		raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())]
		raw_pvals_vector = robjects.FloatVector(raw_pvals)
		# delegate the multiple-testing correction to R's p.adjust
		padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
		ColorText().info(" [done]\n", "stderr")
		ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
						 % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr")
		ColorText().info(" [done]\n", "stderr")
		# output p-values
		ColorText().info("[poolseq_tk]: output to files ...", "stderr")
		out_all = args.outp + ".fisher.all"
		out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
		out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff * 100)
		with open(out_all, 'w') as fALL, \
			 open(out_fdr, 'w') as fFDR, \
			 open(out_expect, 'w') as fEXPECT:
			# iteration order matches the sorted order used to build padjust,
			# so padjust[i] corresponds to key k
			for i, k in enumerate(sorted(pvals.iterkeys())):
				chr = k[0]
				pos = k[1]
				raw_pval = pvals[k]
				log_pval = log10_pvals[k]
				odds_ratio = odds_ratios[k]
				if padjust[i] <= args.adj_cutoff:
					# significant at the chosen FDR level
					sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[k][1:3]),
												tables[k][3:], raw_pval, log_pval,
												padjust[i], odds_ratio)
					# ...and in the expected odds-ratio direction
					if ((args.oddsr_direction == "greater" and odds_ratios[k] > 1) or
						(args.oddsr_direction == "less" and odds_ratios[k] < 1)):
						sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[k][1:3]),
													tables[k][3:], raw_pval, log_pval,
													padjust[i], odds_ratio)
				# every SNP goes to the .all file
				sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[k][1:3]),
											tables[k][3:], raw_pval, log_pval,
											padjust[i], odds_ratio)
		ColorText().info(" [done]\n", "stderr")
		ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")
def making_plot(args):
	''' making Q-Q plot and Manhattan plot

	Reads p-values (column 8) from args.input, determines a p-value cutoff
	(BH at args.fdrlevel unless args.pcutoff is given), then renders the two
	plots through the R "qqman" package via rpy2.
	'''
	# install qqman package if not installed
	if not rpackages.isinstalled("qqman"):
		rutils = rpackages.importr('utils')
		rutils.chooseCRANmirror(ind=84)
		rutils.install_packages("qqman")
	# get pvalues
	ColorText().info("[poolseq_tk]: Extracting P-Values ... ", "stderr")
	data = collections.defaultdict()
	chrs = []
	pvals, adjust_pvals = {}, {}
	nchr = 0
	with open(args.input, 'r') as fIN:
		for line in fIN:
			tmp_line = line.strip().split("\t")
			chr = tmp_line[0]
			pos = int(tmp_line[1])
			# assign each chromosome a 1-based index in order of appearance
			if chr not in chrs:
				chrs.append(chr)
				nchr += 1
			data[chr, pos] = nchr
			pvals[chr, pos] = float(tmp_line[8])
	ColorText().info(" [done]\n", "stderr")
	# get FDR cutoff using BH if not provided through command line
	pcutoff = 0.0
	if not args.pcutoff:
		ColorText().info("[poolseq_tk]: Getting p-value cutoff at FDR %d%%: "
						 % (args.fdrlevel * 100), "stderr")
		pcutoff = sz_utils.getFDR_BH(pvals, args.fdrlevel)
		ColorText().info("%.5e\n" % (pcutoff), "stderr")
	else:
		pcutoff = args.pcutoff
		ColorText().info("[poolseq_tk]: p-value cutoff provided: %.5e\n"
						 % (pcutoff), "stderr")
	# get SNPs to highlight
	snps_to_highlight = []
	if args.highlight_snps:
		ColorText().info("[poolseq_tk]: Getting SNPs to be highlighed in Manhattan plot ... ", "stderr")
		with open(args.highlight_snps, 'r') as fHIGHLIGHT:
			for line in fHIGHLIGHT:
				tmp_line = line.strip().split("\t")
				# SNPs are identified as "chr_pos"
				snps_to_highlight.append('_'.join(tmp_line[:2]))
		ColorText().info(" [done]\n", "stderr")
	# NOTE(review): if neither args.pdf nor args.png is set, out_qqplot /
	# out_manhattan are unbound and the call below raises NameError
	if args.pdf:
		out_qqplot = args.outp + ".qqplot.pdf"
		out_manhattan = args.outp + ".manhattan.pdf"
	elif args.png:
		# save to PNG probably wont work
		out_qqplot = args.outp + ".qqplot.png"
		out_manhattan = args.outp + ".manhattan.png"
	sz_utils.make_dirs_if_necessary(out_qqplot, out_manhattan)
	grdevices = rpackages.importr('grDevices')
	raw_pvals_vector = robjects.FloatVector(
		[pvals[k] for k in sorted(pvals.iterkeys())])
	ColorText().info("[poolseq_tk]: Making Q-Q plot ...", "stderr")
	make_qqplots(grdevices, raw_pvals_vector, out_qqplot, args.qqtitle)
	ColorText().info(" [done]\n", "stderr")
	ColorText().info("[poolseq_tk]: Making Manhattan plot ...", "stderr")
	make_manhattan(grdevices, data, raw_pvals_vector, snps_to_highlight,
				   pcutoff, out_manhattan, args.mantitle, args.manx, args.manxlim)
	ColorText().info(" [done]\n", "stderr")
def run_count(args):
	''' Counting alleles at each SNP in the given pileup files

	Counts ref/alt allele occurrences per pileup, then writes one row per
	SNP with one colon-joined count group per position block; SNPs not seen
	in every pileup are skipped.
	'''
	# optional whitelist of (chr, pos) positions to count
	dPos = {}
	if args.pos:
		ColorText().info("[poolseq_tk] reading SNPs positions:", "stderr")
		with open(args.pos, 'r') as fPOS:
			for line in fPOS:
				tmp_line = line.strip().split("\t")
				chr = tmp_line[0]
				pos = int(tmp_line[1])
				if (chr, pos) not in dPos:
					dPos[chr, pos] = 1
		ColorText().info(" %d\n" % (len(dPos)), "stderr")
	else:
		ColorText().info("[poolseq_tk] no SNP positions provided ... [skipped]\n", "stderr")
	ac = collections.defaultdict(tuple)
	for pileup in args.pileups:
		sz_utils.check_if_files_exist(pileup)
		nsnps = 0
		ColorText().info("[poolseq_tk] counting alleles in %s:"
						 % (os.path.basename(pileup)), "stderr")
		with open(pileup, 'r') as fMPILEUP:
			for line in fMPILEUP:
				nsnps += 1
				tmp_line = line.strip().split("\t")
				chr = tmp_line[0]
				pos = int(tmp_line[1])
				# count this site either when it is whitelisted, or when no
				# whitelist was supplied at all
				if (((chr, pos) in dPos and args.pos) or
					(len(dPos) == 0 and not args.pos)):
					ref_base = tmp_line[2]
					alt_base = tmp_line[3]
					nRefAlleles, nAltAlleles = 0, 0
					# a 5-column line carries read bases; count both cases
					# (lower-case = reverse strand in pileup encoding)
					if len(tmp_line) == 5:
						nRefAlleles = tmp_line[-1].count(ref_base) + \
									  tmp_line[-1].count(ref_base.lower())
						nAltAlleles = tmp_line[-1].count(alt_base) + \
									  tmp_line[-1].count(alt_base.lower())
					# first pileup seen: store bases + counts; later pileups
					# append their counts
					if (chr, pos) not in ac:
						ac[chr, pos] = [ref_base, alt_base,
										str(nRefAlleles), str(nAltAlleles)]
					else:
						ac[chr, pos] += [str(nRefAlleles), str(nAltAlleles)]
		ColorText().info(" %d SNPs parsed\n" % (nsnps), "stderr")
	fOUT = None
	if args.out == sys.stdout:
		fOUT = sys.stdout
	else:
		sz_utils.make_dirs_if_necessary(args.out)
		fOUT = open(args.out, 'w')
	ColorText().info("[poolseq_tk] outputting allele counts to table ...", "stderr")
	for k in sorted(ac.iterkeys()):
		chr = k[0]
		pos = k[1]
		i = 2
		# only emit SNPs counted in every pileup (2 counts per pileup)
		if len(ac[k][i:]) == 2 * len(args.pileups):
			fOUT.write("%s\t%d\t%s" % (chr, pos, "\t".join(ac[k][0:2])))
			# NOTE(review): groups of 4 per column; with a single pileup
			# (len(ac[k]) == 4) this loop never runs and no count column is
			# written -- confirm whether that is intended
			while i <= len(ac[k]) - 4:
				fOUT.write("\t%s" % (":".join(ac[k][i:i + 4])))
				i += 4
			fOUT.write("\n")
	ColorText().info(" [done]\n", "stderr")
	fOUT.close()  # NOTE(review): closes sys.stdout when args.out is stdout
def run_cmh(args):
	''' run Cochran-Mantel-Hasenzle test

	Dispatches per-SNP tables to worker processes, merges the per-worker
	result files, adjusts p-values with R's p.adjust via rpy2, and writes
	.cmh.all / .cmh.fdrN / .cmh.fdrN.expect outputs.
	'''
	sz_utils.make_dirs_if_necessary(args.outp)
	allele_counts = {}
	pvals = {}
	tables = collections.defaultdict(list)
	ntests = 0
	tables, ntables_per_snp = sz_utils._count2table(args.table_file)
	ColorText().info("[poolseq_tk]: %d tables prepared\n" %(len(tables)), "stderr")
	task_q = mp.JoinableQueue()
	result_q = mp.Queue()
	create_procs(args.nproc, task_q, result_q, ntables_per_snp, args.outp)
	sz_utils._assign_tables(tables, task_q, args.nproc)
	# waiting for all tasks to be finished
	try:
		task_q.join()
	except KeyboardInterrupt:
		ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
		sys.exit()
	else:
		# merge results
		pvals, odds_ratios = {}, {}
		# each worker leaves the path of one temporary result file on the
		# queue; merge them all, then delete the temporaries
		while args.nproc:
			file = result_q.get()
			with open(file, 'r') as fIN:
				for line in fIN:
					tmp_line = line.strip().split("\t")
					chr = tmp_line[0]
					pos = int(tmp_line[1])
					pval = float(tmp_line[2])
					odds_ratio = float(tmp_line[3])
					if (chr, pos) not in pvals:
						pvals[chr, pos] = pval
					if (chr, pos) not in odds_ratios:
						odds_ratios[chr, pos] = odds_ratio
			os.remove(file)
#			pvals_split, odds_ratios_split = result_q.get()
#			pvals.update(pvals_split)
#			odds_ratios.update(odds_ratios_split)
			args.nproc -= 1
		ColorText().info("[poolseq_tk]: Running CMH tests successfully\n", "stderr")
		# correcting raw p-values
		ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
						 %(args.adj_method, args.adj_cutoff*100), "stderr")
		raw_pvals = [pvals[chr, pos] for chr, pos in sorted(pvals.iterkeys())]
		raw_pvals_vector = robjects.FloatVector(raw_pvals)
		# delegate the multiple-testing correction to R's p.adjust
		padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
		ColorText().info(" [done]\n", "stderr")
		pcutoff = sz_utils.getFDR_BH(pvals, args.adj_cutoff)
		ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini.Hochberg procedure %.5e"
						 %(pcutoff), "stderr")
		ColorText().info(" [done]\n", "stderr")
		# output p-values
		ColorText().info("[poolseq_tk]: output to files ...", "stderr")
		out_all = args.outp + ".cmh.all"
		out_fdr = args.outp + ".cmh.fdr%d" %(args.adj_cutoff*100)
		out_expect = args.outp + ".cmh.fdr%d.expect" %(args.adj_cutoff*100)
		sz_utils.make_dirs_if_necessary(out_all, out_fdr)
		with open(out_all, 'w') as fALL, \
			 open(out_fdr, 'w') as fFDR, \
			 open(out_expect, 'w') as fEXPECT:
			# iteration order matches the sorted order used to build padjust,
			# so padjust[i] corresponds to key k
			for i, k in enumerate(sorted(pvals.iterkeys())):
				chr = k[0]
				pos = k[1]
				raw_pval = pvals[chr, pos]
				# derive -log10(p), guarding the p == 0 corner case
				log_pval = None
				if raw_pval == 0.0:
					log_pval = "Inf"
				elif raw_pval == "Nan":
					# NOTE(review): raw_pval is a float, so this string
					# comparison is always False -- NaN inputs fall through
					# to math.log10 below; confirm intended handling
					raw_pval = 1.0
					log_pval = 0.0
				else:
					log_pval = -1 * math.log10(raw_pval)
				odds_ratio = odds_ratios[k]
				if padjust[i] <= args.adj_cutoff:
					# significant at the chosen FDR level
					sz_utils._results_outputter(fFDR, pos, chr, "\t".join(tables[chr, pos][1:3]),
												tables[chr, pos][3:], raw_pval, log_pval,
												padjust[i], odds_ratio)
					# ...and in the expected odds-ratio direction
					if ((args.oddsr_direction == "greater" and odds_ratios[chr, pos] > 1) or
						(args.oddsr_direction == "less" and odds_ratios[chr, pos] < 1)):
						sz_utils._results_outputter(fEXPECT, pos, chr, "\t".join(tables[chr, pos][1:3]),
													tables[chr, pos][3:], raw_pval, log_pval,
													padjust[i], odds_ratio)
				# every SNP goes to the .all file
				sz_utils._results_outputter(fALL, pos, chr, "\t".join(tables[chr, pos][1:3]),
											tables[chr, pos][3:], raw_pval, log_pval,
											padjust[i], odds_ratio)
		ColorText().info(" [done]\n", "stderr")
		ColorText().info("[poolseq_tk]: Program finishes successfully\n", "stderr")