import collections
import multiprocessing as mp
import os
import sys

import rpy2.robjects as robjects

# sz_utils, ColorText, and the other helpers referenced below (apply_filter,
# getFst, getFilters, outVCFHeaders, create_procs, parseReadsBases,
# check_if_files_exist, make_dirs_if_necessary) are project-internal to
# poolseq_tk and imported elsewhere; their import lines are not part of
# this section.


def run_filter(args):
    ''' filter SNPs on their allele counts '''
    sz_utils.check_if_files_exist(args.ac_file)
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    before, after = 0, 0    # number of SNPs before and after filtering
    with open(args.ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            before += 1
            # columns 3-4 hold the ref and alt bases; columns 5+ hold one
            # colon-separated count field per pool
            fail = 0
            for counts_field in tmp_line[4:]:
                fail = apply_filter(counts_field, fail, args)
                if fail:
                    break
            if not fail:
                fOUT.write(line)
                after += 1
    if fOUT is not sys.stdout:
        fOUT.close()
    ColorText().info("Number of SNPs before filtering: %d\n" % before, "stderr")
    ColorText().info("Number of SNPs after filtering: %d\n" % after, "stderr")

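# apply_filter() is referenced above but defined elsewhere in the module.
# Below is a minimal sketch of its likely shape, assuming `args` carries a
# hypothetical `min_cov` coverage cutoff; both the body and the option name
# are illustrative, not the tool's actual behavior.
def _apply_filter_sketch(counts_field, fail, args):
    # counts_field is one colon-separated count string, e.g. "12:3:8:5";
    # once `fail` is set by an earlier field it is passed through unchanged
    if fail:
        return fail
    counts = [int(n) for n in counts_field.split(':')]
    if sum(counts) < args.min_cov:    # hypothetical coverage cutoff
        return 1
    return 0
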
def run_merge(args):
    ''' combine allele counts across replicates '''
    allele_counts = collections.defaultdict(list)
    data = collections.defaultdict(list)
    for ac_file in args.acs:
        sz_utils.check_if_files_exist(ac_file)
        ColorText().info("[poolseq_tk] reading and updating allele counts "
                         "from %s ..." % ac_file, "stderr")
        with open(ac_file) as fAC:
            for line in fAC:
                tmp_line = line.strip().split()
                # note: sites are keyed by position only; the chromosome and
                # ref/alt columns are taken from the first file seen
                pos = int(tmp_line[1])
                if pos not in data:
                    data[pos] = tmp_line[0:4]
                counts = [int(n) for n in tmp_line[4].split(':')]
                if pos not in allele_counts:
                    allele_counts[pos] = counts
                else:
                    allele_counts[pos] = [sum(pair) for pair in
                                          zip(allele_counts[pos], counts)]
        ColorText().info(" [done]\n", "stderr")
    # output to file
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting to %s ..." % fOUT.name, "stderr")
    for pos in sorted(allele_counts):
        fOUT.write("%s\t%s\n" % ("\t".join(data[pos]),
                                 ":".join(str(n) for n in allele_counts[pos])))
    ColorText().info(" [done]\n", "stderr")
    if fOUT is not sys.stdout:
        fOUT.close()

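# Example of the element-wise update above: count fields "12:3" and "8:5"
# for the same position merge to "20:8".
def _demo_merge_counts():
    a = [int(n) for n in "12:3".split(':')]    # counts from replicate 1
    b = [int(n) for n in "8:5".split(':')]     # counts from replicate 2
    merged = [x + y for x, y in zip(a, b)]     # element-wise sum
    return ":".join(str(n) for n in merged)    # -> "20:8"
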
def run_overlap(args):
    ''' get SNPs identified in both pools '''
    sz_utils.check_if_files_exist(args.file_a, args.file_b)
    snp_a = collections.defaultdict(list)
    with open(args.file_a, 'r') as fA:
        for line in fA:
            tmp_line = line.strip().split("\t")
            snp_a[int(tmp_line[1])] = tmp_line
    ColorText().info("[poolseq_tk]: %d SNPs parsed from %s\n"
                     % (len(snp_a), os.path.basename(args.file_a)), "stderr")
    sz_utils.make_dirs_if_necessary(args.out)
    num_overlap = 0
    with open(args.out, 'w') as fOUT, open(args.file_b, 'r') as fB:
        for line in fB:
            tmp_line = line.strip().split("\t")
            if int(tmp_line[1]) in snp_a:
                num_overlap += 1
                fOUT.write("%s\t%s\n" % ("\t".join(snp_a[int(tmp_line[1])]),
                                         "\t".join(tmp_line[-4:])))
    ColorText().info("[poolseq_tk]: %d SNPs identified from both pools\n"
                     % num_overlap, "stderr")

def read_mpileup(mpileup_file, offset):
    ''' read certain columns of a pileup file into a dictionary of tuples '''
    ColorText().info("[poolseq_tk]: reading %s ..." % mpileup_file, "stderr")
    dMpileups = collections.defaultdict(tuple)
    chr = ""
    sz_utils.check_if_files_exist(mpileup_file)
    with open(mpileup_file, 'r') as fMPILEUP:
        for line in fMPILEUP:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            cov = int(tmp_line[3])       # fourth column: coverage at the site
            ref_base = tmp_line[2].upper()
            # key: SNP position (plus offset) as an integer
            # value: (ref base, read bases covering the position);
            #        zero-coverage sites are skipped
            if cov > 0:
                reads_bases = tmp_line[4]
                dMpileups[pos + offset] = (ref_base, reads_bases)
    ColorText().info(" [done]\n", "stderr")
    return chr, dMpileups

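# Example: given a pileup line such as (tab-separated)
#     2L    1001    a    5    .,.A,
# read_mpileup("example.pileup", 0) returns
#     ("2L", {1001: ("A", ".,.A,")})
# since coverage (column 4) is greater than zero.
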
def run_prepVCF(args):
    ''' convert test results into VCF records '''
    sz_utils.check_if_files_exist(args.infile)
    dfst = collections.defaultdict(list)
    if args.ifst:
        dfst = getFst(args.ifst, dfst)
    dfilters = getFilters(args.filters)
    sz_utils.make_dirs_if_necessary(args.out)
    fOUT = open(args.out, 'w')
    outVCFHeaders(args.samples, fOUT)
    with open(args.infile, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            refBase = tmp_line[2]
            altBase = tmp_line[3]
            pval = float(tmp_line[8])
            corrPval = float(tmp_line[10])
            ratio = float(tmp_line[-1])
            fst = -1.0
            if chr in dfst:
                for j in range(len(dfst[chr])):
                    if pos == dfst[chr][j][0]:
                        fst = float(dfst[chr][j][1])
                        dfst[chr].pop(j)
                        break
            if "ratio" in dfilters:
                if ((dfilters["ratio"][0] == '<' and ratio >= dfilters["ratio"][1]) or
                    (dfilters["ratio"][0] == '>' and ratio <= dfilters["ratio"][1])):
                    continue
            if "pval" in dfilters:
                pass    # fix later
            if "corrPval" in dfilters:
                pass    # fix later
            fOUT.write("%s\t%s\t.\t%s\t%s\t.\t.\t" % (chr, pos, refBase, altBase))
            if fst == -1.0:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f\t"
                           % (pval, corrPval, ratio))
            else:
                fOUT.write("pval=%.5g;corrPval=%.5f;ratio=%.5f;fst=%.5f\t"
                           % (pval, corrPval, ratio, fst))
            fOUT.write("GT:Table")
            for table_field in tmp_line[4:-3]:
                table = table_field.replace(':', '-')
                fOUT.write("\t./.:%s" % table)
            fOUT.write("\n")
    fOUT.close()

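# getFilters() is defined elsewhere; judging from the checks above, it
# appears to return a dict keyed by filter name with (direction, threshold)
# pairs, e.g. {"ratio": ('>', 1.5)} to keep only sites whose ratio exceeds
# 1.5. This shape is inferred from the code, not confirmed by the source.
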
def get_SNPs(snps_file):
    ''' read the SNP positions into a dictionary of tuples '''
    dSNPs = collections.defaultdict(tuple)
    sz_utils.check_if_files_exist(snps_file)
    with open(snps_file, 'r') as fSNP:
        for line in fSNP:
            if not line.startswith('#'):
                tmp_line = line.strip().split("\t")
                pos = int(tmp_line[0])
                ref_base = tmp_line[1]
                alt_base = tmp_line[2]
                if pos not in dSNPs:
                    dSNPs[pos] = (ref_base, alt_base)
    return dSNPs

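# Example snps_file content (tab-separated; '#' lines are skipped):
#     #pos    ref    alt
#     1001    A      T
#     2045    G      C
# get_SNPs() then returns {1001: ('A', 'T'), 2045: ('G', 'C')}.
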
def run_view(args):
    ''' report parsed pileups at the given SNP positions '''
    check_if_files_exist(args.ipileup)
    make_dirs_if_necessary(args.out)
    dSNPs = getSNPs(args.isnp)
    fOUT = open(args.out, 'w')
    with open(args.ipileup, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            if (chr, pos) in dSNPs:
                cov = int(tmp_line[3])
                ref_base = tmp_line[2].upper()
                alt_base = dSNPs[chr, pos][1]
                if ref_base == dSNPs[chr, pos][0]:
                    if cov > 0:
                        reads_bases = tmp_line[4]
                        reads_bases_parsed = parseReadsBases(ref_base, alt_base,
                                                             reads_bases)
                        fOUT.write("%s\t%d\t%s\t%s\t%s\n"
                                   % (chr, pos, ref_base, alt_base,
                                      reads_bases_parsed))
                    # an additional checkup used to live here but is currently
                    # inactive: sites where alleles other than ref/alt (other
                    # SNPs or indels) made up more than 5% of reads would be
                    # removed
                else:
                    sys.stderr.write("reference base not consistent\n")
                    sys.stderr.write(line)
                    sys.exit(1)
                del dSNPs[chr, pos]
    fOUT.close()

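# parseReadsBases() is imported from elsewhere in the package and not shown
# in this section. Below is a minimal sketch of standard samtools-pileup
# parsing, assuming it strips read-start/read-end markers and indels and
# translates '.'/',' into the reference base; the real function may do
# more, e.g. validate calls against alt_base (unused in this sketch).
import re

def _parse_reads_bases_sketch(ref_base, alt_base, reads_bases):
    # drop read-start markers ('^' plus the mapping-quality character)
    # and read-end markers ('$')
    s = re.sub(r"\^.", "", reads_bases).replace("$", "")
    # drop indels: '+n'/'-n' followed by n inserted/deleted bases
    s = re.sub(r"[+-](\d+)([ACGTNacgtn]+)",
               lambda m: m.group(2)[int(m.group(1)):], s)
    # '.' and ',' are matches to the reference on forward/reverse strands
    return s.replace(".", ref_base).replace(",", ref_base.lower())
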
def getSNPs(isnp):
    '''
    get polymorphic sites from a file with four tab-separated columns:
    1. chr name  2. pos  3. ref allele  4. alt allele
    '''
    check_if_files_exist(isnp)
    dSNPs = collections.defaultdict(tuple)
    with open(isnp, 'r') as fSNPS:
        for line in fSNPS:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            refBase = tmp_line[2]
            altBase = tmp_line[3]
            if (chr, pos) not in dSNPs:
                dSNPs[chr, pos] = (refBase, altBase)
    return dSNPs

def run_count(args):
    ''' count alleles at each SNP in the given pileup files '''
    dPos = {}
    if args.pos:
        ColorText().info("[poolseq_tk] reading SNP positions:", "stderr")
        with open(args.pos, 'r') as fPOS:
            for line in fPOS:
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (chr, pos) not in dPos:
                    dPos[chr, pos] = 1
        ColorText().info(" %d\n" % len(dPos), "stderr")
    else:
        ColorText().info("[poolseq_tk] no SNP positions provided ... [skipped]\n",
                         "stderr")
    ac = collections.defaultdict(tuple)
    for pileup in args.pileups:
        sz_utils.check_if_files_exist(pileup)
        nsnps = 0
        ColorText().info("[poolseq_tk] counting alleles in %s:"
                         % os.path.basename(pileup), "stderr")
        with open(pileup, 'r') as fMPILEUP:
            for line in fMPILEUP:
                nsnps += 1
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (((chr, pos) in dPos and args.pos) or
                    (len(dPos) == 0 and not args.pos)):
                    ref_base = tmp_line[2]
                    alt_base = tmp_line[3]
                    nRefAlleles, nAltAlleles = 0, 0
                    if len(tmp_line) == 5:
                        # count both strands: upper case (forward) plus
                        # lower case (reverse)
                        nRefAlleles = (tmp_line[-1].count(ref_base) +
                                       tmp_line[-1].count(ref_base.lower()))
                        nAltAlleles = (tmp_line[-1].count(alt_base) +
                                       tmp_line[-1].count(alt_base.lower()))
                    if (chr, pos) not in ac:
                        ac[chr, pos] = [ref_base, alt_base,
                                        str(nRefAlleles), str(nAltAlleles)]
                    else:
                        ac[chr, pos] += [str(nRefAlleles), str(nAltAlleles)]
        ColorText().info(" %d SNPs parsed\n" % nsnps, "stderr")
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting allele counts to table ...",
                     "stderr")
    for k in sorted(ac):
        chr, pos = k
        i = 2
        # only report sites covered in every pileup (two counts per pileup)
        if len(ac[k][i:]) == 2 * len(args.pileups):
            fOUT.write("%s\t%d\t%s" % (chr, pos, "\t".join(ac[k][0:2])))
            while i <= len(ac[k]) - 4:
                # each output field is one 2x2 table: ref1:alt1:ref2:alt2
                fOUT.write("\t%s" % ":".join(ac[k][i:i + 4]))
                i += 4
            fOUT.write("\n")
    ColorText().info(" [done]\n", "stderr")
    if fOUT is not sys.stdout:
        fOUT.close()

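# Example of the grouping above: with two pileups, each position carries
# [ref, alt, nRef1, nAlt1, nRef2, nAlt2]; the loop emits one colon-joined
# 2x2 table per pair of pools.
def _demo_count_table():
    entry = ['A', 'T', '12', '3', '8', '5']
    fields, i = [], 2
    while i <= len(entry) - 4:
        fields.append(":".join(entry[i:i + 4]))    # ref1:alt1:ref2:alt2
        i += 4
    return fields    # -> ['12:3:8:5']
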
def run_fisher(args):
    ''' run Fisher's Exact test at each SNP '''
    sz_utils.make_dirs_if_necessary(args.outp)
    sz_utils.check_if_files_exist(args.ac_file)
    tables = sz_utils._count2table(args.ac_file)[0]

    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n",
                         "stderr")
        sys.exit(1)
    else:
        # collect the per-worker result files
        pvals, odds_ratios, log10_pvals = {}, {}, {}
        nproc = args.nproc
        while nproc:
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    log10_pval = tmp_line[4]
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
                    if (chr, pos) not in log10_pvals:
                        log10_pvals[chr, pos] = log10_pval
            os.remove(file)
            nproc -= 1
        ColorText().info("[poolseq_tk]: Fisher's Exact tests finished successfully\n",
                         "stderr")

        # correct raw p-values and report the FDR cutoff
        ColorText().info("[poolseq_tk]: multiple-testing correction using %s "
                         "method at %d%% level ..."
                         % (args.adj_method, args.adj_cutoff * 100), "stderr")
        raw_pvals = [pvals[k] for k in sorted(pvals)]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector,
                                         method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini-Hochberg "
                         "procedure %.5e"
                         % sz_utils.getFDR_BH(pvals, args.adj_cutoff), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".fisher.all"
        out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff * 100)
        with open(out_all, 'w') as fALL, \
             open(out_fdr, 'w') as fFDR, \
             open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals)):
                chr, pos = k
                raw_pval = pvals[k]
                log_pval = log10_pvals[k]
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[k][1:3]),
                                                tables[k][3:], raw_pval,
                                                log_pval, padjust[i], odds_ratio)
                    if ((args.oddsr_direction == "greater" and odds_ratio > 1) or
                        (args.oddsr_direction == "less" and odds_ratio < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[k][1:3]),
                                                    tables[k][3:], raw_pval,
                                                    log_pval, padjust[i],
                                                    odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[k][1:3]),
                                            tables[k][3:], raw_pval, log_pval,
                                            padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finished successfully\n",
                         "stderr")

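# The workers spawned by create_procs() run the actual tests (this package
# delegates to R via rpy2); per table, the computation is equivalent to a
# Fisher's exact test on the 2x2 count matrix. For illustration only (not
# the module's code path), the same test with SciPy, assuming pools are
# rows and ref/alt counts are columns:
def _demo_fisher_2x2():
    from scipy.stats import fisher_exact
    import math
    # table "12:3:8:5" -> ref/alt counts in pool 1 (row 1) vs pool 2 (row 2)
    odds_ratio, pval = fisher_exact([[12, 3], [8, 5]])
    log10_pval = -math.log10(pval) if pval > 0 else float("inf")
    return odds_ratio, pval, log10_pval
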