def run_filter(args):
    sz_utils.check_if_files_exist(args.ac_file)
    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    before, after = 0, 0        # number of SNPs before and after filtration
    with open(args.ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            before += 1
            ref_base = tmp_line[2]
            alt_base = tmp_line[3]
            fail = 0
            for counts in tmp_line[4:]:
                fail = apply_filter(counts, fail, args)
                if fail:
                    break
            if not fail:
                fOUT.write(line)
                after += 1
    fOUT.close()
    ColorText().info("Number of SNPs before filtering: %d\n" % (before), "stderr")
    ColorText().info("Number of SNPs after filtering: %d\n" % (after), "stderr")
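# Expected input format (inferred from the parsing above), one SNP per
# tab-delimited line:
#   chr  pos  ref  alt  ref1:alt1[:ref2:alt2]  [one count column per replicate]
# A line survives only if every count column passes apply_filter, defined
# later in this module.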
def read_mpileup(mpileup_file, offset):
    ''' read certain columns of a pileup file into a dictionary of tuples '''
    ColorText().info("[poolseq_tk]: reading %s ..." % (mpileup_file), "stderr")
    mpileup_info = collections.defaultdict(tuple)
    chr = ""
    sz_utils.check_if_files_exist(mpileup_file)
    with open(mpileup_file, 'r') as fMPILEUP:
        for line in fMPILEUP:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            # key: SNP position as an integer
            # value: a tuple of the ref base at that position and the read
            #        bases covering it; positions with zero coverage
            #        (fourth column) are skipped
            if int(tmp_line[3]) > 0:
                mpileup_info[int(tmp_line[1]) + offset] = (tmp_line[2].upper(),
                                                           tmp_line[4])
    ColorText().info(" [done]\n", "stderr")
    return chr, mpileup_info
def run_overlap(args):
    ''' get SNPs identified in both pools '''
    sz_utils.check_if_files_exist(args.file_a, args.file_b)
    snp_a = collections.defaultdict(list)
    with open(args.file_a, 'r') as fA:
        for line in fA:
            tmp_line = line.strip().split("\t")
            snp_a[int(tmp_line[1])] = tmp_line
    ColorText().info("[poolseq_tk]: %d SNPs parsed from %s\n"
                     % (len(snp_a), os.path.basename(args.file_a)), "stderr")
    sz_utils.make_dirs_if_necessary(args.out)
    num_overlap = 0
    with open(args.out, 'w') as fOUT:
        with open(args.file_b, 'r') as fB:
            for line in fB:
                tmp_line = line.strip().split("\t")
                if int(tmp_line[1]) in snp_a:
                    num_overlap += 1
                    fOUT.write("%s\t%s\n"
                               % ("\t".join(snp_a[int(tmp_line[1])]),
                                  "\t".join(tmp_line[-4:])))
    ColorText().info("[poolseq_tk]: %d SNPs identified from both pools\n"
                     % (num_overlap), "stderr")
def _count2table(ac_file, max_cov=100):
    ColorText().info("[poolseq_tk]: reading counts and preparing 2*2 tables ...",
                     "stderr")
    tables = collections.defaultdict(list)
    ntables_per_snp = 0
    with open(ac_file, 'r') as fAC:
        for line in fAC:
            tmp_line = line.strip().split("\t")
            if ntables_per_snp == 0:
                ntables_per_snp = len(tmp_line[4:])
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            base1 = tmp_line[2]
            base2 = tmp_line[3]
            tables[chr, pos] = [tmp_line[0], base1, base2]   # chr, allele1, allele2
            for counts in tmp_line[4:]:
                tmp_counts = counts.split(':')
                if sum(map(int, tmp_counts[0:2])) <= max_cov:
                    tables[chr, pos] += tmp_counts[:2]
                if sum(map(int, tmp_counts[2:4])) <= max_cov:
                    tables[chr, pos] += tmp_counts[2:]
            # drop the SNP if any of its counts exceeded max_cov
            if len(tables[chr, pos]) < len(tmp_line[4:]) * 4 + 3:
                del tables[chr, pos]
    ColorText().info(" [done]\n", "stderr")
    return tables, ntables_per_snp
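# Illustration (inferred from the length check above): with two replicate
# columns of the form "r1:a1:r2:a2", a surviving SNP maps to
#   tables[chr, pos] == [chr, allele1, allele2, r1, a1, r2, a2, r1', a1', r2', a2']
# i.e. 3 leading fields plus 4 count strings per replicate column.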
def apply_filter(tmp_counts, fail, args):
    counts = map(int, tmp_counts.split(':'))
    if len(counts) < 2:
        ColorText().error("At least two counts (separated by colon) required "
                          "for column five\n")
        sys.exit(1)
    for i in range(len(counts)):
        if i in (0, 2):                  # ref allele counts
            if counts[i] < args.min_ref_ac:
                fail = 1
                break
        elif i in (1, 3):                # alt allele counts
            if (counts[i] < args.min_alt_ac or
                    counts[i] + counts[i - 1] < args.min_cov):
                fail = 1
                break
    return fail
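# A minimal demo (not part of the original module) of how apply_filter judges
# one "ref:alt" count field; the Namespace thresholds are made-up values.
def _demo_apply_filter():
    from argparse import Namespace
    args = Namespace(min_ref_ac=2, min_alt_ac=2, min_cov=10)
    assert apply_filter("12:3", 0, args) == 0    # counts and coverage pass
    assert apply_filter("12:1", 0, args) == 1    # alt count 1 < min_alt_ac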
def create_procs(nproc, task_q, result_q, ntables_per_snp, outp):
    ''' initialize processes '''
    ColorText().info("[poolseq_tk]: Initializing processes ...\n", "stderr")
    for _ in range(nproc):
        p = mp.Process(target=cmh_worker,
                       args=(task_q, result_q, ntables_per_snp, outp))
        p.daemon = True
        p.start()
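# A sketch (not from the source) of the driver-side plumbing that pairs with
# create_procs; it mirrors what run_cmh does further down in this module.
def _demo_cmh_pool(tables, nproc, ntables_per_snp, outp):
    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(nproc, task_q, result_q, ntables_per_snp, outp)
    sz_utils._assign_tables(tables, task_q, nproc)   # enqueue table chunks
    task_q.join()                                    # blocks until every chunk
                                                     # is marked task_done()
    # each worker leaves the path of its temporary result file on result_q
    return [result_q.get() for _ in range(nproc)]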
def read_mpileup(mpileup_file, offset):
    ''' read certain columns of a pileup file into a dictionary of tuples '''
    ColorText().info("[poolseq_tk]: reading %s ..." % (mpileup_file), "stderr")
    dMpileups = collections.defaultdict(tuple)
    chr = ""
    sz_utils.check_if_files_exist(mpileup_file)
    with open(mpileup_file, 'r') as fMPILEUP:
        for line in fMPILEUP:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            cov = int(tmp_line[3])
            ref_base = tmp_line[2].upper()
            if cov > 0:
                reads_bases = tmp_line[4]
                dMpileups[pos + offset] = (ref_base, reads_bases)
    ColorText().info(" [done]\n", "stderr")
    return chr, dMpileups
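# Illustration (toy data, not from the source): a pileup line such as
#   2L	1029	a	4	,.A,	IIII
# read with offset 0 yields  dMpileups[1029] == ('A', ',.A,')
# (ref base upper-cased; zero-coverage positions are skipped).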
def addBlock(self, block):
    '''
    Add a block to the blockchain

    :param block: The block to add
    '''
    localBlock = block
    # Point the new block's hashPreviousBlock at the hash of the current
    # chain tip; an IndexError means the chain is still empty (genesis block).
    try:
        localBlock.hashPreviousBlock = self.listBlock[-1].blockHash
    except IndexError:
        pass
    if self.verify()[0]:
        self.listBlock.append(localBlock)
        print(ColorText.OkGreen("[VERIFICATION] - Pass"))
    else:
        print(ColorText.Fail("[VERIFICATION] - The block {} is not correct! "
                             "Cannot add the block {}".format(self.verify()[1],
                                                              localBlock.blockIndex)))
def cmh_worker(task_q, result_q, ntables_per_snp, outp):
    while True:
        try:
            table_part, nth_job = task_q.get()
            ColorText().info("[poolseq_tk]: %s running Cochran-Mantel-Haenszel test on %d tables ...\n"
                             % (mp.current_process().name, len(table_part)), "stderr")
            tmpFile = outp + "." + mp.current_process().name + ".cmh"
            fOUT = open(tmpFile, 'w')
            nTests = 0
            for chr, pos in sorted(table_part.iterkeys()):
                array = []
                i = 3
                while i <= len(table_part[chr, pos]) - 4:
                    # require a total of at least 10 and marginal subtotals
                    # of at least 5 in each 2*2 table
                    if (sum(map(int, table_part[chr, pos][i:i+4])) >= 10 and
                        int(table_part[chr, pos][i]) + int(table_part[chr, pos][i+1]) >= 5 and
                        int(table_part[chr, pos][i]) + int(table_part[chr, pos][i+2]) >= 5 and
                        int(table_part[chr, pos][i+2]) + int(table_part[chr, pos][i+3]) >= 5 and
                        int(table_part[chr, pos][i+1]) + int(table_part[chr, pos][i+3]) >= 5):
                        array += map(int, table_part[chr, pos][i:i+4])
                    i += 4
                # run the test only if every replicate table passed the checks
                if len(array) == ntables_per_snp * 4:
                    dim_vector = robjects.IntVector([2, 2, ntables_per_snp])
                    data = robjects.r['array'](robjects.IntVector(array), dim=dim_vector)
                    rcmh = robjects.r['mantelhaen.test'](data, alternative='t')
                    pvalue = rcmh[2][0]
                    nTests += 1
                    if math.isnan(pvalue):      # guard against NaN p-values from R
                        pvalue = 1.0
                    if pvalue == 0.0:
                        fOUT.write("%s\t%d\t%.4g\t%.8f\tInf\n"
                                   % (chr, pos, float(pvalue), float(rcmh[4][0])))
                    else:
                        fOUT.write("%s\t%d\t%.8f\t%.8f\t%.8f\n"
                                   % (chr, pos, float(pvalue), float(rcmh[4][0]),
                                      -1 * math.log10(pvalue)))
            fOUT.close()
            ColorText().info("[poolseq_tk]: %s ran %d tests\n"
                             % (mp.current_process().name, nTests), "stderr")
            result_q.put(tmpFile)
        finally:
            task_q.task_done()
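# Worked sketch of the layout the R call sees (a fact about R, not from the
# source): array() fills column-major, so for K replicates the flat list
# [r1, a1, r1', a1', ...] with dim c(2, 2, K) stacks one 2x2 table per
# replicate:
#
#   table k:        pool1   pool2
#     allele1       r_k     r_k'
#     allele2       a_k     a_k'
#
# mantelhaen.test(..., alternative='t') then tests for a common odds ratio
# across the K strata; the code above reads the p-value from rcmh[2][0] and
# the common odds-ratio estimate from rcmh[4][0].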
def run_merge(args):
    ''' combine allele counts across replicates '''
    allele_counts = collections.defaultdict(list)
    data = collections.defaultdict(list)
    for ac_file in args.acs:
        sz_utils.check_if_files_exist(ac_file)
        ColorText().info("[poolseq_tk] reading and updating allele counts from %s ..."
                         % (ac_file), "stderr")
        with open(ac_file) as fAC:
            for line in fAC:
                tmp_line = line.strip().split()
                pos = int(tmp_line[1])
                if pos not in data:
                    data[pos] = tmp_line[0:4]
                if pos not in allele_counts:
                    allele_counts[pos] = map(int, tmp_line[4].split(':'))
                else:
                    allele_counts[pos] = map(sum, zip(allele_counts[pos],
                                                      map(int, tmp_line[4].split(':'))))
        ColorText().info(" [done]\n", "stderr")
    # output to file
    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting to %s ..." % (fOUT.name), "stderr")
    for pos in sorted(allele_counts.iterkeys()):
        fOUT.write("%s\t%s\n" % ("\t".join(data[pos]),
                                 ":".join(map(str, allele_counts[pos]))))
    ColorText().info(" [done]\n", "stderr")
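# Toy example (not from the source) of the element-wise merge performed
# above: counts "12:3" from one replicate file and "5:1" from another
# combine into "17:4" at the same position.
def _demo_merge_counts():
    a = map(int, "12:3".split(':'))
    b = map(int, "5:1".split(':'))
    assert ":".join(map(str, map(sum, zip(a, b)))) == "17:4"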
def getFDR_BH(dPvals, fdr_level):
    ''' use the BH procedure to calculate the p-value cutoff at a given FDR level '''
    lPvals = [dPvals[k] for k in dPvals.iterkeys()]
    ntests = len(lPvals)
    sort_lPvals = sorted(lPvals)
    for i in xrange(len(sort_lPvals)):
        if sort_lPvals[i] > (float(i + 1) / ntests) * fdr_level:
            if i == 0:
                return 0.00000000
            else:
                return sort_lPvals[i - 1]
    # reaching here means no p-value exceeded its BH threshold
    ColorText().error("[poolseq_tk] Fail to calculate pvalue cutoff\n")
    sys.exit()
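# Toy check (not from the source) of the BH cutoff: with the p-values below
# and FDR 0.05, the first sorted p-value exceeding (i+1)/n * 0.05 is 0.039
# at i=2 (threshold 0.03), so the cutoff returned is the previous sorted
# p-value, 0.008.
def _demo_getFDR_BH():
    dPvals = {k: p for k, p in enumerate([0.001, 0.008, 0.039, 0.041, 0.27])}
    assert getFDR_BH(dPvals, 0.05) == 0.008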
def DisplayResultMerkle(listOriginalWords, listHash, finalHash):
    """
    Display the result of the Merkle tree

    :param listOriginalWords: List with the original words
    :param listHash: List of hashes, one per original word
    :param finalHash: Final hash of the root
    """
    print(ColorText.OkBlue("######### Merkle Tree #########\n\n")
          + ColorText.OkBlue("Original word") + " -> " + ColorText.OkGreen("Hash"))
    for i in range(len(listOriginalWords)):
        print(ColorText.OkBlue(listOriginalWords[i]) + " -> "
              + ColorText.OkGreen(listHash[i]))
    print(ColorText.Fail("\nFinal hash of the root : {}\n").format(finalHash[0]))
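# Minimal usage sketch (assumptions: SHA-256 leaves and a root computed as
# the hash of the concatenated child digests; the real tree construction
# lives elsewhere in the project).
def _demo_merkle_display():
    import hashlib
    words = ["alice", "bob"]
    leaves = [hashlib.sha256(w.encode()).hexdigest() for w in words]
    root = [hashlib.sha256((leaves[0] + leaves[1]).encode()).hexdigest()]
    DisplayResultMerkle(words, leaves, root)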
def __init__(self, complexity):
    self.listBlock = []
    self.complexity = complexity
    print(ColorText.OkBlue("######### Blockchain #########\n"))
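# Usage sketch (hypothetical Block class; the real one is defined elsewhere
# in the project with blockHash / hashPreviousBlock / blockIndex attributes):
#   chain = Blockchain(complexity=4)
#   chain.addBlock(Block(blockIndex=0, data="genesis"))
#   chain.addBlock(Block(blockIndex=1, data="payload"))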
from colortext import ColorText

if __name__ == '__main__':
    ctext = ColorText()
    text = ctext.colorstring("ColorText Python", color=ctext.HEADER)
    text += ctext.colorstring(" class", color=ctext.OKBLUE)
    print(text, '\n')
    print('https://github.com/bessavagner', '\n')
    ctext.cprint("This is a default usage of cprint function")
    print('\n')
    text = ctext.colorstring('affect', color=ctext.WARNING)
    ctext.cprint(f"Of course you can {text} the behavior"
                 f" of {ctext.colorstring('cprint', color=ctext.UNDERLINE)}"
                 f" and override its color.")
    ctext.cprint("But you can avoid it by passing ", endcolor=False, end='')
    ctext.cprint("endcolor=False ", color=ctext.OKBLUE, end='')
    ctext.cprint("as parameter.")
def run_collapse(isnp, m1, m2, out):
    '''
    Given two pileup files of the same region, e.g. 2l+ and 2la,
    collapse the pileups at each corresponding SNP.
    Some SNPs are not reported in one or the other pileup file,
    so a full list of SNP positions is required.
    '''
    m1_base = os.path.basename(m1)
    m2_base = os.path.basename(m2)
    # first, get the full list of SNPs
    dSNPs = sz_utils.getSNPs(isnp)
    offset1 = 0
    offset2 = 20524057    # hard-coded offset between the two region coordinates
    # second, read each of the pileup files
    chr1, m1_info = read_mpileup(m1, offset1)
    chr2, m2_info = read_mpileup(m2, offset2)
    ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" % (m1_base, len(m1_info)), "stderr")
    ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" % (m2_base, len(m2_info)), "stderr")
    fOUT = open(out, 'w')
    ColorText().info("[poolseq_tk]: collapsing mpileups %s and %s ..."
                     % (m1_base, m2_base), "stderr")
    for (chr, pos) in sorted(dSNPs.iterkeys()):
        k = (chr, pos)
        reads_bases_collapsed = ""
        refBase1, refBase2 = "", ""
        if pos in m1_info:
            if m1_info[pos][0] == dSNPs[k][0]:
                refBase1 = m1_info[pos][0]
            else:
                ColorText().error("SNP position: %d %s\t\tMpileup position: %d %s\n"
                                  % (pos, dSNPs[k][0], pos, m1_info[pos][0]))
                sys.exit()
        if pos in m2_info:
            if m2_info[pos][0] == dSNPs[k][1]:
                refBase2 = m2_info[pos][0]
            else:
                ColorText().error("SNP position: %d %s\t\tMpileup position: %d %s\n"
                                  % (pos, dSNPs[k][1], pos, m2_info[pos][0]))
                sys.exit()
        if refBase1 != "" and refBase2 != "":
            # covered in both pileups: parse each set of read bases against
            # its own ref and collapse the two records into one
            (reads_bases_collapsed1, nReadsBases1, nRefBases1,
             dMultiBases1, dIndels1) = sz_utils.parseReadsBases(m1_info[pos][1],
                                                                refBase1, refBase2)
            (reads_bases_collapsed2, nReadsBases2, nRefBases2,
             dMultiBases2, dIndels2) = sz_utils.parseReadsBases(m2_info[pos][1],
                                                                refBase2, refBase1)
            reads_bases_collapsed = reads_bases_collapsed1 + reads_bases_collapsed2
            nReadsBases = nReadsBases1 + nReadsBases2
            nRefBases = nRefBases1 + (nReadsBases2 - nRefBases2)
            # note: on shared keys the counts from the second dict win
            dMultiBases = dict(dMultiBases1.items() + dMultiBases2.items())
            dIndels = dict(dIndels1.items() + dIndels2.items())
            nMultiBases = sum(dMultiBases.values()) + sum(dIndels.values())
            if (nReadsBases == nRefBases or nMultiBases <= 1):
                fOUT.write("%s\t%d\t%s\t%s\t%s\n"
                           % (chr, pos, refBase1, refBase2, reads_bases_collapsed))
        elif refBase1 == "" and refBase2 != "":
            # covered only in the second pileup
            (reads_bases_collapsed2, nReadsBases2, nRefBases2,
             dMultiBases2, dIndels2) = sz_utils.parseReadsBases(m2_info[pos][1],
                                                                refBase2, refBase1)
            nMultiBases = sum(dMultiBases2.values()) + sum(dIndels2.values())
            if (nReadsBases2 == nRefBases2 or nMultiBases <= 1):
                fOUT.write("%s\t%d\t%s\t%s\t%s\n"
                           % (chr, pos, dSNPs[k][0], refBase2, reads_bases_collapsed2))
        elif refBase1 != "" and refBase2 == "":
            # covered only in the first pileup
            (reads_bases_collapsed1, nReadsBases1, nRefBases1,
             dMultiBases1, dIndels1) = sz_utils.parseReadsBases(m1_info[pos][1],
                                                                refBase1, refBase2)
            nMultiBases = sum(dMultiBases1.values()) + sum(dIndels1.values())
            if (nReadsBases1 == nRefBases1 or nMultiBases <= 1):
                fOUT.write("%s\t%d\t%s\t%s\t%s\n"
                           % (chr, pos, refBase1, dSNPs[k][1], reads_bases_collapsed1))
    ColorText().info(" [done]\n", "stderr")
    fOUT.close()
def fisher_worker(task_q, result_q, outp):
    while True:
        try:
            tables, nth_job = task_q.get()
            ColorText().info("[poolseq_tk]: %s running Fisher's Exact test on %d tables ...\n"
                             % (mp.current_process().name, len(tables)), "stderr")
            tmpFile = outp + "." + mp.current_process().name + ".fisher"
            fOUT = open(tmpFile, 'w')
            nTests = 0
            for k in sorted(tables.iterkeys()):
                oddsr = 0.0
                chr = k[0]
                pos = k[1]
                ref_base = tables[k][1]
                alt_base = tables[k][2]
                ref_ac1 = int(tables[k][3])
                alt_ac1 = int(tables[k][4])
                ref_ac2 = int(tables[k][5])
                alt_ac2 = int(tables[k][6])
                if (sum(map(int, tables[k][3:7])) >= 10 and
                    alt_ac1 + ref_ac1 >= 5 and      # row subtotals
                    alt_ac2 + ref_ac2 >= 5 and
                    alt_ac1 + alt_ac2 >= 5 and      # column subtotals
                    ref_ac1 + ref_ac2 >= 5):
                    nTests += 1
                    if (ref_ac1 == 0 or ref_ac2 == 0 or     # add pseudo-counts in case
                        alt_ac1 == 0 or alt_ac2 == 0):      # the odds ratio goes to Inf
                        ref_ac1 += 1
                        ref_ac2 += 1
                        alt_ac1 += 1
                        alt_ac2 += 1
                    data_vector = robjects.IntVector([ref_ac1, alt_ac1,
                                                      ref_ac2, alt_ac2])
                    table = robjects.r['matrix'](data_vector, ncol=2)
                    rfisher = robjects.r['fisher.test'](table, alternative='t')
                    pvalue = float(rfisher[0][0])
                    oddsr = rfisher[2][0]
                    if pvalue == 0.0:
                        fOUT.write("%s\t%d\t%.4g\t%.8f\tInf\n"
                                   % (chr, pos, pvalue, oddsr))
                    elif pvalue == 1.0:
                        fOUT.write("%s\t%d\t%.4g\t%.8f\t0.00000000\n"
                                   % (chr, pos, pvalue, oddsr))
                    else:
                        fOUT.write("%s\t%d\t%.8f\t%.8f\t%.8f\n"
                                   % (chr, pos, pvalue, oddsr,
                                      -1 * math.log10(pvalue)))
            fOUT.close()
            ColorText().info("[poolseq_tk]: %s ran %d tests\n"
                             % (mp.current_process().name, nTests), "stderr")
            result_q.put(tmpFile)
        finally:
            task_q.task_done()
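# Note on the layout the R call sees (a fact about R, not from the source):
# matrix() fills column-major, so IntVector([ref_ac1, alt_ac1, ref_ac2,
# alt_ac2]) with ncol=2 is the 2x2 table
#
#          pool1    pool2
#   ref    ref_ac1  ref_ac2
#   alt    alt_ac1  alt_ac2
#
# and fisher.test(..., alternative='t') runs the two-sided test; the code
# above reads the p-value from rfisher[0][0] and the odds-ratio estimate
# from rfisher[2][0].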
def run_fisher(args):
    ''' run Fisher's Exact test '''
    sz_utils.make_dirs_if_necessary(args.outp)
    sz_utils.check_if_files_exist(args.ac_file)
    tables = sz_utils._count2table(args.ac_file)[0]

    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
        sys.exit()
    else:
        pvals, odds_ratios, log10_pvals = {}, {}, {}
        while args.nproc:
            # collect the per-process temporary result files
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    log10_pval = tmp_line[4]
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
                    if (chr, pos) not in log10_pvals:
                        log10_pvals[chr, pos] = log10_pval
            os.remove(file)
            args.nproc -= 1
        ColorText().info("[poolseq_tk]: Ran Fisher's Exact tests successfully\n", "stderr")

        # correct raw p-values and make QQ plots
        ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
                         % (args.adj_method, args.adj_cutoff * 100), "stderr")
        raw_pvals = [pvals[k] for k in sorted(pvals.iterkeys())]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini-Hochberg procedure %.5e"
                         % (sz_utils.getFDR_BH(pvals, args.adj_cutoff)), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".fisher.all"
        out_fdr = args.outp + ".fisher.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".fisher.fdr%d.expect" % (args.adj_cutoff * 100)
        with open(out_all, 'w') as fALL, \
             open(out_fdr, 'w') as fFDR, \
             open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals.iterkeys())):
                chr = k[0]
                pos = k[1]
                raw_pval = pvals[k]
                log_pval = log10_pvals[k]
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[k][1:3]),
                                                tables[k][3:], raw_pval,
                                                log_pval, padjust[i], odds_ratio)
                    if ((args.oddsr_direction == "greater" and odds_ratios[k] > 1) or
                        (args.oddsr_direction == "less" and odds_ratios[k] < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[k][1:3]),
                                                    tables[k][3:], raw_pval,
                                                    log_pval, padjust[i], odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[k][1:3]),
                                            tables[k][3:], raw_pval,
                                            log_pval, padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finished successfully\n", "stderr")
def run_count(args):
    ''' count alleles at each SNP in the given pileup files '''
    dPos = {}
    if args.pos:
        ColorText().info("[poolseq_tk] reading SNP positions:", "stderr")
        with open(args.pos, 'r') as fPOS:
            for line in fPOS:
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (chr, pos) not in dPos:
                    dPos[chr, pos] = 1
        ColorText().info(" %d\n" % (len(dPos)), "stderr")
    else:
        ColorText().info("[poolseq_tk] no SNP positions provided ... [skipped]\n", "stderr")
    ac = collections.defaultdict(tuple)
    for pileup in args.pileups:
        sz_utils.check_if_files_exist(pileup)
        nsnps = 0
        ColorText().info("[poolseq_tk] counting alleles in %s:"
                         % (os.path.basename(pileup)), "stderr")
        with open(pileup, 'r') as fMPILEUP:
            for line in fMPILEUP:
                nsnps += 1
                tmp_line = line.strip().split("\t")
                chr = tmp_line[0]
                pos = int(tmp_line[1])
                if (((chr, pos) in dPos and args.pos) or
                    (len(dPos) == 0 and not args.pos)):
                    ref_base = tmp_line[2]
                    alt_base = tmp_line[3]
                    nRefAlleles, nAltAlleles = 0, 0
                    if len(tmp_line) == 5:
                        # count both cases of each base in the reads-bases column
                        nRefAlleles = (tmp_line[-1].count(ref_base) +
                                       tmp_line[-1].count(ref_base.lower()))
                        nAltAlleles = (tmp_line[-1].count(alt_base) +
                                       tmp_line[-1].count(alt_base.lower()))
                    if (chr, pos) not in ac:
                        ac[chr, pos] = [ref_base, alt_base,
                                        str(nRefAlleles), str(nAltAlleles)]
                    else:
                        ac[chr, pos] += [str(nRefAlleles), str(nAltAlleles)]
        ColorText().info(" %d SNPs parsed\n" % (nsnps), "stderr")
    fOUT = None
    if args.out == sys.stdout:
        fOUT = sys.stdout
    else:
        sz_utils.make_dirs_if_necessary(args.out)
        fOUT = open(args.out, 'w')
    ColorText().info("[poolseq_tk] outputting allele counts to table ...", "stderr")
    for k in sorted(ac.iterkeys()):
        chr = k[0]
        pos = k[1]
        i = 2
        # report only SNPs covered in every pileup file
        if len(ac[k][i:]) == 2 * len(args.pileups):
            fOUT.write("%s\t%d\t%s" % (chr, pos, "\t".join(ac[k][0:2])))
            while i <= len(ac[k]) - 4:
                fOUT.write("\t%s" % (":".join(ac[k][i:i + 4])))
                i += 4
            fOUT.write("\n")
    ColorText().info(" [done]\n", "stderr")
    fOUT.close()
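# Toy illustration (not from the source) of the per-pileup counting above:
# both cases of each base in the reads-bases column are counted.
def _demo_count_alleles():
    reads_bases = "AAaTtA"
    assert reads_bases.count("A") + reads_bases.count("a") == 4   # ref A
    assert reads_bases.count("T") + reads_bases.count("t") == 2   # alt T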
def making_plot(args):
    ''' make Q-Q plot and Manhattan plot '''
    # install the qqman package if not already installed
    if not rpackages.isinstalled("qqman"):
        rutils = rpackages.importr('utils')
        rutils.chooseCRANmirror(ind=84)
        rutils.install_packages("qqman")
    # get p-values
    ColorText().info("[poolseq_tk]: Extracting P-Values ... ", "stderr")
    data = collections.defaultdict()
    chrs = []
    pvals, adjust_pvals = {}, {}
    nchr = 0
    with open(args.input, 'r') as fIN:
        for line in fIN:
            tmp_line = line.strip().split("\t")
            chr = tmp_line[0]
            pos = int(tmp_line[1])
            if chr not in chrs:
                chrs.append(chr)
                nchr += 1
            data[chr, pos] = nchr
            pvals[chr, pos] = float(tmp_line[8])
    ColorText().info(" [done]\n", "stderr")
    # get the FDR cutoff using BH if not provided on the command line
    pcutoff = 0.0
    if not args.pcutoff:
        ColorText().info("[poolseq_tk]: Getting p-value cutoff at FDR %d%%: "
                         % (args.fdrlevel * 100), "stderr")
        pcutoff = sz_utils.getFDR_BH(pvals, args.fdrlevel)
        ColorText().info("%.5e\n" % (pcutoff), "stderr")
    else:
        pcutoff = args.pcutoff
        ColorText().info("[poolseq_tk]: p-value cutoff provided: %.5e\n"
                         % (pcutoff), "stderr")
    # get SNPs to highlight
    snps_to_highlight = []
    if args.highlight_snps:
        ColorText().info("[poolseq_tk]: Getting SNPs to be highlighted in Manhattan plot ... ",
                         "stderr")
        with open(args.highlight_snps, 'r') as fHIGHLIGHT:
            for line in fHIGHLIGHT:
                tmp_line = line.strip().split("\t")
                snps_to_highlight.append('_'.join(tmp_line[:2]))
        ColorText().info(" [done]\n", "stderr")
    if args.pdf:
        out_qqplot = args.outp + ".qqplot.pdf"
        out_manhattan = args.outp + ".manhattan.pdf"
    elif args.png:
        # saving to PNG probably won't work
        out_qqplot = args.outp + ".qqplot.png"
        out_manhattan = args.outp + ".manhattan.png"
    sz_utils.make_dirs_if_necessary(out_qqplot, out_manhattan)
    grdevices = rpackages.importr('grDevices')
    raw_pvals_vector = robjects.FloatVector([pvals[k] for k in sorted(pvals.iterkeys())])
    ColorText().info("[poolseq_tk]: Making Q-Q plot ...", "stderr")
    make_qqplots(grdevices, raw_pvals_vector, out_qqplot, args.qqtitle)
    ColorText().info(" [done]\n", "stderr")
    ColorText().info("[poolseq_tk]: Making Manhattan plot ...", "stderr")
    make_manhattan(grdevices, data, raw_pvals_vector, snps_to_highlight,
                   pcutoff, out_manhattan, args.mantitle, args.manx, args.manxlim)
    ColorText().info(" [done]\n", "stderr")
def run_cmh(args):
    ''' run the Cochran-Mantel-Haenszel test '''
    sz_utils.make_dirs_if_necessary(args.outp)
    allele_counts = {}
    pvals = {}
    tables = collections.defaultdict(list)
    ntests = 0
    tables, ntables_per_snp = sz_utils._count2table(args.table_file)
    ColorText().info("[poolseq_tk]: %d tables prepared\n" % (len(tables)), "stderr")

    task_q = mp.JoinableQueue()
    result_q = mp.Queue()
    create_procs(args.nproc, task_q, result_q, ntables_per_snp, args.outp)
    sz_utils._assign_tables(tables, task_q, args.nproc)

    # wait for all tasks to finish
    try:
        task_q.join()
    except KeyboardInterrupt:
        ColorText().info("[poolseq_tk]: Terminated unexpectedly by keyboard\n", "stderr")
        sys.exit()
    else:
        # merge results
        pvals, odds_ratios = {}, {}
        while args.nproc:
            file = result_q.get()
            with open(file, 'r') as fIN:
                for line in fIN:
                    tmp_line = line.strip().split("\t")
                    chr = tmp_line[0]
                    pos = int(tmp_line[1])
                    pval = float(tmp_line[2])
                    odds_ratio = float(tmp_line[3])
                    if (chr, pos) not in pvals:
                        pvals[chr, pos] = pval
                    if (chr, pos) not in odds_ratios:
                        odds_ratios[chr, pos] = odds_ratio
            os.remove(file)
            args.nproc -= 1
        ColorText().info("[poolseq_tk]: Ran CMH tests successfully\n", "stderr")

        # correct raw p-values
        ColorText().info("[poolseq_tk]: multi-testing correction using %s method at %d%% level ..."
                         % (args.adj_method, args.adj_cutoff * 100), "stderr")
        raw_pvals = [pvals[chr, pos] for chr, pos in sorted(pvals.iterkeys())]
        raw_pvals_vector = robjects.FloatVector(raw_pvals)
        padjust = robjects.r['p.adjust'](raw_pvals_vector, method=args.adj_method)
        ColorText().info(" [done]\n", "stderr")
        pcutoff = sz_utils.getFDR_BH(pvals, args.adj_cutoff)
        ColorText().info("[poolseq_tk]: p-value cutoff using Benjamini-Hochberg procedure %.5e"
                         % (pcutoff), "stderr")
        ColorText().info(" [done]\n", "stderr")

        # output p-values
        ColorText().info("[poolseq_tk]: output to files ...", "stderr")
        out_all = args.outp + ".cmh.all"
        out_fdr = args.outp + ".cmh.fdr%d" % (args.adj_cutoff * 100)
        out_expect = args.outp + ".cmh.fdr%d.expect" % (args.adj_cutoff * 100)
        sz_utils.make_dirs_if_necessary(out_all, out_fdr)
        with open(out_all, 'w') as fALL, \
             open(out_fdr, 'w') as fFDR, \
             open(out_expect, 'w') as fEXPECT:
            for i, k in enumerate(sorted(pvals.iterkeys())):
                chr = k[0]
                pos = k[1]
                raw_pval = pvals[chr, pos]
                log_pval = None
                if raw_pval == 0.0:
                    log_pval = "Inf"
                elif math.isnan(raw_pval):      # guard against NaN p-values
                    raw_pval = 1.0
                    log_pval = 0.0
                else:
                    log_pval = -1 * math.log10(raw_pval)
                odds_ratio = odds_ratios[k]
                if padjust[i] <= args.adj_cutoff:
                    sz_utils._results_outputter(fFDR, pos, chr,
                                                "\t".join(tables[chr, pos][1:3]),
                                                tables[chr, pos][3:], raw_pval,
                                                log_pval, padjust[i], odds_ratio)
                    if ((args.oddsr_direction == "greater" and odds_ratios[chr, pos] > 1) or
                        (args.oddsr_direction == "less" and odds_ratios[chr, pos] < 1)):
                        sz_utils._results_outputter(fEXPECT, pos, chr,
                                                    "\t".join(tables[chr, pos][1:3]),
                                                    tables[chr, pos][3:], raw_pval,
                                                    log_pval, padjust[i], odds_ratio)
                sz_utils._results_outputter(fALL, pos, chr,
                                            "\t".join(tables[chr, pos][1:3]),
                                            tables[chr, pos][3:], raw_pval,
                                            log_pval, padjust[i], odds_ratio)
        ColorText().info(" [done]\n", "stderr")
        ColorText().info("[poolseq_tk]: Program finished successfully\n", "stderr")
def run_collapse(args):
    '''
    Given two pileup files of the same region, e.g. 2l+ and 2la,
    collapse the pileups at each corresponding SNP.
    Some SNPs are not reported in one or the other pileup file,
    so a full list of SNP positions is required.
    '''
    m1_base = os.path.basename(args.m1)
    m2_base = os.path.basename(args.m2)
    # first, get the full list of SNPs
    dSNPs = get_SNPs(args.snps)
    # second, read each of the pileup files
    chr1, dM1 = read_mpileup(args.m1, args.offset1)
    chr2, dM2 = read_mpileup(args.m2, args.offset2)
    ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" % (m1_base, len(dM1)), "stderr")
    ColorText().info("[poolseq_tk] %s: %d SNPs parsed\n" % (m2_base, len(dM2)), "stderr")
    fOUT = None
    if args.out != sys.stdout:
        outdir = os.path.dirname(os.path.realpath(args.out))
        sz_utils.make_dirs_if_necessary(outdir)
        fOUT = open(args.out, 'w')
    else:
        fOUT = args.out
    ColorText().info("[poolseq_tk]: collapsing mpileups %s and %s ..."
                     % (m1_base, m2_base), "stderr")
    for pos in sorted(dSNPs.iterkeys()):
        reads_bases_collapsed = ""
        if pos in dM1 and pos in dM2:
            # dSNPs[pos][0]: ref base of the m1 pileup
            # dSNPs[pos][1]: ref base of the m2 pileup
            # dM1[pos][0]:   ref base of the m1 pileup
            # dM2[pos][0]:   ref base of the m2 pileup
            if dSNPs[pos][0] == dM1[pos][0] and dSNPs[pos][1] == dM2[pos][0]:
                reads_bases_collapsed = parseReadsBases(dM1[pos][0], dM2[pos][0],
                                                        dM1[pos][1])
                reads_bases_collapsed += parseReadsBases(dM2[pos][0], dM1[pos][0],
                                                         dM2[pos][1])
                fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
                           % (chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
                              reads_bases_collapsed))
            else:
                # bark if the same site has different states
                ColorText().error("SNP position: %d %s %s\t\tMpileup position: %d %s %s\n"
                                  % (pos, dSNPs[pos][0], dSNPs[pos][1],
                                     pos, dM1[pos][0], dM2[pos][0]), "stderr")
        # SNPs missing from both pileup files
        elif pos not in dM1 and pos not in dM2:
            fOUT.write("%s/%s\t%d\t%s\t%s\n"
                       % (chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1]))
        # SNPs in the m1 pileup file but not in m2
        elif pos in dM1 and pos not in dM2:
            reads_bases_collapsed = parseReadsBases(dM1[pos][0], dSNPs[pos][1],
                                                    dM1[pos][1])
            fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
                       % (chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
                          reads_bases_collapsed))
        # SNPs in the m2 pileup file but not in m1
        elif pos not in dM1 and pos in dM2:
            reads_bases_collapsed = parseReadsBases(dM2[pos][0], dSNPs[pos][0],
                                                    dM2[pos][1])
            fOUT.write("%s/%s\t%d\t%s\t%s\t%s\n"
                       % (chr1, chr2, pos, dSNPs[pos][0], dSNPs[pos][1],
                          reads_bases_collapsed))
    ColorText().info(" [done]\n", "stderr")
    fOUT.close()
def check_if_files_exist(*files):
    for file in files:
        if not os.path.exists(file):
            ColorText().error("\n[poolseq_tk] ERROR: cannot find file %s\n"
                              % (os.path.realpath(file)))
            sys.exit(1)