def knownRegions(refdict, chromosomes):
    """Build one whole-chromosome region for every requested chromosome.

    The sequence dictionary ``refdict`` is scanned for lines containing
    ``@SQ``. On each such line the token after ``SN:`` is taken as the
    sequence name and the token after ``LN:`` as its length. A name that does
    not already start with ``chr`` is given that prefix before it is compared
    with the requested chromosomes.

    Args:
        refdict: Path to the sequence dictionary of the reference genome.
        chromosomes: Iterable of chromosome names to look up.

    Returns:
        A dict mapping each requested chromosome to ``[(0, length)]``.
        ``length`` is ``None`` for a chromosome that does not appear in the
        dictionary; a warning listing those chromosomes is logged.

    Raises:
        AssertionError: If ``refdict`` is not an existing file, or an ``@SQ``
            line lacks the ``SN:`` or ``LN:`` field.
    """
    ends = {c: None for c in chromosomes}
    assert os.path.isfile(refdict), \
        "Reference dictionary not found: {}".format(refdict)

    with open(refdict, 'r') as file:
        for line in file:
            if "@SQ" not in line:
                continue
            assert "SN:" in line and "LN:" in line, \
                "Malformed @SQ line: {}".format(line.strip())
            c = line.split('SN:')[1].split()[0]
            # Test the leading three characters: checking the tail (c[3:])
            # would prepend "chr" to names that already carry the prefix.
            if not c.startswith("chr"):
                c = "chr{}".format(c)
            # ``ends`` has exactly the requested names as keys.
            if c in ends:
                ends[c] = int(line.split('LN:')[1].split()[0])

    missing = [c for c in ends if ends[c] is None]
    if missing:
        log(msg=
            "The following chromosomes have not been found in the dictionary of the reference genome: \n\t{}"
            .format(','.join(missing)),
            level="WARN")

    return {c: [(0, ends[c])] for c in ends}
def main():
    """Parse the allele-counting arguments and run the matching mode.

    The arguments come from ``ap.parse_baf_arguments()`` and are logged with
    ``logArgs``. ``defaultMode`` is run when ``args["reference"]`` is truthy,
    ``naiveMode`` otherwise.

    NOTE(review): a second ``main`` is defined further down in this source;
    if both live in the same module the later one shadows this one. Confirm.
    """
    log(msg="# Parsing the input arguments, checking the consistency of "
            "given files, and extracting required information\n",
        level="STEP")

    args = ap.parse_baf_arguments()
    logArgs(args, 80)

    # Pick the mode first, then invoke it; only the selected name is resolved.
    run_mode = defaultMode if args["reference"] else naiveMode
    run_mode(args)
def defaultMode(args):
    """Call SNPs on the normal sample, then count their alleles in the tumors.

    Steps, in order:
      1. ``SNPCalling.call`` on the normal sample.
      2. ``selectHetSNPs`` on the resulting counts.
      3. Write the selected SNPs (fields 1 and 2 of each record) to
         ``args["outputSnps"]``.
      4. Write the normal-sample records to ``args["outputNormal"]``, or to
         standard output when that entry is ``None``.
      5. ``AlleleCounting.count`` on the tumor samples, restricted to the SNP
         list written in step 3.
      6. Write the tumor records to ``args["outputTumors"]``, or to standard
         output when that entry is ``None``.

    ``sp.close`` is invoked with a message whenever a step yields nothing.

    Args:
        args: Mapping of parsed arguments. Keys read here: ``samtools``,
            ``bcftools``, ``reference``, ``normal``, ``samples``,
            ``chromosomes``, ``j``, ``snps``, ``q``, ``Q``, ``qual``,
            ``mincov``, ``maxcov``, ``E``, ``regions``, ``verbose``,
            ``gamma``, ``maxshift``, ``outputSnps``, ``outputNormal`` and
            ``outputTumors``.
    """
    row_template = "{}\t{}\t{}\t{}\t{}\n"

    def dump(table, samples, sink):
        # One row per record, visiting samples and then chromosomes in order;
        # (sample name, chromosome) pairs missing from ``table`` are skipped.
        for sample in samples:
            for chro in args["chromosomes"]:
                key = (sample[1], chro)
                if key not in table:
                    continue
                for record in table[key]:
                    sink.write(
                        row_template.format(*(record[i] for i in range(5))))

    log(msg="# Inferring SNPs from the normal sample\n", level="STEP")

    # Keyword arguments passed unchanged to both the SNP caller and the
    # allele counter.
    common = dict(
        samtools=args["samtools"],
        bcftools=args["bcftools"],
        reference=args["reference"],
        chromosomes=args["chromosomes"],
        num_workers=args["j"],
        q=args["q"],
        Q=args["Q"],
        mincov=args["mincov"],
        dp=args["maxcov"],
        E=args["E"],
        verbose=args["verbose"],
    )

    snps = SNPCalling.call(samples=[args["normal"]],
                           snplist=args["snps"],
                           qual=args["qual"],
                           regions=args["regions"],
                           **common)
    if not snps:
        sp.close("No SNPs found in the normal!\n")

    log(msg="# Selecting heterozygous SNPs\n", level="STEP")
    hetSNPs = selectHetSNPs(counts=snps,
                            gamma=args["gamma"],
                            maxshift=args["maxshift"],
                            verbose=args["verbose"])
    if not hetSNPs:
        sp.close(
            "No heterozygous SNPs found in the selected regions of the normal!\n"
        )

    log(msg="# Writing the list of selected SNPs, covered and heterozygous "
            "in the normal sample\n",
        level="STEP")
    normal = args["normal"]
    with open(args["outputSnps"], 'w') as snp_file:
        for chro in args["chromosomes"]:
            key = (normal[1], chro)
            if key in hetSNPs:
                for snp in hetSNPs[key]:
                    snp_file.write("{}\t{}\n".format(snp[1], snp[2]))

    log(msg="# Writing the allele counts of the normal sample for selected "
            "SNPs\n",
        level="STEP")
    if args["outputNormal"] is None:
        dump(hetSNPs, [normal], sys.stdout)
    else:
        with open(args["outputNormal"], 'w') as normal_file:
            dump(hetSNPs, [normal], normal_file)

    log(msg="# Counting the alleles of tumor samples for selected SNPs\n",
        level="STEP")
    counts = AlleleCounting.count(samples=args["samples"],
                                  snplist=args["outputSnps"],
                                  **common)
    if not counts:
        sp.close("The selected SNPs are not covered in the tumors!\n")

    log(msg="# Writing the allele counts of tumor samples for selected SNPs\n",
        level="STEP")
    if args["outputTumors"] is None:
        dump(counts, args["samples"], sys.stdout)
    else:
        with open(args["outputTumors"], 'w') as tumor_file:
            dump(counts, args["samples"], tumor_file)
def logArgs(args, width):
    """Log every entry of ``args`` at INFO level, one ``key: value`` per line.

    The message starts with a newline and each entry is indented with a tab.

    Args:
        args: Mapping whose keys and values are reported.
        width: Accepted but not used by this function.
    """
    entries = ["\t{}: {}\n".format(name, args[name]) for name in args]
    log(msg="\n" + "".join(entries), level="INFO")
def _write_bin_counts(out, bins, samples, chromosomes):
    """Write the first five fields of each bin record to ``out`` as a TSV row.

    Records are looked up as ``bins[sample[1], chromosome]`` and visited by
    sample, then by chromosome. A missing pair raises ``KeyError``.
    """
    for sample in samples:
        for c in chromosomes:
            for count in bins[sample[1], c]:
                out.write("{}\t{}\t{}\t{}\t{}\n".format(
                    count[0], count[1], count[2], count[3], count[4]))


def main():
    """Bin and count reads for the normal and tumor samples.

    Steps, in order:
      1. Parse the arguments with ``ap.parse_bin_arguments()`` and log them.
      2. Take the regions from ``knownRegions`` when ``args["regions"]`` is
         ``None``, otherwise from ``ap.parseRegions``.
      3. Bin the normal sample with ``bb.bin`` and write its records to
         ``args["outputNormal"]`` (standard output when ``None``).
      4. Bin the tumor samples and write their records to
         ``args["outputTumors"]`` (standard output when ``None``).
      5. Count total reads with ``tc.tcount`` and write one
         ``sample<TAB>total`` line per sample to ``args["outputTotal"]``,
         normal first.

    Raises:
        SystemExit: If binning yields nothing for the normal sample or for
            the tumor samples.
        KeyError: If a (sample, chromosome) pair is missing from the binned
            records or from the total counts.
    """
    log(msg="# Parsing and checking input arguments\n", level="STEP")
    args = ap.parse_bin_arguments()
    logArgs(args, 80)

    if args["regions"] is None:
        log(msg=
            "# Retrieving genomic regions to consider from maximum chromosome length\n",
            level="STEP")
        regions = knownRegions(args["refdict"], args["chromosomes"])
    else:
        log(msg="# Checking the consistency of the given regions\n",
            level="STEP")
        regions = ap.parseRegions(args["regions"], args["chromosomes"])

    if args["verbose"]:
        msg = "regions: " + "".join(
            " {}: {}".format(c, regions[c]) for c in args["chromosomes"])
        log(msg=msg + "\n", level="INFO")

    log(msg="# Binning and counting the normal sample\n", level="STEP")
    normal_bins = bb.bin(samtools=args["samtools"],
                         samples=[args["normal"]],
                         chromosomes=args["chromosomes"],
                         num_workers=args["j"],
                         q=args["q"],
                         size=args["size"],
                         regions=regions,
                         verbose=args["verbose"])
    if not normal_bins:
        raise SystemExit("No bins in the normal sample!\n")

    log(msg="# Writing the read counts for bins of normal sample\n",
        level="STEP")
    if args["outputNormal"] is not None:
        with open(args["outputNormal"], 'w') as f:
            _write_bin_counts(f, normal_bins, [args["normal"]],
                              args["chromosomes"])
    else:
        _write_bin_counts(sys.stdout, normal_bins, [args["normal"]],
                          args["chromosomes"])

    log(msg="# Binning and counting the tumor samples\n", level="STEP")
    tumor_bins = bb.bin(samtools=args["samtools"],
                        samples=args["samples"],
                        chromosomes=args["chromosomes"],
                        num_workers=args["j"],
                        q=args["q"],
                        size=args["size"],
                        regions=regions,
                        verbose=args["verbose"])
    if not tumor_bins:
        # Abort the same way as for the normal sample. ``log`` is called as a
        # plain function throughout this function, so the previous
        # ``log.close(...)`` would have failed with AttributeError here.
        raise SystemExit("No bins in the tumor samples!\n")

    log(msg="# Writing the read counts for bins of tumor samples\n",
        level="STEP")
    if args["outputTumors"] is not None:
        with open(args["outputTumors"], 'w') as f:
            _write_bin_counts(f, tumor_bins, args["samples"],
                              args["chromosomes"])
    else:
        _write_bin_counts(sys.stdout, tumor_bins, args["samples"],
                          args["chromosomes"])

    log(msg="# Counting total number of reads for normal and tumor samples\n",
        level="STEP")
    # The union with a set literal requires args["samples"] to support ``|``
    # with a set (presumably it is a set of sample tuples -- confirm).
    total_counts = tc.tcount(samtools=args["samtools"],
                             samples=({args["normal"]} | args["samples"]),
                             chromosomes=args["chromosomes"],
                             num_workers=args["j"],
                             q=args["q"],
                             verbose=args["verbose"])

    try:
        total = {
            sample[1]: sum(total_counts[sample[1], chromosome]
                           for chromosome in args["chromosomes"])
            for sample in args["samples"]
        }
        total[args["normal"][1]] = sum(
            total_counts[args["normal"][1], chromosome]
            for chromosome in args["chromosomes"])
    except KeyError:
        raise KeyError(
            "Either a chromosome or a sample has not been considered in the total counting!"
        )

    log(msg="# Writing the total read counts for all samples in {}\n".format(
        args["outputTotal"]),
        level="STEP")
    with open(args["outputTotal"], 'w') as f:
        f.write("{}\t{}\n".format(args["normal"][1], total[args["normal"][1]]))
        for sample in args["samples"]:
            f.write("{}\t{}\n".format(sample[1], total[sample[1]]))