def create_sorted_bam(samfile, outlabel, logfh, force, keepsam):
    bamfile = outlabel + '.bam'
    logfh.write("\nSorting sam: {}\n".format(samfile))
    logfh.write("Creating bam: {}\n".format(bamfile))
    cmd = picardExe[:] + ['SortSam', 'I='+samfile, 'O='+bamfile,
                          'SORT_ORDER=coordinate']
    logfh.write(" ".join(cmd)+"\n")
    if have_file(bamfile, force, stderr=logfh):
        logfh.write("Already have bam file: {}\n".format(bamfile))
    else:
        logfh.flush()
        subprocess.call(cmd, stderr=logfh)
    if not keepsam and have_file(bamfile, stderr=logfh):
        remove_file(samfile, stderr=logfh)
    return bamfile
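# have_file(), have_files() and remove_file() are helper functions defined
# elsewhere in this package and are not shown in this section.  The sketch
# below is an assumption for illustration only, not the pipeline's actual
# implementation; it captures the "skip the step if its output already
# exists, unless --force was given" pattern used by the functions here.
def have_file(path, force=False, quiet=False, stderr=sys.stderr):
    """Illustrative sketch only: return True if a non-empty file exists.
    When force is set, remove any existing copy and return False so the
    caller regenerates it."""
    if os.path.isfile(path) and os.path.getsize(path) > 0:
        if force:
            os.remove(path)
            return False
        if not quiet:
            stderr.write("Already have {}\n".format(path))
        return True
    return False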
def run_gatk(bamfile, label, ref, args, logfh=sys.stderr):
    caller = 'HaplotypeCaller'
    outvcf = label + '.gatk.vcf'
    if args.cohort:
        outvcf = outvcf.replace('.vcf', '-cohort.vcf')
    cmd = gatkExe[:] + ['-T', caller, '--genotyping_mode', 'DISCOVERY']
    cmd.extend(['-R', ref, '-I', bamfile, '-o', outvcf])
    if args.intervals:
        cmd.extend(['-L', args.intervals])
    if args.dbsnp:
        cmd.extend(["--dbsnp", args.dbsnp])
    if args.maxreads:
        cmd.extend(['--maxReadsInRegionPerSample', str(args.maxreads)])
    if args.debug:
        bamout = label + '.gatk-debug.bam'
        cmd.extend(['-bamout', bamout])
    if args.cohort:
        cmd.extend(['-ERC', 'GVCF', '--variant_index_type', 'LINEAR'])
        cmd.extend(['--variant_index_parameter', '128000'])
#    if args.basequal:
#        cmd.extend(['-mbq', str(args.basequal)])
    logfh.write("\nGATK: " + " ".join(cmd) + "\n")
    if have_file(outvcf, args.force, stderr=logfh):
        logfh.write(" Already have {}.\n".format(outvcf))
    else:
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if not os.path.isfile(outvcf):
            logfh.write(" Failed to create {}\n".format(outvcf))
            sys.stderr.write(" Failed to create {}\n".format(outvcf))
            sys.exit(1)
    return outvcf
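# For a single-sample run, the command assembled above looks roughly like
# the following (gatkExe is a command prefix defined elsewhere, presumably a
# "java -jar GenomeAnalysisTK.jar" invocation; file names are placeholders):
#
#   <gatkExe> -T HaplotypeCaller --genotyping_mode DISCOVERY \
#       -R ref.fa -I sample.bam -o sample.gatk.vcf \
#       [-L intervals] [--dbsnp dbsnp.vcf] [--maxReadsInRegionPerSample N] \
#       [-bamout sample.gatk-debug.bam] \
#       [-ERC GVCF --variant_index_type LINEAR --variant_index_parameter 128000]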
def calculate_depths(label, infiles, args):
    bamfile = infiles[label]['bam']
    vcffile = infiles[label]['vcf']
    outlabel = get_outlabel(bamfile, 'bam', args.outdir)
    loglabel = get_outlabel(bamfile, 'bam', args.logdir) if args.logdir \
        else outlabel
    logfile = loglabel + ".calculate_depths.log"
    outfile = outlabel + '.depths{}.txt'.format(args.tag)
    sys.stderr.write(" Calculating depths for {}\n".format(outlabel))
    sys.stderr.write(" Log file: {}\n".format(logfile))
    logfh = open(logfile, 'w')
    logfh.write("Bam file: {}\n".format(bamfile))
    logfh.write("VCF file: {}\n\n".format(vcffile))
    try:
        if have_file(outfile, args.force, stderr=logfh):
            logfh.write(" Already have {}\n".format(outfile))
        else:
            logfh.write("Start time: {}\n".format(timestamp()))
            (vcffields, variants) = parse_vcf(vcffile, logfh)
            depths = find_variant_depths(variants, bamfile, logfh, args)
            add_fwd_rev_depths(depths, variants, bamfile, args)
            print_depths(outfile, variants, depths, vcffields, logfh)
            logfh.write("End time: {}\n".format(timestamp()))
    except Exception as e:
        e.args += (vcffile, )
        raise
def create_annovar_input_file(vcffiles, outlabel, args):
    outfile = outlabel + '.avinput'
    cmd = [annovarInputExe, '-format', 'vcf4', '-allsample', '-withfreq']
    cmd.extend(['-includeinfo'])
    sys.stderr.write("\nCreating annovar input file: "+" ".join(cmd)+"\n")
    if have_file(outfile, args.force):
        sys.stderr.write(" Already have {}.\n".format(outfile))
    else:
        lines = []
        for vcffile in vcffiles:
            sys.stderr.write(" Running: {} {}\n".format(" ".join(cmd), vcffile))
            output = subprocess.check_output(cmd + [vcffile])
            if not output:
                sys.stderr.write(" No output.\n")
            else:
                for line in output.split("\n"):
                    v = line.split("\t")[0:5]
                    if len(v) == 5:
                        (chrom, v[1]) = cftr.CFTR_to_hg19(v[0], v[1])
                        (v[0], v[2]) = cftr.CFTR_to_hg19(v[0], v[2])
                        lines.append("\t".join(map(str, v)))
        uniqlines = sorted(set(lines))
        with open(outfile, 'w') as ofh:
            ofh.write("\n".join(uniqlines))
    if not os.path.isfile(outfile):
        sys.stderr.write(" Failed to create {}\n".format(outfile))
        sys.exit(1)
    return outfile
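# The [0:5] slice keeps the five leading columns of the converter's output
# (the standard ANNOVAR input fields: chromosome, start, end, reference
# allele, alternate allele); both coordinates and the chromosome name are
# then remapped to hg19 with cftr.CFTR_to_hg19(), which appears to translate
# from a CFTR-local coordinate system used by this pipeline.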
def cohort_merge_gvcfs(vcfs, ref, args):
    sys.stderr.write("Genotyping gVCFs: {} vcfs\n".format(len(vcfs)))
    label = args.cohort
    outlabel = os.path.join(args.outdir, label) if args.outdir else label
    outvcf = outlabel + ".gatk-merged.vcf"
    cmd = gatkExe[:] + ["-T", "GenotypeGVCFs"]
    cmd.extend(["-R", ref, "-o", outvcf])
    if args.intervals:
        cmd.extend(["-L", args.intervals])
    if args.dbsnp:
        cmd.extend(["--dbsnp", args.dbsnp])
    variants = ("--variant " + " --variant ".join(vcfs)).split(" ")
    cmd += variants
    if have_file(outvcf, args.force):
        sys.stderr.write(" Already have {}.\n".format(outvcf))
    else:
        loglabel = os.path.join(args.logdir, label) if args.logdir else label
        logfile = loglabel + ".gatk-merged.log"
        sys.stderr.write(" Log file {}\n".format(logfile))
        with open(logfile, "w") as logfh:
            logfh.write("CMD: {}\n".format(cmd))
            logfh.flush()
            check_call(cmd, stderr=logfh)
        if os.path.isfile(outvcf):
            sys.stderr.write(" Merged gVCF: {}\n".format(outvcf))
        else:
            sys.stderr.write(" Failed to create {}\n".format(outvcf))
            sys.exit(1)
    return outvcf
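# The join/split above expands the gVCF list into alternating flag/value
# arguments, e.g. for vcfs = ['s1.gatk-cohort.vcf', 's2.gatk-cohort.vcf'] it
# yields ['--variant', 's1.gatk-cohort.vcf', '--variant', 's2.gatk-cohort.vcf'].
# Note it assumes no path contains a space; an equivalent that avoids that
# assumption would be:
#
#   variants = []
#   for vcf in vcfs:
#       variants.extend(['--variant', vcf])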
def run_annovar(annov_input, outlabel, refdir, args):
    outfile1 = outlabel + '-hgvs.variant_function'
    outfile2 = outlabel + '-hgvs.exonic_variant_function'
    cmd = [annovarExe, '-build', 'hg19', '-hgvs', '-out', outlabel+'-hgvs']
    cmd.extend([annov_input, refdir])
    sys.stderr.write("\nRunning annovar: "+" ".join(cmd)+"\n")
    if have_file(outfile1, args.force) and have_file(outfile2, args.force):
        sys.stderr.write(" Already have {} and {}.\n".format(outfile1, outfile2))
    else:
        subprocess.check_call(cmd)
    if not os.path.isfile(outfile1):
        sys.stderr.write(" Failed to create {}\n".format(outfile1))
        sys.exit(1)
    if not os.path.isfile(outfile2):
        sys.stderr.write(" Failed to create {}\n".format(outfile2))
        sys.exit(1)
    return (outfile1, outfile2)
def run_bwa(sample, outlabel, ref, fqfiles, logfh, force):
    samfile = outlabel + ".sam"
    bamfile = outlabel + ".bam"
    logfh.write("Output sam: {}\n".format(samfile))
    readgroup = "\\t".join(['@RG', "ID:"+sample, "SM:"+sample,
                            "PL:Illumina", "LB:"+sample, "PU:unit1"])
    cmd = [bwaExe, 'mem', '-M', '-R', readgroup, ref] + fqfiles
    logfh.write(" ".join(cmd)+"\n")
    if have_file(samfile, force, stderr=logfh):
        logfh.write("Already have sam file: {}\n".format(samfile))
    elif have_file(bamfile, force, stderr=logfh):
        logfh.write("Already have bam file: {}\n".format(bamfile))
    else:
        logfh.flush()
        output = subprocess.check_output(cmd, stderr=logfh)
        with open(samfile, 'w') as ofh:
            ofh.write(output)
    return samfile
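# bwa mem expects the read-group string with literal backslash-t separators
# on the command line, which is why the join uses "\\t"; the resulting -R
# value looks like (sample name is a placeholder):
#   @RG\tID:sample1\tSM:sample1\tPL:Illumina\tLB:sample1\tPU:unit1
# The SAM written here can then be coordinate-sorted into a BAM with
# create_sorted_bam() above.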
def index_bam(bamfile, args, logfh=sys.stderr):
    label = bamfile.rstrip('bam').rstrip('.')
    outidx = label + '.bai'
    cmd = picardExe[:] + ['BuildBamIndex', 'I=' + bamfile]
    if not have_file(outidx, args.force, quiet=True, stderr=logfh):
        logfh.write("\nIndex bam: " + " ".join(cmd) + "\n")
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if not os.path.isfile(outidx):
            logfh.write(" Failed to create {}\n".format(outidx))
            sys.stderr.write(" Failed to create {}\n".format(outidx))
            sys.exit(1)
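# Note: rstrip('bam').rstrip('.') strips trailing characters rather than a
# suffix; it happens to work for names ending in ".bam", but an equivalent,
# more conventional way to derive the index name would be:
#
#   outidx = os.path.splitext(bamfile)[0] + '.bai'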
def create_fragment_report(fqpair, frag2primers, read_primer_file, force,
                           logfh):
    logfh.write("\nCreating fragment report {}\n".format(read_primer_file))
    if have_file(read_primer_file, force, stderr=logfh):
        logfh.write(" Already have {}\n".format(read_primer_file))
        return
    readnum_patt = re.compile('.*_(R[12]).*')
    readnums = [readnum_patt.sub('\\1_primer', fqfile) for fqfile in fqpair]
    logfh.write(" Writing {}\n".format(read_primer_file))
    fragcounts = {'tot_fragments': 0, 'unidentified': 0, 'singleton': 0,
                  'paired-good': 0, 'paired-other': 0, 'misprime': 0, }
    with open(read_primer_file, 'w') as ofh:
        ofh.write("Fragment\t" + "\t".join(readnums) +
                  "\tStatus\tEstimated_fragment_size\n")
        for fragname in sorted(frag2primers.keys()):
            row = [fragname, ]
            for fqfile in fqpair:
                if fqfile in frag2primers[fragname]['primers']:
                    row.append(frag2primers[fragname]['primers'][fqfile])
                else:
                    row.append('')
            status = frag2primers[fragname]['status']
            fragcounts[status] += 1
            row.extend([status, frag2primers[fragname]['ampsize']])
            ofh.write("\t".join([str(r) for r in row]) + "\n")
    fragcounts['tot_fragments'] = len(frag2primers.keys())
    logfh.write("{:<12}\t{:<6} fragments\t{:<5}%\n".format(
        "Fragment status", "Number", "Percent"))
    for k in fragcounts:
        perc = fragcounts[k] * 100.0 / fragcounts['tot_fragments']
        logfh.write("{:<12}\t{:>6} fragments\t{:5.1f}%\n".format(
            k, fragcounts[k], perc))
    logfh.write("{:<12}\t{:>6} fragments\n".format(
        "Total", fragcounts['tot_fragments']))
    fragcounts['paired'] = fragcounts['paired-good'] + \
        fragcounts['paired-other']
    if os.path.isfile(read_primer_file):
        sys.stderr.write(" Fragment report {}\n".format(read_primer_file))
    return fragcounts
def create_primer_report(primerreads, primerlist, primer_read_file, logfh,
                         force=False, debug=False):
    """Create file listing each primer and the reads that match it.
    Include counts of unidentified and mismatched reads.  Tally percent
    of total reads amplified by each primer pair."""
    logfh.write("\nCreating primer report {}\n".format(primer_read_file))
    if have_file(primer_read_file, force, stderr=logfh):
        logfh.write(" Already have {}\n".format(primer_read_file))
        return
    logfh.write(" Writing {}\n".format(primer_read_file))
    tot_reads = 0
    keylist = primerlist + ['unidentified', 'misprime']
    primercounts = {}
    primerkeys = []
    with open(primer_read_file, 'w') as ofh:
        ofh.write("Primer\tNum_reads\tReads\n")
        for primer in keylist:
            numreads = len(primerreads[primer])
            tot_reads += numreads
            readlist = ", ".join(primerreads[primer])
            primerpair = primer.rstrip('_F').rstrip('_R')
            if primerpair in primercounts:
                primercounts[primerpair] += numreads
            else:
                primerkeys.append(primerpair)
                primercounts[primerpair] = numreads
            ofh.write("{}\t{}\t{}\n".format(primer, numreads, readlist))
    if debug:
        logfh.write("{:<12}\t{:<6} reads\t{:<5}%\n".format(
            "Primer", "Number", "Percent"))
        for k in keylist:
            numreads = len(primerreads[k])
            perc = numreads * 100.0 / tot_reads
            logfh.write("{:<12}\t{:>6} reads\t{:5.2f}%\n".format(
                k, numreads, perc))
        logfh.write("{:<12}\t{:<6} reads\n".format("Total", tot_reads))
    primercounts['tot_reads'] = tot_reads
    primerkeys.insert(0, 'tot_reads')
    if os.path.isfile(primer_read_file):
        sys.stderr.write(
            " Primer read report {}\n".format(primer_read_file))
    return (primercounts, primerkeys)
def table_annovar(annov_input, outlabel, refdir, args):
    outfile = outlabel + '.hg19_multianno.txt'
    outfile1 = outlabel + '.variant_function'
    outfile2 = outlabel + '.exonic_variant_function'
    (protocol_list, operation_list) = annovar_protocol(annovarDBs, refdir)
    cmd = [annovarTableExe, annov_input, refdir, '-out', outlabel]
    cmd.extend(['-buildver', 'hg19', '-nastring', '.'])
    cmd.extend(['-protocol', protocol_list, '-operation', operation_list])
    sys.stderr.write("\nRunning table_annovar: "+" ".join(cmd)+"\n")
    if have_file(outfile, args.force):
        sys.stderr.write(" Already have {}.\n".format(outfile))
    else:
        subprocess.check_call(cmd)
    if not os.path.isfile(outfile):
        sys.stderr.write(" Failed to create {}\n".format(outfile))
        sys.exit(1)
    return outfile
def run_select_variants(vcffile, outvcf, sample, ref, logfh, args):
    logfh.write("\n-- SelectVariants --\n")
    cmd = gatkExe[:]
    cmd.extend(['-T', 'SelectVariants', '--excludeNonVariants'])
    cmd.extend(['-R', ref])
    cmd.extend(['--variant', vcffile])
    cmd.extend(['-o', outvcf])
    cmd.extend(['-sn', sample])
    logfh.write(" Sample {}:\t{}\n".format(sample, outvcf))
    if have_file(outvcf, args.force):
        sys.stderr.write(" Already have {}.\n".format(outvcf))
    else:
        logfh.write(" ".join(cmd)+"\n")
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if os.path.isfile(outvcf):
            sys.stderr.write(" Created {}\n".format(outvcf))
        else:
            sys.stderr.write(" Failed to create {}\n".format(outvcf))
    return outvcf
def run_aligner(queryfiles, primerfa, outlabel, logfh, force):
    outfile = outlabel + "-primers.cm.out"
    logfh.write("\nAlignment output in {}\n".format(outfile))
    if have_file(outfile, force, stderr=logfh):
        logfh.write(" Already have {}\n".format(outfile))
        return (outfile, None)
    try:
        with open(outfile, 'w') as ofh:
            for queryfile in queryfiles:
                logfh.write(" Aligning {}\n".format(queryfile))
                cmd = [CM_EXE, queryfile, primerfa, "-minscore", "12",
                       "-minmatch", "8", "-tags", "-alignments"]
                logfh.write(" Running {}\n".format(" ".join(cmd)))
                logfh.flush()
                subprocess.check_call(cmd, stdout=ofh, stderr=logfh)
    except subprocess.CalledProcessError as e:
        sys.stderr.write("Error running cross_match for {}\n".format(outfile))
        raise
    if not os.path.isfile(outfile):
        sys.stderr.write("Output file {} not found.\n".format(outfile))
    return outfile
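# CM_EXE is the cross_match aligner (see the error message above).  The
# following is an inference from the call itself, not from cross_match's
# documentation: -minmatch/-minscore are set low to allow matches against
# short primer sequences, and -tags/-alignments make the output parseable
# by parse_alignout().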
def trim_primers(fqfile, alignout, max_trim_len, primerinfo, outlabel,
                 logfh, args):
    """Returns trimmed fastq file and file with list of sequence names"""
    trimmedfq = outlabel + ".trimmed.fastq"
    seqfile = outlabel + ".seqlist.txt"
    logfh.write(" Trimming fq: {}\n".format(trimmedfq))
    if have_files([trimmedfq, seqfile], args.force, stderr=logfh):
        logfh.write(" Already have {}\n".format(trimmedfq))
        return (trimmedfq, seqfile)
    aligns = parse_alignout(alignout)
    seqlist = []
    with open(trimmedfq, 'w') as outfq:
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            seqlist.append(seqrec.id)
            if seqrec.id in aligns:
                primer = aligns[seqrec.id]['primer']
                if primerinfo[primer]['overlap']:
                    primerend = (aligns[seqrec.id]['end'] +
                                 aligns[seqrec.id]['left'])
                    subrec = seqrec[primerend:]
                    if args.debug:
                        logfh.write("{}\tTrimming\t{}\n".format(
                            primer, seqrec.id))
                else:
                    if args.debug:
                        logfh.write("{}\tNot trimming\t{}\n".format(
                            primer, seqrec.id))
                    subrec = seqrec
            else:  # trim default max_primer_len+2
                subrec = seqrec[max_trim_len:]
            outfq.write("{}\n".format(subrec.fastq()))
    logfh.write(" Seq list: {}\n".format(seqfile))
    if have_file(seqfile, True, stderr=logfh):
        logfh.write(" Still have {}\n".format(seqfile))
        sys.exit()
    with open_file(seqfile, 'w') as ifh:
        ifh.write("\n".join(seqlist)+"\n")
    return (trimmedfq, seqfile)
def run_freebayes(bamfile, reffile, bedfile, logfh, args):
    label = get_outlabel(bamfile, args.outdir)
    outvcf = label + '.freebayes.vcf'
    cmd = [freebayesExe, '-f', reffile, '-t', bedfile]
    cmd.extend(['-b', bamfile, '-v', outvcf])
#    cmd.extend(['--max-complex-gap', '5'])
    if args.basequal:
        cmd.extend(['-q', str(args.basequal)])
    if args.minaltcount:
        cmd.extend(['-C', str(args.minaltcount)])
    logfh.write("\nCMD: "+" ".join(cmd)+"\n")
    if have_file(outvcf, args.force):
        sys.stderr.write(" Already have {}.\n".format(outvcf))
    else:
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if os.path.isfile(outvcf):
            sys.stderr.write(" Freebayes result in {}\n".format(outvcf))
        else:
            sys.stderr.write(" Failed to create {}\n".format(outvcf))
            sys.exit(1)
    return outvcf
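# Short FreeBayes options used above: -f reference FASTA, -t target regions
# (BED), -b input BAM, -v output VCF, -q minimum base quality
# (--min-base-quality) and -C minimum alternate-allele observation count
# (--min-alternate-count).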
def create_fasta_of_primer_region(fqfile, max_trim_len, outlabel, logfh,
                                  force):
    """Create fasta files of the first max_trim_len bases of each sequence
    in the given fqfile.  Returns the list of fasta files created."""
    outfile = outlabel + ".primer_region{}.fa".format(max_trim_len)
    logfh.write(" Creating 5' fasta: {}\n".format(outfile))
    outfiles = [outfile, ]
    if have_file(outfile, force, stderr=logfh):
        logfh.write(" Already have {}\n".format(outfile))
    else:
        logfh.write(" Writing {}\n".format(outfile))
        numseqs = 0
        totseqs = 0
        outfa = open(outfile, 'w')
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            numseqs += 1
            subseq = seqrec.seq[0:max_trim_len]
            outfa.write(">{}\n{}\n".format(seqrec.id, subseq))
            if numseqs == MAX_READS:  # start a new fasta file
                outfa.close()
                logfh.write(" Wrote {} seqs\n".format(numseqs))
                numfiles = len(outfiles) + 1
                outfile = outlabel + ".primer_region{}-{}.fa".format(
                    max_trim_len, numfiles)
                logfh.write(" Writing {}\n".format(outfile))
                outfiles.append(outfile)
                outfa = open(outfile, 'w')
                totseqs += numseqs
                numseqs = 0
        outfa.close()
        logfh.write(" Wrote {} seqs\n".format(numseqs))
        if len(outfiles) > 1:
            totseqs += numseqs
            logfh.write(" Wrote {} total seqs\n".format(totseqs))
    return outfiles
def filter_freebayes_vcf(vcffile, outvcf, logfh, args):
#    cmd = [vcffilterExe, '-f', "QUAL > 20", "-f", "DP > 10"]
    cmd = [vcffilterExe, '-s']
    if args.outdir:
        cmd.extend(['-o', args.outdir])
    if args.altbasequal:
        cmd.extend(['-a', str(args.altbasequal)])
    if args.dp:
        cmd.extend(['-d', str(args.dp)])
    if args.qual:
        cmd.extend(['-q', str(args.qual)])
    cmd.append(vcffile)
    logfh.write("\nCMD: " + " ".join(cmd) + "\n")
    if have_file(outvcf, args.force):
        sys.stderr.write(" Already have {}.\n".format(outvcf))
    else:
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if os.path.isfile(outvcf):
            sys.stderr.write(" Filtered FreeBayes result {}\n".format(outvcf))
        else:
            sys.stderr.write(" Failed to create {}\n".format(outvcf))
            sys.exit(1)
    return outvcf
help="Name for summary file.") parser.add_argument("-o", "--outdir", help="Directory for output files.") parser.add_argument("-f", "--force", default=False, action='store_true', help="Overwrite existing files.") parser.add_argument("-p", "--processes", default=1, type=int, help="Number of processes to run.") parser.add_argument("--debug", default=False, action='store_true', help="Debug mode.") parser.add_argument("--logdir", help="Directory for log files.") if len(sys.argv)<2: parser.print_help() sys.exit() args = parser.parse_args() primerfa = RESOURCE['primer_fa'] if not os.path.isfile(primerfa): sys.exit("Could not find resource {}\n".format(primerfa)) (primerinfo, max_primer_len) = primer_info(primerfa) find_overlaps(primerinfo, args.debug) outfiles = align_trim_all_fqfiles(args.fqfiles, primerfa, primerinfo, max_primer_len, args) if args.outdir: args.summary = os.path.join(args.outdir, args.summary) if have_file(args.summary, args.force): sys.stderr.write("Already have {}.\n".format(args.summary)) else: samplecounts = assess_all_primers(args.fqfiles, outfiles, primerinfo, args) print_summary(samplecounts, args.summary)
" .variant_function and" +\ " .exonic_variant_function files.", ) parser.add_argument("-o", "--outdir", help="Directory for output file.",) parser.add_argument("-l", "--label", help="Label for output file.",) parser.add_argument("-f", "--force", default=False, action='store_true', help="Overwrite existing files.") if len(sys.argv)<2: parser.print_help() sys.exit() args = parser.parse_args() outlabel = get_outlabel(args.vcffiles, args) outfile = outlabel + ".results.txt" rejectfile = outlabel + ".rejects.txt" if have_file(outfile, args.force): sys.stderr.write(" Already have {}.\n".format(outfile)) else: flatdata = defaultdict(dict) for vcffile in args.vcffiles: (fields, vcfdata) = parse_vcf(vcffile) (flatdata, newfields) = flatten_vcf_data(fields, vcfdata, flatdata) annovar_data = get_annovar_data(args.annovar) bedfile = cftr.RESOURCE['analysis_roi_bed'] if not os.path.isfile(bedfile): sys.exit("BED file {} not found\n".format(bedfile)) roi = cftr.parse_bedfile(bedfile) create_spreadsheet(newfields, flatdata, annovar_data, roi, outfile, rejectfile, args)
parser.add_argument("-s", "--dbsnp", default=False, action='store_true', help="Add dbsnp to ID field.",) parser.add_argument("-l", "--label", default='filtered', help="Label for output files.",) parser.add_argument("-f", "--force", default=False, action='store_true', help="Overwrite existing files.") if len(sys.argv)<2: parser.print_help() sys.exit() args = parser.parse_args() dbsnp = parse_dbsnp() if args.dbsnp else {} pos_removed = [] for vcffile in args.vcffiles: sys.stderr.write("Processing {}.\n".format(vcffile)) outlabel = os.path.basename(vcffile).rstrip('vcf').rstrip('.') outvcf = outlabel + '.{}.vcf'.format(args.label) if args.outdir: outvcf = os.path.join(args.outdir, outvcf) if have_file(outvcf, args.force): sys.stderr.write(" Already have {}.\n".format(outvcf)) else: sys.stderr.write("Writing {}\n".format(outvcf)) (header, fields, vcfinfo) = parse_vcf(vcffile) if args.dbsnp: add_dbsnp(vcfinfo, dbsnp) pos_rm = filter_vcf(header, fields, vcfinfo, outvcf, dbsnp, args) pos_removed.extend(pos_rm) sys.stderr.write("\nPositions removed: {}\n".format(", ".join( [str(j) for j in sorted([int(i) for i in set(pos_removed)])])))
# parser.add_argument("mol2k", help="Mol2k file of variants.") parser.add_argument("spreadsheets", nargs="+", help="Spreadsheet to check") parser.add_argument("-o", "--outdir", help="Directory for output.") parser.add_argument("-s", "--seen", default=False, action="store_true", help="Reset Seen counter.") parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite existing files.") if len(sys.argv) < 2: parser.print_help() sys.exit() args = parser.parse_args() mol2kfile = cftr.RESOURCE["cftr_db"] if not os.path.isfile(mol2kfile): sys.exit("Could not find mol2k resource file: {}\n".format(mol2kfile)) (mol2k, mol2k_fields) = parse_mol2k(mol2kfile, args.seen) seenfile = os.path.basename(mol2kfile).rstrip("txt").rstrip("csv").rstrip(".") + ".seen.txt" if args.outdir: seenfile = os.path.join(args.outdir, seenfile) for spreadsheet in args.spreadsheets: outfile = spreadsheet.rstrip("txt").rstrip(".") + ".mol2k.txt" if args.outdir: outfile = os.path.join(args.outdir, os.path.basename(outfile)) if have_file(outfile, args.force): sys.stderr.write(" Already have {}.\n".format(outfile)) else: find_mol2k_variants(spreadsheet, mol2k, outfile) if have_file(seenfile, args.force): sys.stderr.write(" Already have {}.\n".format(seenfile)) else: print_mol2k_seen(mol2k, mol2k_fields, seenfile)
help="Label for output files.") parser.add_argument("--debug", action="store_true", default=False, help="Debug mode.") parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite existing files.") if len(sys.argv)<2: parser.print_help() sys.exit() args = parser.parse_args() outfile = args.outlabel + ".tgpolyt_counts.txt" summaryfile = args.outlabel + ".tgpolyt.txt" sys.stderr.write("Writing {}\n".format(outfile)) sys.stderr.write("Writing {}\n".format(summaryfile)) if have_file(outfile, args.force) and have_file(summaryfile, args.force): sys.stderr.write("Already have {} and {}\n".format( outfile, summaryfile)) sys.exit() outfields = ['sample', 'TG-polyT', 'frequency', 'num_reads', 'tot_reads'] summfields = ['sample', 'TG-polyT', 'hom_het', 'frequency', 'num_reads'] with open(outfile, 'w') as ofh, open(summaryfile, 'w') as sfh: ofh.write("\t".join(outfields)+"\n") sfh.write("\t".join(summfields)+"\n") for bamfile in args.bamfiles: sample = get_samplename(bamfile) sys.stderr.write("\nReading bam file: {}\n".format(bamfile)) sys.stderr.write(" Sample: {}\n".format(sample)) reads = get_reads_covering_region(bamfile, REGION)
parser = ArgumentParser(description=descr)
parser.add_argument("primer2readsdir",
                    help="Directory with primer2reads.txt files.")
parser.add_argument("bamfiles", nargs="+", help="Bam files.")
parser.add_argument("-o", "--outfile", default="uniformity.txt",
                    help="Name for output file.")
parser.add_argument("-f", "--force", default=False, action="store_true",
                    help="Overwrite existing files.")
parser.add_argument("-d", "--debug", default=False, action="store_true",
                    help="Keep intermediate files.")

if len(sys.argv) < 2:
    parser.print_help()
    sys.exit()
args = parser.parse_args()

if have_file(args.outfile, args.force):
    sys.stderr.write(" Already have {}\n".format(args.outfile))
    sys.exit()
roi = parse_roi()
p2rdict = get_primer2reads_files(args.primer2readsdir)
ampcov = {}
for bamfile in sorted(args.bamfiles):
    sample = os.path.basename(bamfile).split('-')[0].split('.')[0]
    if sample not in p2rdict:
        sys.stderr.write("No primer2read file for {}\n".format(bamfile))
        continue
    p2r = parse_primer2reads_file(p2rdict[sample])
    ampcov[sample] = get_amplicon_coverage(sample, bamfile, p2r, roi)
compile_data(ampcov, roi, args.outfile)
parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite existing files.") if len(sys.argv) < 2: parser.print_help() sys.exit() args = parser.parse_args() outfile = args.outlabel + ".tgpolyt_counts.txt" summaryfile = args.outlabel + ".tgpolyt.txt" sys.stderr.write("Writing {}\n".format(outfile)) sys.stderr.write("Writing {}\n".format(summaryfile)) if have_file(outfile, args.force) and have_file(summaryfile, args.force): sys.stderr.write("Already have {} and {}\n".format( outfile, summaryfile)) sys.exit() outfields = ['sample', 'TG-polyT', 'frequency', 'num_reads', 'tot_reads'] summfields = ['sample', 'TG-polyT', 'hom_het', 'frequency', 'num_reads'] with open(outfile, 'w') as ofh, open(summaryfile, 'w') as sfh: ofh.write("\t".join(outfields) + "\n") sfh.write("\t".join(summfields) + "\n") for bamfile in args.bamfiles: sample = get_samplename(bamfile) sys.stderr.write("\nReading bam file: {}\n".format(bamfile)) sys.stderr.write(" Sample: {}\n".format(sample)) reads = get_reads_covering_region(bamfile, REGION) (tgpolyt, totreads) = count_tgpolyt(reads, REGION, args.debug) report_results(ofh, sfh, sample, tgpolyt, totreads, args)
"--processes", default=1, type=int, help="Number of processes to run.") parser.add_argument("--debug", default=False, action='store_true', help="Debug mode.") parser.add_argument("--logdir", help="Directory for log files.") if len(sys.argv) < 2: parser.print_help() sys.exit() args = parser.parse_args() primerfa = RESOURCE['primer_fa'] if not os.path.isfile(primerfa): sys.exit("Could not find resource {}\n".format(primerfa)) (primerinfo, max_primer_len) = primer_info(primerfa) find_overlaps(primerinfo, args.debug) outfiles = align_trim_all_fqfiles(args.fqfiles, primerfa, primerinfo, max_primer_len, args) if args.outdir: args.summary = os.path.join(args.outdir, args.summary) if have_file(args.summary, args.force): sys.stderr.write("Already have {}.\n".format(args.summary)) else: samplecounts = assess_all_primers(args.fqfiles, outfiles, primerinfo, args) print_summary(samplecounts, args.summary)
"--force", default=False, action='store_true', help="Overwrite existing files.") if len(sys.argv) < 2: parser.print_help() sys.exit() args = parser.parse_args() mol2kfile = cftr.RESOURCE['cftr_db'] if not os.path.isfile(mol2kfile): sys.exit("Could not find mol2k resource file: {}\n".format(mol2kfile)) (mol2k, mol2k_fields) = parse_mol2k(mol2kfile, args.seen) seenfile = os.path.basename(mol2kfile).rstrip('txt').\ rstrip('csv').rstrip('.') + ".seen.txt" if args.outdir: seenfile = os.path.join(args.outdir, seenfile) for spreadsheet in args.spreadsheets: outfile = spreadsheet.rstrip('txt').rstrip('.') + ".mol2k.txt" if args.outdir: outfile = os.path.join(args.outdir, os.path.basename(outfile)) if have_file(outfile, args.force): sys.stderr.write(" Already have {}.\n".format(outfile)) else: find_mol2k_variants(spreadsheet, mol2k, outfile) if have_file(seenfile, args.force): sys.stderr.write(" Already have {}.\n".format(seenfile)) else: print_mol2k_seen(mol2k, mol2k_fields, seenfile)