def calculate_depths(label, infiles, args):
    """Compute variant depths for one sample and write them to a depths file.

    Args:
        label: Key into ``infiles`` selecting the sample to process.
        infiles: Mapping of label -> dict holding ``'bam'`` and ``'vcf'`` paths.
        args: Options object; ``outdir``, ``logdir``, ``tag`` and ``force`` are
            read here and the object is passed on to the depth helpers.

    Raises:
        Exception: Anything raised while computing depths is re-raised with
            the VCF path appended to ``e.args``.
    """
    bamfile = infiles[label]['bam']
    vcffile = infiles[label]['vcf']
    outlabel = get_outlabel(bamfile, 'bam', args.outdir)
    if args.logdir:
        loglabel = get_outlabel(bamfile, 'bam', args.logdir)
    else:
        loglabel = outlabel
    logfile = loglabel + ".calculate_depths.log"
    outfile = outlabel + '.depths{}.txt'.format(args.tag)
    sys.stderr.write(" Calculating depths for {}\n".format(outlabel))
    sys.stderr.write(" Log file: {}\n".format(logfile))
    # The context manager guarantees the log is flushed and closed on both
    # the success and the failure path (the handle used to be leaked).
    with open(logfile, 'w') as logfh:
        logfh.write("Bam file: {}\n".format(bamfile))
        logfh.write("VCF file: {}\n\n".format(vcffile))
        try:
            if have_file(outfile, args.force, stderr=logfh):
                logfh.write(" Already have {}\n".format(outfile))
            else:
                logfh.write("Start time: {}\n".format(timestamp()))
                (vcffields, variants) = parse_vcf(vcffile, logfh)
                depths = find_variant_depths(variants, bamfile, logfh, args)
                add_fwd_rev_depths(depths, variants, bamfile, args)
                print_depths(outfile, variants, depths, vcffields, logfh)
                logfh.write("End time: {}\n".format(timestamp()))
        except Exception as e:
            # Tag the exception with the input so the caller can tell which
            # sample failed.
            e.args += (vcffile, )
            raise
def call_variants_gatk(bamfile, ref, args):
    """Index a BAM file and run GATK on it, logging to a per-sample log file.

    Args:
        bamfile: Path of the BAM file to process.
        ref: Reference passed through to ``run_gatk``.
        args: Options object; ``outdir`` and ``logdir`` are read here and the
            object is passed on to ``index_bam`` and ``run_gatk``.

    Raises:
        Exception: Anything raised by the helpers is re-raised with the BAM
            path appended to ``e.args`` after a hint is written to stderr.
    """
    outlabel = get_outlabel(bamfile, args.outdir)
    loglabel = get_outlabel(bamfile,
                            args.logdir if args.logdir else args.outdir)
    logfile = loglabel + ".gatk.log"
    # Use a context manager so the log is always closed (it used to leak).
    with open(logfile, "w") as logfh:
        sys.stderr.write(" Running GATK on {}\n".format(bamfile))
        sys.stderr.write(" Log file {}\n".format(logfile))
        try:
            logfh.write("Start time: {}\n".format(timestamp()))
            index_bam(bamfile, args, logfh)
            vcffile = run_gatk(bamfile, outlabel, ref, args, logfh)
            if os.path.isfile(vcffile):
                sys.stderr.write(" VCF file: {}\n".format(vcffile))
            logfh.write("Finish time: {}\n".format(timestamp()))
        except Exception as e:
            # Record which input failed before propagating the error.
            e.args += (bamfile,)
            sys.stderr.write("Error running gatk." +
                             " Check log file {}\n".format(logfile))
            raise
def call_variants_freebayes(bamfile, reffile, bedfile, args):
    """Run freebayes on one BAM file and filter the resulting VCF.

    Log output goes to ``<label>.freebayes.log`` when ``args.logdir`` is set,
    otherwise to stderr.

    Args:
        bamfile: Path of the BAM file to call variants on.
        reffile: Reference passed through to ``run_freebayes``.
        bedfile: BED file passed through to ``run_freebayes``.
        args: Options object; ``logdir`` is read here and the object is passed
            on to the freebayes helpers.

    Returns:
        The ``vcfs`` list after the filtered VCF path has been appended.
    """
    sys.stderr.write("Bam file: {}\n".format(bamfile))
    if args.logdir:
        label = get_outlabel(bamfile, args.logdir)
        outlog = label + '.freebayes.log'
        logfh = open(outlog, 'w')
    else:
        logfh = sys.stderr
    try:
        tag = "{}:\t".format(get_outlabel(bamfile))
        logfh.write("{}start time {}\n".format(tag, timestamp()))
        vcffile = run_freebayes(bamfile, reffile, bedfile, logfh, args)
        outvcf = vcffile.replace('.vcf', '') + ".filtered.vcf"
        filteredvcf = filter_freebayes_vcf(vcffile, outvcf, logfh, args)
        # NOTE(review): `vcfs` is not assigned anywhere in this function, so it
        # must be a module-level list; if no such global exists this line
        # raises NameError -- confirm against the rest of the file.
        vcfs.append(filteredvcf)
        logfh.write("{}finish time {}\n".format(tag, timestamp()))
    finally:
        # Close our own log file even when a helper raises; never close stderr.
        if args.logdir:
            logfh.close()
    return vcfs
def align_create_bam(sample, fqfiles, ref, reflabel, args):
    """Align one sample with bwa and convert the result to a sorted BAM.

    Args:
        sample: Sample name; combined with ``reflabel`` to form the output label.
        fqfiles: FASTQ input passed through to ``run_bwa``.
        ref: Reference passed through to ``run_bwa``.
        reflabel: Reference label used in the output file label.
        args: Options object; ``logdir``, ``outdir``, ``force`` and ``sam``
            are read.

    Raises:
        Exception: Anything raised by the helpers is re-raised with the sample
            name appended to ``e.args`` after a hint is written to stderr.
    """
    outlabel = "{}-{}".format(sample, reflabel)
    if args.logdir:
        loglabel = os.path.join(args.logdir, outlabel)
    else:
        loglabel = outlabel
    logfile = loglabel + ".bwa.log"
    sys.stderr.write(" Mapping {}\n".format(sample))
    sys.stderr.write(" Log file {}\n".format(logfile))
    if args.outdir:
        outlabel = os.path.abspath(os.path.join(args.outdir, outlabel))
    # The context manager closes the log on every exit path (it used to leak).
    with open(logfile, 'w') as logfh:
        try:
            logfh.write("Start time {}\n".format(timestamp()))
            samfile = run_bwa(sample, outlabel, ref, fqfiles, logfh,
                              args.force)
            bamfile = create_sorted_bam(samfile, outlabel, logfh, args.force,
                                        args.sam)
            logfh.write("Finish time {}\n".format(timestamp()))
        except Exception as e:
            # Record which sample failed before propagating the error.
            e.args += (sample, )
            sys.stderr.write("Error running bwa." +
                             " Check log file {}\n".format(logfile))
            raise
def gatk_pipeline(label, bamfiles, gatkdir, logdir, args):
    """Run the ``gatk_pipeline`` helper script over a set of BAM files.

    The command line is assembled from the arguments, echoed to stderr with
    the current time, and executed with ``subprocess.check_call``; any error
    from the child process propagates unchanged.

    Args:
        label: Value passed to the script's ``-l`` option.
        bamfiles: BAM paths appended as positional arguments.
        gatkdir: Value passed to the script's ``-o`` option.
        logdir: If truthy, passed via ``--logdir``.
        args: Options object; ``processes``, ``maxreads`` and ``force`` are read.
    """
    command = [SCRIPTS['gatk_pipeline'], '-l', label, '-o', gatkdir]
    command.extend(['-p', str(args.processes)])
    if logdir:
        command.extend(['--logdir', logdir])
    if args.maxreads:
        command.extend(['-m', str(args.maxreads)])
    if args.force:
        command.append('-f')
    command.extend(bamfiles)

    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk pipeline: " + " ".join(command) + "\n")

    # Failures surface as the exception raised by check_call itself.
    subprocess.check_call(command)
def align_and_trim(fqfile, primerfa, primerinfo, max_trim_len, args):
    """Align the primer region of a FASTQ file and trim the primers.

    Args:
        fqfile: FASTQ file to trim.
        primerfa: Primer FASTA passed through to ``run_aligner``.
        primerinfo: Primer information passed through to ``trim_primers``.
        max_trim_len: Passed to the primer-region and trimming helpers.
        args: Options object; ``outdir``, ``logdir`` and ``force`` are read
            here and the object is passed on to ``trim_primers``.

    Raises:
        Exception: Anything raised by the helpers is re-raised with the FASTQ
            path appended to ``e.args`` after a hint is written to stderr.
    """
    outlabel = fastq_file_label(fqfile, args.outdir)
    if args.logdir:
        loglabel = fastq_file_label(fqfile, args.logdir)
    else:
        loglabel = outlabel
    logfile = loglabel + ".primer_trim.log"
    sys.stderr.write(" Trimming {}\n".format(os.path.basename(fqfile)))
    sys.stderr.write(" Log file {}\n".format(logfile))
    # The context manager closes the log on every exit path (it used to leak).
    with open(logfile, 'w') as logfh:
        try:
            logfh.write("Start time: {}\n".format(timestamp()))
            fafiles = create_fasta_of_primer_region(fqfile, max_trim_len,
                                                    outlabel, logfh,
                                                    args.force)
            alignout = run_aligner(fafiles, primerfa, outlabel, logfh,
                                   args.force)
            (trimfq, seqnamefile) = trim_primers(fqfile, alignout,
                                                 max_trim_len, primerinfo,
                                                 outlabel, logfh, args)
            if os.path.isfile(trimfq):
                sys.stderr.write(" Created {}\n".format(trimfq))
            logfh.write("Finish time: {}\n".format(timestamp()))
        except Exception as e:
            # Record which input failed before propagating the error.
            e.args += (fqfile, )
            sys.stderr.write("Error trimming primers." +
                             " Check log file {}\n".format(logfile))
            raise
def run_annovar(vcffiles, outdir, args):
    """Run the ``run_annovar`` helper script on a list of VCF files.

    Args:
        vcffiles: VCF paths appended as positional arguments.
        outdir: Value passed to the script's ``-o`` option.
        args: Options object; ``label`` and ``force`` are read.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (a short message is written to stderr first).
    """
    cmd = [SCRIPTS['run_annovar'], '-o', outdir]
    if args.label:
        cmd += ['-l', args.label + '.annovar']
    if args.force:
        cmd += ['-f']
    cmd.extend(vcffiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running annovar: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error running annovar (exit status {}).\n".format(
            e.returncode))
        raise
def annotate_spreadsheets(spreadsheets, outdir, args):
    """Run the ``add_mol2k`` helper script and locate its output files.

    Args:
        spreadsheets: Spreadsheet paths appended as positional arguments.
        outdir: Value passed to the script's ``-o`` option; also searched for
            the resulting ``mol2k.txt`` files.
        args: Options object; ``force`` is read.

    Returns:
        The output files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output files are found.
    """
    cmd = [SCRIPTS['add_mol2k'], '-o', outdir]
    if args.force:
        cmd += ['-f']
    cmd.extend(spreadsheets)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Annotating spreadsheets: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error annotating spreadsheets "
                         "(exit status {}).\n".format(e.returncode))
        raise
    outfiles = find_outfiles(outdir, spreadsheets, 'mol2k.txt',
                             delim='.results')
    if not outfiles:
        # Say why we are exiting rather than dying silently.
        sys.stderr.write("No mol2k.txt files found.\n")
        sys.exit(1)
    return outfiles
def trim_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``primer_trim`` helper script on a list of FASTQ files.

    If ``find_outfiles`` already reports one trimmed file per input, the
    script is not run and the existing files are returned.

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value passed to the script's ``-o`` option; also searched
            for ``trimmed.fastq`` outputs.
        logdir: If truthy, passed via ``--logdir``.
        args: Options object; ``processes``, ``force`` and ``label`` are read.

    Returns:
        The trimmed FASTQ files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    cmd = [SCRIPTS['primer_trim'], '-o', aligndir]
    cmd += ['-p', str(args.processes)]
    if logdir:
        cmd += ['--logdir', logdir]
    if args.force:
        cmd += ['-f']
    if args.label:
        cmd += ['-s', args.label + '.summary.txt']
    cmd.extend(fqfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Trimming fastq files: " + " ".join(cmd) + "\n")
    outfiles = find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
    # NOTE(review): this shortcut is taken even when args.force is set, so
    # '-f' never reaches the script if all outputs exist -- confirm intent.
    if len(outfiles) == len(fqfiles):
        sys.stderr.write("Already have trimmed fq files.\n")
        return outfiles
    subprocess.check_call(cmd)
    # Return the freshly created files so both paths yield the same kind of
    # result (this path used to fall through and return None).
    return find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
def create_spreadsheet(label, samplevcfs, annovar_out, outdir):
    """Run the ``create_spreadsheet`` helper script and return its result file.

    NOTE(review): ``args`` is not a parameter of this function, so
    ``args.force`` and ``args.debug`` rely on a module-level ``args`` object;
    confirm one exists wherever this is called.

    Args:
        label: Value passed to the script's ``-l`` option and used to look up
            the output file.
        samplevcfs: VCF paths appended as positional arguments.
        annovar_out: Iterable of files, each passed with its own ``-a`` option.
        outdir: Value passed to the script's ``-o`` option; also searched for
            the ``results.txt`` output.

    Returns:
        The first output file reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output file is found.
    """
    cmd = [SCRIPTS['create_spreadsheet'], '-l', label, '-o', outdir]
    for afile in annovar_out:
        cmd += ['-a', afile]
    if args.force:
        cmd += ['-f']
    cmd.extend(samplevcfs)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Creating spreadsheet: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error creating spreadsheet "
                         "(exit status {}).\n".format(e.returncode))
        raise
    outfile = find_outfiles(outdir, [label], 'results.txt', debug=args.debug)
    if not outfile:
        # Say why we are exiting rather than dying silently.
        sys.stderr.write("No results.txt file found.\n")
        sys.exit(1)
    return outfile[0]
def separate_vcf(vcf, gatkdir, bamfiles, args):
    """Run the ``separate_vcf`` helper script and locate the per-sample VCFs.

    Args:
        vcf: VCF path passed as the script's positional argument.
        gatkdir: Value passed to the script's ``-o`` option; also searched for
            ``separated.vcf`` outputs.
        bamfiles: Passed to ``find_outfiles`` to locate the outputs.
        args: Options object; ``logdir``, ``force`` and ``debug`` are read.

    Returns:
        The output files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output files are found.
    """
    cmd = [SCRIPTS['separate_vcf'], '-o', gatkdir]
    # Only pass --logdir when one is configured: a None entry would make the
    # " ".join(cmd) below raise TypeError before the script is even run.
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd += ['-f']
    cmd.append(vcf)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Separating gVCF: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error separating gVCF "
                         "(exit status {}).\n".format(e.returncode))
        raise
    samplevcfs = find_outfiles(gatkdir, bamfiles, 'separated.vcf', delim='-',
                               debug=args.debug)
    if not samplevcfs:
        # Say why we are exiting rather than dying silently.
        sys.stderr.write("No separated.vcf files found.\n")
        sys.exit(1)
    return samplevcfs
def run_freebayes(bamfiles, fbdir, args):
    """Run the ``run_freebayes`` helper script and locate the filtered VCFs.

    Args:
        bamfiles: BAM paths appended as positional arguments.
        fbdir: Value passed to the script's ``-o`` option; also searched for
            ``freebayes.filtered.vcf`` outputs.
        args: Options object; ``processes``, ``logdir``, ``force`` and
            ``debug`` are read.

    Returns:
        The output files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output files are found.
    """
    cmd = [SCRIPTS['run_freebayes'], '-o', fbdir]
    cmd += ['-p', str(args.processes)]
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd += ['-f']
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running freebayes: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        # Terminate the line so the traceback that follows starts on its own
        # line (the message used to lack the trailing newline).
        sys.stderr.write("Error running freebayes pipeline.\n")
        raise
    outfiles = find_outfiles(fbdir, bamfiles, 'freebayes.filtered.vcf',
                             debug=args.debug)
    if not outfiles:
        sys.stderr.write("No freebayes .filtered.vcf files found.\n")
        sys.exit(1)
    return outfiles
def run_annovar(vcffiles, outdir, args):
    """Run the ``run_annovar`` helper script and locate its two result files.

    Args:
        vcffiles: VCF paths appended as positional arguments.
        outdir: Value passed to the script's ``-o`` option; also searched for
            the annovar outputs.
        args: Options object; ``label``, ``force`` and ``debug`` are read.

    Returns:
        Tuple ``(variant_function_file, exonic_variant_function_file)``, each
        the first match reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when either output file is missing.
    """
    cmd = [SCRIPTS['run_annovar'], '-o', outdir]
    if args.label:
        cmd += ['-l', args.label + '.annovar']
    if args.force:
        cmd += ['-f']
    cmd.extend(vcffiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running annovar: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error running annovar (exit status {}).\n".format(
            e.returncode))
        raise
    vffile = find_outfiles(outdir, [args.label],
                           'annovar-hgvs.variant_function',
                           debug=args.debug)
    evffile = find_outfiles(outdir, [args.label],
                            'annovar-hgvs.exonic_variant_function',
                            debug=args.debug)
    if not vffile or not evffile:
        # Say why we are exiting rather than dying silently.
        sys.stderr.write("No annovar variant_function files found.\n")
        sys.exit(1)
    return (vffile[0], evffile[0])
def run_freebayes(bamfiles, fbdir, logdir, args):
    """Run the ``run_freebayes`` helper script, sending its stderr to a log.

    Args:
        bamfiles: BAM paths appended as positional arguments.
        fbdir: Value passed to the script's ``-o`` option; also searched for
            ``freebayes.filtered.vcf`` outputs.
        logdir: Directory in which ``<args.label>.freebayes.log`` is written.
        args: Options object; ``processes``, ``force`` and ``label`` are read.

    Returns:
        The output files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output files are found.
    """
    command = [SCRIPTS['run_freebayes'], '-o', fbdir]
    command.extend(['-p', str(args.processes)])
    if args.force:
        command.append('-f')
    command.extend(bamfiles)

    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running freebayes: " + " ".join(command) + "\n")

    logfile = os.path.join(logdir, args.label + '.freebayes.log')
    try:
        # The child's stderr is redirected into the log file.
        with open(logfile, 'w') as log_handle:
            subprocess.check_call(command, stderr=log_handle)
    except subprocess.CalledProcessError:
        hint = " Please check logfile {}\n".format(logfile)
        sys.stderr.write("Error running freebayes." + hint)
        raise

    vcffiles = find_outfiles(fbdir, bamfiles, 'freebayes.filtered.vcf')
    if not vcffiles:
        sys.stderr.write("No freebayes .filtered.vcf files found.\n")
        sys.exit(1)
    return vcffiles
def calculate_depths(vcffiles, bamfiles, gatkdir, args):
    """Run the ``calculate_depths`` helper script and locate the depth files.

    Args:
        vcffiles: VCF paths appended as positional arguments (before the BAMs).
        bamfiles: BAM paths appended as positional arguments; also used to
            locate the outputs.
        gatkdir: Value passed to the script's ``-o`` option; also searched for
            ``depths.txt`` outputs.
        args: Options object; ``logdir``, ``processes``, ``force`` and
            ``debug`` are read.

    Returns:
        The output files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output files are found.
    """
    cmd = [SCRIPTS['calculate_depths'], '-o', gatkdir]
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    cmd += ['-p', str(args.processes)]
    if args.force:
        cmd += ['-f']
    cmd.extend(vcffiles)
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Calculating depths: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error calculating depths "
                         "(exit status {}).\n".format(e.returncode))
        raise
    outfiles = find_outfiles(gatkdir, bamfiles, 'depths.txt',
                             debug=args.debug)
    if not outfiles:
        sys.stderr.write("No depths.txt files found.\n")
        sys.exit(1)
    return outfiles
def run_bwa_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``run_bwa`` helper script on FASTQ files and locate the BAMs.

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value passed to the script's ``-o`` option; also searched
            for ``genomic_refseq.bam`` outputs.
        logdir: If truthy, passed via ``--logdir``.
        args: Options object; ``processes``, ``force`` and ``sam`` are read.

    Returns:
        The BAM files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no BAM files are found.
    """
    command = [SCRIPTS['run_bwa'], '-o', aligndir, '-p', str(args.processes)]
    if logdir:
        command.extend(['--logdir', logdir])
    if args.force:
        command.append('-f')
    if args.sam:
        command.append('-s')
    command.extend(fqfiles)

    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running bwa: " + " ".join(command) + "\n")

    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError:
        sys.stderr.write("Error running bwa.\n")
        raise

    # The delimiter handed to find_outfiles depends on the first file's name.
    if '_L001' in fqfiles[0]:
        delim = '_L001'
    else:
        delim = '_R'
    # Only names containing 'R1' are used to look up the outputs.
    read1_files = [fq for fq in fqfiles if 'R1' in fq]
    outfiles = find_outfiles(aligndir, read1_files, 'genomic_refseq.bam',
                             delim=delim)
    if not outfiles:
        sys.stderr.write("No bam files found.\n")
        sys.exit(1)
    return outfiles
def gatk_pipeline(label, bamfiles, gatkdir, logdir, args):
    """Build and execute the ``gatk_pipeline`` script command line.

    The command is echoed to stderr together with the current time and then
    run with ``subprocess.check_call``; errors from the child propagate as is.

    Args:
        label: Value for the script's ``-l`` option.
        bamfiles: BAM paths appended as positional arguments.
        gatkdir: Value for the script's ``-o`` option.
        logdir: If truthy, forwarded via ``--logdir``.
        args: Options object; ``processes``, ``maxreads`` and ``force`` are read.
    """
    argv = [SCRIPTS['gatk_pipeline'], '-l', label, '-o', gatkdir]
    argv += ['-p', str(args.processes)]

    # Optional switches are appended only when set.
    if logdir:
        argv += ['--logdir', logdir]
    if args.maxreads:
        argv += ['-m', str(args.maxreads)]
    if args.force:
        argv += ['-f']
    argv += bamfiles

    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk pipeline: " + " ".join(argv) + "\n")

    subprocess.check_call(argv)
def create_spreadsheet(label, samplevcfs, annovar_out, outdir):
    """Run the ``create_spreadsheet`` helper script and return its result file.

    NOTE(review): ``args`` is read below but is not a parameter, so this
    depends on a module-level ``args`` object being defined -- confirm.

    Args:
        label: Value for the script's ``-l`` option; also used to find the
            output file.
        samplevcfs: VCF paths appended as positional arguments.
        annovar_out: Iterable of files, each forwarded with its own ``-a``.
        outdir: Value for the script's ``-o`` option; also searched for the
            ``results.txt`` output.

    Returns:
        The first output file reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when no output file is found.
    """
    cmd = [SCRIPTS['create_spreadsheet'], '-l', label, '-o', outdir]
    for afile in annovar_out:
        cmd += ['-a', afile]
    if args.force:
        cmd += ['-f']
    cmd.extend(samplevcfs)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Creating spreadsheet: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # e.output is always None for check_call() (nothing is captured), so
        # printing it only emitted "None"; report the exit status instead.
        sys.stderr.write("Error creating spreadsheet "
                         "(exit status {}).\n".format(e.returncode))
        raise
    outfile = find_outfiles(outdir, [label], 'results.txt', debug=args.debug)
    if not outfile:
        # Explain the exit instead of terminating silently.
        sys.stderr.write("No results.txt file found.\n")
        sys.exit(1)
    return outfile[0]
def run_gatk_all_bam(label, bamfiles, gatkdir, args):
    """Run the ``run_gatk`` helper script and locate the cohort and merged VCFs.

    Args:
        label: Value passed to the script's ``-c`` option; also used to find
            the merged VCF.
        bamfiles: BAM paths appended as positional arguments; also used to
            locate the per-sample cohort VCFs.
        gatkdir: Value passed to the script's ``-o`` option; also searched for
            the outputs.
        args: Options object; ``processes``, ``logdir``, ``force``, ``debug``
            and ``maxreads`` are read.

    Returns:
        Tuple ``(cohort_vcfs, merged_vcf)`` where ``cohort_vcfs`` is the list
        reported by ``find_outfiles`` and ``merged_vcf`` is its first match
        for ``gatk-merged.vcf``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        SystemExit: With status 1 when either kind of output is missing.
    """
    cmd = [SCRIPTS['run_gatk'], '-c', label, '-o', gatkdir]
    cmd += ['-p', str(args.processes)]
    # Only pass --logdir when one is configured: a None entry would make the
    # " ".join(cmd) below raise TypeError before the script is even run.
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd += ['-f']
    if args.debug:
        cmd += ['--debug']
    if args.maxreads:
        cmd += ['-m', str(args.maxreads)]
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call() does not capture the child's output, so e.output is
        # always None here; report the exit status instead of printing "None".
        sys.stderr.write("Error running gatk (exit status {}).\n".format(
            e.returncode))
        raise
    outfiles = find_outfiles(gatkdir, bamfiles, 'gatk-cohort.vcf',
                             debug=args.debug)
    gvcf = find_outfiles(gatkdir, [label], 'gatk-merged.vcf',
                         debug=args.debug)
    if not outfiles:
        sys.stderr.write("No gatk-cohort.vcf files found.\n")
        sys.exit(1)
    if not gvcf:
        # Avoid an opaque IndexError on gvcf[0] when the merged VCF is missing.
        sys.stderr.write("No gatk-merged.vcf file found.\n")
        sys.exit(1)
    return (outfiles, gvcf[0])
def trim_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``primer_trim`` helper script on a list of FASTQ files.

    When ``find_outfiles`` already reports as many trimmed files as there are
    inputs, the script is skipped and those files are returned directly.

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value for the script's ``-o`` option; also searched for
            ``trimmed.fastq`` outputs.
        logdir: If truthy, forwarded via ``--logdir``.
        args: Options object; ``processes``, ``force`` and ``label`` are read.

    Returns:
        The trimmed FASTQ files reported by ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    cmd = [SCRIPTS['primer_trim'], '-o', aligndir]
    cmd += ['-p', str(args.processes)]
    if logdir:
        cmd += ['--logdir', logdir]
    if args.force:
        cmd += ['-f']
    if args.label:
        cmd += ['-s', args.label + '.summary.txt']
    cmd.extend(fqfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Trimming fastq files: " + " ".join(cmd) + "\n")
    outfiles = find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
    # NOTE(review): the early return below ignores args.force, so existing
    # outputs are never regenerated even with '-f' -- confirm this is wanted.
    if len(outfiles) == len(fqfiles):
        sys.stderr.write("Already have trimmed fq files.\n")
        return outfiles
    subprocess.check_call(cmd)
    # Hand back the new files; previously this path implicitly returned None
    # while the cached path returned a list.
    return find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)