import os
import re
import subprocess
import sys
from subprocess import Popen

# Helpers referenced below (date_time, job_manager, parse_config, get_bam_name,
# create_bed, build_jobs, compile_results, get_genes, calc_tn_cov_ratios,
# source_novarc) are defined in this repo's utility modules.


def batch_qc(fn, cont, obj, t):
    fh = open(fn, 'r')
    src_cmd = '. ~/.novarc;'
    jobs = []
    for line in fh:
        line = line.rstrip('\n')
        # All files for current bnid to be stored in cwd
        swift_cmd = src_cmd + 'swift list ' + cont + ' --prefix ' + obj + '/' + line
        sys.stderr.write(date_time() + 'Checking for sequence files for sample ' + line + '\n' + swift_cmd + '\n')
        try:
            contents = subprocess.check_output(swift_cmd, shell=True)
            if len(contents) < len(line):
                sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ', skipping!\n')
                continue
        except subprocess.CalledProcessError:
            sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ', skipping!\n')
            continue
        # first two .gz objects in the listing are the paired-end sequence files
        seqfile = re.findall(r'(\S+\.gz)', contents)
        sf1 = seqfile[0]
        end1 = os.path.basename(sf1)
        sf2 = seqfile[1]
        end2 = os.path.basename(sf2)
        swift_cmd = src_cmd + 'swift download ' + cont + ' --skip-identical --prefix ' + obj + '/' + line
        link_cmd = 'ln -s ' + sf1 + ' .;ln -s ' + sf2 + ' .'
        fastqc_cmd = 'mkdir -p PREQC/' + line + '; fastqc -t 2 -o PREQC/' + line + ' ' + sf1 + ' ' + sf2
        upload_cmd = src_cmd + 'swift upload ' + cont + ' PREQC/' + line
        cleanup_cmd = 'rm -rf RAW/' + line + ' PREQC/' + line + ' ' + end1 + ' ' + end2
        jobs.append(';'.join([swift_cmd, link_cmd, fastqc_cmd, upload_cmd, cleanup_cmd]))
    fh.close()
    sys.stderr.write(date_time() + 'Job list created, running jobs!\n')
    job_manager(jobs, t)
    return 0
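
# NOTE (editor's sketch, not the repo's implementation): every function in this
# file hands job_manager() a list of shell command strings plus a max
# concurrency (passed as an int or a str at different call sites). A minimal
# stand-in consistent with that contract might look like this:
import time


def job_manager_sketch(cmd_list, max_t):
    running = []
    for cmd in cmd_list:
        # block until a slot frees up before launching the next job
        while len([p for p in running if p.poll() is None]) >= int(max_t):
            time.sleep(1)
        running.append(Popen(cmd, shell=True))
    # wait for stragglers; return 0 only if every job exited cleanly
    return 0 if all(p.wait() == 0 for p in running) else 1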
def qc_bam_pipe(sample_list, config_file, ref_mnt):
    (cont, obj) = parse_config(config_file)
    job_list = []
    log_dir = 'LOGS/'
    src_cmd = '. ~/.novarc;'
    create_start_dirs = 'mkdir LOGS QC'
    subprocess.call(create_start_dirs, shell=True)
    for sample in open(sample_list):
        sample = sample.rstrip('\n')
        parts = sample.split('_')
        bam = sample + '.Aligned.sortedByCoord.out.bam'
        dl_list = (log_dir + sample + '.cutadapt.log', log_dir + sample + '.Log.final.out',
                   'QC/' + sample + '_subset.insert_metrics.hist',
                   'QC/' + sample + '_1_sequence_fastqc/fastqc_data.txt', 'BAMS/' + bam)
        dl_cmd = src_cmd
        prefix = obj + '/' + parts[0] + '/'
        for fn in dl_list:
            dl_cmd += 'swift download ' + cont + ' ' + prefix + fn + ';'
        mv_cmd = 'mv ' + prefix + dl_list[2] + ' .;mv ' + prefix + dl_list[4] + ' .;mv ' + prefix + dl_list[0] \
                 + ' LOGS/;mv ' + prefix + dl_list[1] + ' LOGS/;mv ' + prefix + 'QC/' + sample + '* QC/;'
        qc_cmd = '~/TOOLS/Scripts/alignment/qc_bam.py -sa ' + sample + ' -j ' + config_file + ' -m ' + ref_mnt + ';'
        rm_cmd = 'rm ' + bam + ';'
        parse_qc_cmd = '~/TOOLS/Scripts/alignment/parse_qc.py -j ' + config_file + ' -sa ' + sample + ';'
        full_cmd = dl_cmd + mv_cmd + qc_cmd + rm_cmd + parse_qc_cmd
        job_list.append(full_cmd)
    job_manager(job_list, 4)
def snpeff_pipe(config_file, sample_pairs, ref_mnt, cflag):
    # TODO: grab max thread count from config instead of hard-coding
    max_t = 8
    (java, snpeff, snpsift, report, dbsnp, intervals) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    intervals = ref_mnt + '/' + intervals
    fh = open(sample_pairs)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19'
    for line in fh:
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        # run snpsift first, then snpeff, then the report step
        run_report = report + ' -i ' + sample + '.out.keep.eff.vcf -c '
        if cflag == 'n':
            run_report += intervals
        else:
            run_report += 'n'
        run_report += ' > ' + sample + '.vcf.keep.eff.xls'
        run_snp = run_snpsift + ' ' + sample + '.out.keep > ' + sample + '.out.keep.sift.vcf 2> LOGS/' + sample \
                  + '.snpeff.log;' + run_snpeff + ' ' + sample + '.out.keep.sift.vcf -v > ' + sample \
                  + '.out.keep.eff.vcf 2>> LOGS/' + sample + '.snpeff.log;' + run_report
        cmd_list.append(run_snp)
    fh.close()
    job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'SNP annotation completed!\n')
    return 0
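
# NOTE (editor's sketch): each module in this repo defines its own
# parse_config() pulling just the keys it needs from a shared config file.
# Assuming a JSON layout (hypothetical key names; the real schema may differ),
# the snpeff variant above might read:
import json


def parse_config_sketch(config_file):
    config_data = json.loads(open(config_file, 'r').read())
    return (config_data['tools']['java'], config_data['tools']['snpeff'],
            config_data['tools']['snpsift'], config_data['tools']['report'],
            config_data['refs']['dbsnp'], config_data['refs']['intervals'])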
def calc_pos_cov(table, samtools, out):
    fh = open(table)
    head = next(fh)
    bnids = []
    head = head.rstrip('\n').split('\t')
    for i in range(1, len(head), 1):
        bnid = head[i].split('_')
        bnids.append(bnid[0])
    # create bed file to get coverage
    bed_fn = out + '.bed'
    bed = open(bed_fn, 'w')
    vlist = []
    # in the event an indel happens in one sample and snv in another at same position, don't process twice
    for line in fh:
        vlist.append(create_bed(line, bed))
    bed.close()
    fh.close()
    job_list = []
    src_cmd = '. ~/.novarc;'
    # get bams, then build jobs
    for i in range(0, len(bnids), 1):
        sys.stderr.write(date_time() + 'Getting bam for ' + bnids[i] + '\n')
        bam = 'ALIGN/' + bnids[i] + '/BAM/' + bnids[i] + '.merged.final.bam'
        # prefix ends in '.ba' so that both the .bam and its .bai index download
        dl_cmd = src_cmd + 'swift download PDX --prefix ALIGN/' + bnids[i] + '/BAM/' + bnids[i] + '.merged.final.ba;'
        subprocess.call(dl_cmd, shell=True)
        # try PDX container first; if absent, fall back to PANCAN
        if os.path.isfile(bam):
            job_list.append(build_jobs(samtools, bed_fn, bnids[i]))
        else:
            sys.stderr.write(date_time() + dl_cmd + '\nBam for sample ' + bnids[i] + ' not in PDX container, '
                             'trying PANCAN\n')
            dl_cmd = src_cmd + 'swift download PANCAN --prefix ALIGN/' + bnids[i] + '/BAM/' + bnids[i] \
                     + '.merged.final.ba;'
            subprocess.call(dl_cmd, shell=True)
            if os.path.isfile(bam):
                job_list.append(build_jobs(samtools, bed_fn, bnids[i]))
            else:
                sys.stderr.write(date_time() + dl_cmd + '\nCould not find bam for ' + bnids[i] + '\n')
                exit(1)
    sys.stderr.write(date_time() + 'Running depth jobs\n')
    job_manager(job_list, '8')
    sys.stderr.write(date_time() + 'Compiling results\n')
    cov_dict = compile_results(bnids)
    sys.stderr.write(date_time() + 'Writing to output table\n')
    out_fh = open(out + '_variant_coverage_table.txt', 'w')
    out_fh.write('\t'.join(head) + '\n')
    for var in vlist:
        out_fh.write(var)
        m = re.search(r'\S+-(chr\w+)_(\d+)_\w+->\w+', var)
        (chrom, pos) = (m.group(1), m.group(2))
        for i in range(0, len(bnids), 1):
            if bnids[i] in cov_dict[chrom][pos]:
                out_fh.write('\t' + cov_dict[chrom][pos][bnids[i]])
            else:
                out_fh.write('\t0')
        out_fh.write('\n')
    out_fh.close()
    sys.stderr.write(date_time() + 'Fin\n')
def cnv_pipe(config_file, tum_bam, norm_bam, o_flag, project2):
    (project_dir, project, bedtools, ana, bed, cores) = parse_config(config_file)
    job_list = []
    tum_id = re.match(r'(\d+-\d+)\.', os.path.basename(tum_bam))
    tum_id = tum_id.group(1)
    norm_id = re.match(r'(\d+-\d+)\.', os.path.basename(norm_bam))
    norm_id = norm_id.group(1)
    pair = tum_id + '_' + norm_id
    bed_t1 = bed.replace('.bed', '_t1.bed')
    bed_t2 = bed.replace('.bed', '_t2.bed')
    t1_genes = get_genes(bed_t1)
    t2_genes = get_genes(bed_t2)
    t1_suffix = '.t1.bedtools.coverage.txt'
    t2_suffix = '.t2.bedtools.coverage.txt'
    cnv_dir = project_dir + project + '/' + ana + '/' + pair + '/OUTPUT/'
    if not os.path.isdir(cnv_dir):
        sys.stderr.write(date_time() + 'Output path ' + cnv_dir + ' does not exist! Trying with backup project '
                         + project2 + '\n')
        cnv_dir = project_dir + project2 + '/' + ana + '/' + pair + '/OUTPUT/'
        if not os.path.isdir(cnv_dir):
            sys.stderr.write(date_time() + 'Output path ' + cnv_dir + ' does not exist! Check config and try again!\n')
            exit(1)
    clist = (tum_bam + ' -b ' + bed_t1 + ' > ', tum_bam + ' -b ' + bed_t2 + ' > ',
             norm_bam + ' -b ' + bed_t1 + ' > ', norm_bam + ' -b ' + bed_t2 + ' > ')
    flist = (cnv_dir + tum_id + t1_suffix, cnv_dir + tum_id + t2_suffix,
             cnv_dir + norm_id + t1_suffix, cnv_dir + norm_id + t2_suffix)
    if o_flag == 'y':
        sys.stderr.write(date_time() + 'Overwrite yes given, creating new coverage files\n')
        for i in range(0, len(flist)):
            job_list.append(bedtools + ' coverage -abam ' + clist[i] + flist[i])
    else:
        sys.stderr.write(date_time() + 'Overwrite no given, checking for existing coverage files first\n')
        for i in range(0, len(flist)):
            if not os.path.isfile(flist[i]):
                job_list.append(bedtools + ' coverage -abam ' + clist[i] + flist[i])
    sys.stderr.write(date_time() + 'Calculating read depth for ' + pair + '\n')
    job_manager(job_list, cores)
    # process coverage files, assess cnv
    sys.stderr.write(date_time() + 'Collapsing read counts into tiers and genes\n')
    calc_tn_cov_ratios(cnv_dir, tum_id, norm_id, t1_genes, t2_genes, t1_suffix, t2_suffix)
    sys.stderr.write(date_time() + 'CNV analysis complete for ' + pair + '\n')
    return 0
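
# NOTE (editor's sketch): get_genes() presumably collects the gene annotations
# from the tier BED files used by cnv_pipe(). Assuming genes sit in the 4th BED
# column (an assumption; the real helper may differ), a minimal version:
def get_genes_sketch(bed_fn):
    genes = set()
    for bed_line in open(bed_fn):
        fields = bed_line.rstrip('\n').split('\t')
        if len(fields) > 3:
            genes.add(fields[3])
    return sorted(genes)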
def convert_vcf(config_file, sample_pairs, suffix):
    (java, sift, th) = parse_config(config_file)
    cmd_list = []
    for pair in open(sample_pairs, 'r'):
        pair = pair.rstrip('\n').split('\t')
        pair = pair[0]
        in_vcf = pair + '/' + pair + suffix
        out_xls = pair + '/' + pair + '.indels.xls'
        cmd = java + ' -jar ' + sift + ' extractFields ' + in_vcf + \
              ' CHROM POS REF ALT "EFF[0].EFFECT" "EFF[0].FUNCLASS" "EFF[0].CODON" "EFF[0].AA" "EFF[0].AA_LEN" ' \
              '"EFF[0].GENE" "EFF[0].BIOTYPE" "EFF[0].CODING" MINCOV ALTCOV COVRATIO ID > ' + out_xls
        cmd_list.append(cmd)
    job_manager(cmd_list, th)
def varscan_germline(config_file, sample, ref_mnt):
    (samtools, varscan, region, fasta, th) = parse_config(config_file)
    region = ref_mnt + '/' + region
    fasta = ref_mnt + '/' + fasta
    rf = open(region, 'r')
    cmd_list = []
    for line in rf:
        chrom = line.split('\t')
        cmd = samtools + ' mpileup -r ' + chrom[0] + ' -B -f ' + fasta + ' ' + sample \
              + '.merged.final.bam | java -Xmx4000m -jar ' + varscan \
              + ' mpileup2cns --output-vcf 1 --min-var-freq 0.35 --variants 1 > ' + chrom[0] + '.vcf'
        cmd_list.append(cmd)
    rf.close()
    proc = int(th) - 2
    job_manager(cmd_list, str(proc))
def calc_coverage(bedtools2_tool, sample, bedfile, cont, obj):
    src_cmd = '. /home/ubuntu/.novarc;'
    job_list = []
    for bnid in open(sample):
        bnid = bnid.rstrip('\n')
        (dl_cmd, bam, bai) = get_bam_name(bnid, src_cmd, cont, obj)
        if isinstance(bam, str):
            bed_cmd = bedtools2_tool + ' coverage -hist -abam ' + bam + ' -b ' + bedfile + ' > ' + bnid + '.hist;'
            cleanup = 'rm ' + bam + ' ' + bai + ';'
            final = dl_cmd + bed_cmd + cleanup
            job_list.append(final)
        else:
            for i in range(len(bam)):
                bed_cmd = bedtools2_tool + ' coverage -hist -abam ' + bam[i] + ' -b ' + bedfile + ' > ' + bnid \
                          + '_' + str(i) + '.hist;'
                cleanup = 'rm ' + bam[i] + ' ' + bai[i] + ';'
                final = dl_cmd[i] + bed_cmd + cleanup
                job_list.append(final)
    job_manager(job_list, '8')
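
# NOTE (editor's sketch): calc_coverage() above branches on whether
# get_bam_name() returned a single path or parallel lists of paths. A
# hypothetical stub honoring that contract (path layout and index naming are
# assumptions; the real helper queries swift for the sample's BAM objects):
def get_bam_name_stub(bnid, src_cmd, cont, obj):
    bam = obj + '/' + bnid + '/BAM/' + bnid + '.merged.final.bam'
    bai = bam.replace('.bam', '.bai')
    # a prefix ending in '.ba' pulls both the BAM and its index in one download
    dl_cmd = src_cmd + 'swift download ' + cont + ' --prefix ' + bam[:-1] + ';'
    # multiple BAMs per sample would instead be returned as parallel lists,
    # with one download command per BAM
    return (dl_cmd, bam, bai)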
def filter_bam_pipe(config_file, lane, ref_mnt):
    (th, cont, obj, mouse_filter) = parse_config(config_file)
    job_list = []
    src_cmd = '. /home/ubuntu/.novarc;'
    fh = open(lane, 'r')
    for la in fh:
        la = la.rstrip('\n')
        info = la.split('\t')
        lanes = info[2].split(', ')
        for rg in lanes:
            fn = obj + '/' + info[0] + '/BAM/' + info[0] + '_' + rg + '.bam'
            stub = info[0] + '_' + rg
            swift_cmd = src_cmd + 'swift download ' + cont + ' --skip-identical ' + fn + ' >> dl.log 2>> dl.log'
            mf = mouse_filter + ' -b ' + fn + ' -o ' + stub
            cmd = swift_cmd + '; ' + mf + '; rm ' + fn + ';'
            job_list.append(cmd)
    fh.close()
    job_manager(job_list, th)
def fastqc_pipe(flist, config_file):
    (cont, obj, fastqc_tool, threads) = parse_config(config_file)
    src_cmd = '. /home/ubuntu/.novarc;'
    job_list = []
    for fq in open(flist):
        fq = fq.rstrip('\n')
        root = os.path.basename(fq).replace('_sequence.txt.gz', '')
        parts = fq.split('/')
        dl_cmd = src_cmd + 'swift download ' + cont + ' ' + fq + ';'
        outdir = obj + '/' + parts[1] + '/QC/'
        logdir = obj + '/' + parts[1] + '/LOGS/'
        setup_cmd = 'mkdir -p ' + outdir + ' ' + logdir + ';'
        logfile = logdir + root + '.fastqc.log'
        fastqc_cmd = fastqc_tool + ' --extract -o ' + outdir + ' ' + fq + ' 2> ' + logfile + ';'
        up_cmd = src_cmd + 'swift upload ' + cont + ' ' + logfile + ';'
        # quote the -name pattern so the shell doesn't expand the glob early
        up_cmd += 'find ' + outdir + ' -name "' + root + '*" | xargs -IFN swift upload ' + cont + ' FN;'
        cleanup = 'rm ' + fq + ';'
        final_cmd = dl_cmd + setup_cmd + fastqc_cmd + up_cmd + cleanup
        job_list.append(final_cmd)
    job_manager(job_list, threads)
def snpeff_pipe(config_file, sample_list, ref_mnt, novarc):
    (java, snpeff, snpsift, report, dbsnp, bed, cont, obj, max_t) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    bed = ref_mnt + '/' + bed
    fh = open(sample_list)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19 -interval ' + bed
    source_novarc(novarc)
    for line in fh:
        line = line.rstrip('\n')
        in_vcf = obj + '/' + line + '.merged.final.bam.germline_calls.vcf'
        sift_vcf = obj + '/' + line + '.snpSift.vcf'
        final_vcf = obj + '/' + line + '.snpSift.snpEff.vcf'
        dl_vcf = 'swift download ' + cont + ' ' + in_vcf + ';'
        log = 'LOGS/' + line + '.snpeff.log'
        run_cmd = dl_vcf + run_snpsift + ' ' + in_vcf + ' > ' + sift_vcf + ' 2> ' + log + ';' + run_snpeff + ' ' \
                  + sift_vcf + ' -v > ' + final_vcf + ' 2>> ' + log
        cmd_list.append(run_cmd)
    job_manager(cmd_list, max_t)
def capture_coverage(bedtools2_tool, sample, capture_bed_ref, wait_flag):
    prefix = capture_bed_ref[:-4]
    cc_t1_cmd = bedtools2_tool + ' coverage -hist -abam ' + sample + '.rmdup.srt.bam -b ' + prefix + '_t1.bed' \
                + ' | grep all > ' + sample + '.capture_t1.hist'
    cc_t2_cmd = bedtools2_tool + ' coverage -hist -abam ' + sample + '.rmdup.srt.bam -b ' + prefix + '_t2.bed' \
                + ' | grep all > ' + sample + '.capture_t2.hist'
    sys.stderr.write(date_time() + cc_t1_cmd + '\n' + cc_t2_cmd + '\n')
    if wait_flag == 0:
        Popen(cc_t1_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
        Popen(cc_t2_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
    else:
        jobs = [cc_t1_cmd, cc_t2_cmd]
        job_manager(jobs, 2)
    return 0
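
# NOTE (editor's usage example): wait_flag == 0 fires both bedtools jobs off in
# the background and returns immediately; any other value blocks on
# job_manager. The function expects <sample>.rmdup.srt.bam in the cwd and
# <prefix>_t1.bed / <prefix>_t2.bed next to the capture BED. Tool path, sample
# ID, and BED name below are hypothetical:
# capture_coverage('/usr/bin/bedtools', '2015-1234', 'capture_regions.bed', 1)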
def list_bam(cont, obj, sample, threads):
    list_cmd = '. /home/ubuntu/.novarc;swift list ' + cont + ' --prefix ' + obj + '/' + sample + '/'
    sys.stderr.write(date_time() + list_cmd + '\nGetting BAM list\n')
    flist = subprocess.check_output(list_cmd, shell=True)
    # collect download commands; job_manager will track their status
    p = []
    for fn in re.findall(r'(.*)\n', flist):
        if re.match(r'.*\.merged\.final\.ba', fn):
            sys.stderr.write(date_time() + 'Downloading relevant BAM file ' + fn + '\n')
            dl_cmd = '. /home/ubuntu/.novarc;swift download ' + cont + ' --skip-identical ' + fn + ' --output ' \
                     + os.path.basename(fn)
            p.append(dl_cmd)
    if len(p) < 1:
        sys.stderr.write(date_time() + 'No merged bam found for ' + sample + '\n')
        return 1
    f = job_manager(p, threads)
    if f == 0:
        sys.stderr.write(date_time() + 'BAM download complete\n')
        return 0
    else:
        sys.stderr.write(date_time() + 'BAM download failed\n')
        exit(1)
def mutect_pipe(config_file, tumor_id, normal_id):
    (java, mutect, intervals, fa_ordered, max_t, ram, project_dir, project, align) = parse_config(config_file)
    # break the interval list into per-chromosome chunks to run in parallel
    int_fh = open(intervals, 'r')
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = 'mkdir temp'
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = 'mkdir bed'
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split('\t')
        try:
            int_dict[chrom]['fh'].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]['fn'] = 'bed/intervals_' + chrom + '.bed'
            int_dict[chrom]['fh'] = open(int_dict[chrom]['fn'], 'w')
            int_dict[chrom]['fh'].write(interval)
        i += 1
    int_fh.close()
    job_ram = int(int(ram) / int(max_t))
    run_mut = java + ' -Djava.io.tmpdir=./temp -Xmx' + str(job_ram) + 'g -jar ' + mutect
    # cmd_list stores the commands to run; job_manager handles them via popen
    cmd_list = []
    bam_dir = project_dir + project + '/' + align
    tumor_bam = bam_dir + '/' + tumor_id + '/BAM/' + tumor_id + '.merged.final.bam'
    normal_bam = bam_dir + '/' + normal_id + '/BAM/' + normal_id + '.merged.final.bam'
    sys.stderr.write(date_time() + 'Processing pair T: ' + tumor_bam + ' N: ' + normal_bam + '\n')
    out = tumor_id + '_' + normal_id
    i = 1
    for intvl in sorted(int_dict):
        int_dict[intvl]['fh'].close()
        output_file = out + '.' + intvl + '.out'
        vcf_file = out + '.' + intvl + '.vcf'
        log_file = 'LOGS/' + out + '.mut.' + intvl + '.log'
        cur = run_mut + ' -T MuTect -fixMisencodedQuals -R ' + fa_ordered + ' --intervals ' + int_dict[intvl]['fn'] \
              + ' --input_file:normal ' + normal_bam + ' --input_file:tumor ' + tumor_bam \
              + ' --max_alt_alleles_in_normal_count 1000 --max_alt_alleles_in_normal_qscore_sum 37 ' \
                '--max_alt_allele_in_normal_fraction 0.05 --out ' + output_file + ' -vcf ' + vcf_file \
              + ' --enable_extended_output --strand_artifact_power_threshold 0 -log ' + log_file \
              + ' >> ' + log_file + ' 2>> ' + log_file + '; cat ' + output_file \
              + ' | grep -v REJECT > ' + output_file + '.keep; cat ' + vcf_file \
              + ' | grep -v REJECT > ' + vcf_file + '.keep '
        cmd_list.append(cur)
        i += 1
    # -fixMisencodedQuals fails if quals are already phred 33; if a job fails, retry without it
    try:
        job_manager(cmd_list, max_t)
    except:
        for i in range(0, len(cmd_list), 1):
            cmd_list[i] = cmd_list[i].replace('-fixMisencodedQuals ', '')
        job_manager(cmd_list, max_t)
    cleanup_temp_dirs = 'rm -rf temp bed'
    sys.stderr.write(date_time() + 'Cleaning up temp dirs ' + cleanup_temp_dirs + '\n')
    subprocess.call(cleanup_temp_dirs, shell=True)
    sys.stderr.write(date_time() + 'SNV calling completed for ' + out + '\n')
    return 0
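
# NOTE (editor's sketch): date_time() is used throughout purely as a log-line
# timestamp prefix. A minimal stand-in (the real helper lives in this repo's
# utility module and its exact format may differ):
import time


def date_time_sketch():
    return time.strftime('%Y-%m-%d %H:%M:%S ')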
def mutect_pipe(config_file, sample_pairs, ref_mnt):
    (java, gatk, intervals, fa_ordered, max_t, ram) = parse_config(config_file)
    intervals = ref_mnt + '/' + intervals
    # break the interval list into per-chromosome chunks to run in parallel
    int_fh = open(intervals, 'r')
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = 'mkdir temp'
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = 'mkdir bed'
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split('\t')
        intvl = start + '-' + end  # normally not needed when using a standard interval file
        try:
            int_dict[chrom]['fh'].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]['fn'] = 'bed/intervals_' + chrom + '.bed'
            int_dict[chrom]['fh'] = open(int_dict[chrom]['fn'], 'w')
            int_dict[chrom]['fh'].write(interval)
        i += 1
    int_fh.close()
    fa_ordered = ref_mnt + '/' + fa_ordered
    fh = open(sample_pairs)
    job_ram = int(ram) / int(max_t)
    run_mut = java + ' -Djava.io.tmpdir=./temp -Xmx' + str(job_ram) + 'g -jar ' + gatk
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    for line in fh:
        # cmd_list stores the commands to run; job_manager handles them via popen
        cmd_list = []
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        tumor_bam = tumor_id + '.merged.final.bam'
        normal_bam = normal_id + '.merged.final.bam'
        sys.stderr.write(date_time() + 'Processing pair T: ' + tumor_bam + ' N: ' + normal_bam + '\n')
        out = tumor_id + '_' + normal_id
        # make result directory for current pair
        mk_res = 'mkdir ' + out
        subprocess.call(mk_res, shell=True)
        i = 1
        for intvl in sorted(int_dict):
            int_dict[intvl]['fh'].close()
            vcf_file = out + '.' + intvl + '.vcf'
            log_file = 'LOGS/' + out + '.mut.' + intvl + '.log'
            cur = run_mut + ' -T MuTect2 -S LENIENT -R ' + fa_ordered + ' --intervals ' + int_dict[intvl]['fn'] \
                  + ' -I:normal ' + normal_bam + ' -I:tumor ' + tumor_bam \
                  + ' --max_alt_alleles_in_normal_count 1000 --max_alt_alleles_in_normal_qscore_sum 37' \
                    ' --max_alt_allele_in_normal_fraction 0.05 --out ' + out + '/' + vcf_file \
                  + ' 2>> ' + log_file + ';'
            cmd_list.append(cur)
            i += 1
        # -fixMisencodedQuals fails if quals are already phred 33; if a job fails, retry without it
        try:
            job_manager(cmd_list, max_t)
        except:
            for i in range(0, len(cmd_list), 1):
                cmd_list[i] = cmd_list[i].replace('-fixMisencodedQuals ', '')
            job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'Variant calling completed!\n')
    return 0
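
# NOTE (editor's example): the mutect_pipe() and snpeff_pipe() variants that
# take sample_pairs expect a tab-separated file with one pair per line:
# <pair_label>\t<tumor_id>\t<normal_id>. The IDs below are hypothetical:
#
#   2015-1000_2015-1001	2015-1000	2015-1001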
def mutect_pipe(config_file, sample_pairs, ref_mnt):
    (java, mutect, intervals, fa_ordered, max_t, ram) = parse_config(config_file)
    intervals = ref_mnt + '/' + intervals
    # break the interval list into per-chromosome chunks to run in parallel
    int_fh = open(intervals, 'r')
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = 'mkdir temp'
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = 'mkdir bed'
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split('\t')
        intvl = start + '-' + end  # normally not needed when using a standard interval file
        try:
            int_dict[chrom]['fh'].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]['fn'] = 'bed/intervals_' + chrom + '.bed'
            int_dict[chrom]['fh'] = open(int_dict[chrom]['fn'], 'w')
            int_dict[chrom]['fh'].write(interval)
        i += 1
    int_fh.close()
    fa_ordered = ref_mnt + '/' + fa_ordered
    fh = open(sample_pairs)
    job_ram = int(ram) / int(max_t)
    run_mut = java + ' -Djava.io.tmpdir=./temp -Xmx' + str(job_ram) + 'g -jar ' + mutect
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    for line in fh:
        # cmd_list stores the commands to run; job_manager handles them via popen
        cmd_list = []
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        tumor_bam = tumor_id + '.merged.final.bam'
        normal_bam = normal_id + '.merged.final.bam'
        sys.stderr.write(date_time() + 'Processing pair T: ' + tumor_bam + ' N: ' + normal_bam + '\n')
        out = tumor_id + '_' + normal_id
        # make result directory for current pair
        mk_res = 'mkdir ' + out
        subprocess.call(mk_res, shell=True)
        i = 1
        for intvl in sorted(int_dict):
            int_dict[intvl]['fh'].close()
            output_file = out + '.' + intvl + '.out'
            vcf_file = out + '.' + intvl + '.vcf'
            log_file = 'LOGS/' + out + '.mut.' + intvl + '.log'
            cur = run_mut + ' -T MuTect -fixMisencodedQuals -R ' + fa_ordered \
                  + ' --intervals ' + int_dict[intvl]['fn'] \
                  + ' --input_file:normal ' + normal_bam + ' --input_file:tumor ' + tumor_bam \
                  + ' --max_alt_alleles_in_normal_count 1000 --max_alt_alleles_in_normal_qscore_sum 37 ' \
                    '--max_alt_allele_in_normal_fraction 0.05 --out ' + out + '/' + output_file \
                  + ' -vcf ' + out + '/' + vcf_file \
                  + ' --enable_extended_output --strand_artifact_power_threshold 0 -log ' + log_file \
                  + ' >> ' + log_file + ' 2>> ' + log_file \
                  + '; cat ' + out + '/' + output_file + ' | grep -v REJECT > ' + out + '/' + output_file + '.keep' \
                  + '; cat ' + out + '/' + vcf_file + ' | grep -v REJECT > ' + out + '/' + vcf_file + '.keep'
            cmd_list.append(cur)
            i += 1
        # -fixMisencodedQuals fails if quals are already phred 33; if a job fails, retry without it
        try:
            job_manager(cmd_list, max_t)
        except:
            for i in range(0, len(cmd_list), 1):
                cmd_list[i] = cmd_list[i].replace('-fixMisencodedQuals ', '')
            job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'Variant calling completed!\n')
    return 0