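# Collected pipeline helpers. The module-level imports below are an assumption:
# these functions are excerpted from separate modules and also rely on shared
# helpers (date_time, log, parse_config, etc.) defined elsewhere in the repo.
import copy
import json
import math
import os
import re
import signal
import subprocess
import sys

import psutil
from pysam import VariantFile
from subprocess import call, check_output, Popen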
def platypus_germline(config_file, sample, log_dir, cflag):
    loc = log_dir + sample + ".platypus.log"
    # here for safety as python is confusing about whether variables exist outside of if-else statements or not
    platypus_cmd = ''
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project, align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + loc + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        if not (os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')):
            log(loc, date_time() + bam + ' not indexed. Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    f = 0
    try:
        f = subprocess.call(platypus_cmd, shell=True)
    except:
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
        return f
    return 0
def calc_coverage(sample_list, suffix, bed):
    slist = []
    sys.stderr.write(date_time() + 'Processing bed file ' + bed + '\n')
    (gene_list, gene_dict, master_dict) = process_bed(bed)
    for sample in open(sample_list):
        sys.stderr.write(date_time() + 'Processing sample ' + sample)
        sample = sample.rstrip('\n')
        slist.append(sample)
        cur = sample + suffix
        temp_dict = copy.deepcopy(gene_dict)
        for entry in open(cur):
            info = entry.rstrip('\n').split('\t')
            if info[0] == 'all':
                break
            parts = info[3].split('_')
            temp_dict[parts[0]]['tot_cov'] += (int(info[4]) * int(info[5]))
        for gene in gene_list:
            master_dict[gene][sample] = (float(temp_dict[gene]['tot_cov']) / temp_dict[gene]['len'])
    sys.stderr.write(date_time() + 'Outputting results\n')
    sys.stdout.write('Gene/Sample\t')
    print('\t'.join(slist))
    for gene in gene_list:
        sys.stdout.write(gene)
        for sample in slist:
            sys.stdout.write('\t' + str(master_dict[gene][sample]))
        print()
    sys.stderr.write(date_time() + 'Fin!\n')
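# Illustrative sketch (assumption): calc_coverage expects process_bed to return
# structures shaped like the stand-in below -- per-gene coverage accumulators
# plus an empty per-gene/per-sample results table for it to fill in.
def process_bed_example():
    gene_list = ['TP53']
    gene_dict = {'TP53': {'tot_cov': 0, 'len': 19149}}
    master_dict = {'TP53': {}}
    return gene_list, gene_dict, master_dict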
def fastqc(fastqc_tool, sample, end1, end2, t):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2
    log(loc, date_time() + fastqc_cmd + "\n")
    f = Popen(fastqc_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
    # check after 20 seconds whether the process is still good - shouldn't take too long to ascertain whether
    # the phred score didn't fit
    call('sleep 20s', shell=True)
    if str(f.poll()) == '1':
        log(loc, date_time() + 'fastqc returned an error. Check your inputs and try again!\n')
        exit(1)
    return 0
def lane_express_quant(bams, config_file):
    (stranded, strand, express, express_sl, transcriptome) = parse_config(config_file)
    for bam in open(bams):
        bam = bam.rstrip('\n')
        bam_dir = os.path.dirname(bam)
        root = os.path.basename(re.sub('.Aligned.toTranscriptome.out.*', '', bam))
        qc_dir = bam_dir.replace('BAMS', 'QC')
        qc_file = qc_dir + '/' + root + '.qc_stats.json'
        qc_data = json.loads(open(qc_file, 'r').read())
        # mean (x) and std dev (s) of insert size, rounded to whole bases
        (x, s) = (str(int(round(float(qc_data['picard_stats']['x_ins_size'])))),
                  str(int(round(float(qc_data['picard_stats']['s_ins_size'])))))
        wd = qc_dir + '/' + root + '/'
        loc = wd + root + '.log'
        express_cmd = 'mkdir ' + wd + ';'
        call(express_cmd, shell=True)
        sys.stderr.write(date_time() + 'Created dir ' + wd + ' to quantify ' + bam + '\n' + express_cmd + '\n')
        if stranded == 'N':
            express_cmd = express + ' ' + transcriptome + ' ' + bam + ' --no-update-check -o ' + wd + ' -m ' \
                          + x + ' -s ' + s + ' --logtostderr 2>> ' + loc + ';'
        else:
            express_cmd = 'sbatch -c 4 --export=express="' + express + '",transcriptome="' + transcriptome \
                          + '",bam="' + bam + '",wd="' + wd + '",strand="' + strand + '",x="' + x + '",s="' + s \
                          + '",loc="' + loc + '",root="' + root + '" ' + express_sl
            # express + ' ' + transcriptome + ' ' + bam + ' --no-update-check -o ' + wd + ' --' \
            # + strand + ' -m ' + x + ' -s ' + s + ' --logtostderr 2>> ' + loc + ';'
            # express_cmd += 'mv ' + wd + 'results.xprs ' + wd + root + '.express_quantification.txt; mv ' + wd \
            # + 'params.xprs ' + wd + root + '.params.xprs;'
        sys.stderr.write(date_time() + 'Submitting quantification job\n' + express_cmd + '\n')
        call(express_cmd, shell=True)
    return 0
def gen_report(vcf):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    out = open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}
    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
              '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                           record.alts[0], str(record.info['MINCOV']),
                                                           str(record.info['ALTCOV']), str(record.info['COVRATIO']))
        ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)
    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def downsample_bam(samtools, bam, frac, out_dir, th):
    out_root = os.path.basename(bam.replace('.bam', ''))
    cmd = 'sbatch -c ' + th + ' ' + samtools + ' view --threads ' + th + ' -b ' + bam + ' -s ' + frac + ' > ' \
          + out_dir + '/' + out_root + '_subsample_' + frac + '.bam'
    sys.stderr.write(date_time() + 'Downsampling ' + bam + '\n' + cmd + '\n')
    subprocess.call(cmd, shell=True)
    sys.stderr.write(date_time() + 'process complete!\n')
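# Usage sketch (hypothetical paths/values): keep ~10% of reads using 4 threads.
# downsample_bam('/usr/local/bin/samtools', '/data/S1.bam', '0.1', '/data/subsampled', '4')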
def novosort_merge_pe(config_file, sample_list):
    fh = open(sample_list, 'r')
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, novo_merge_rmdup_slurm) \
        = parse_config(config_file)
    for sample in fh:
        sample = sample.rstrip('\n')
        loc = '../LOGS/' + sample + '.novosort_merge.log'
        job_loc = sample + '.novosort_merge.log'
        (bam_list, n) = list_bam(project, align, sample)
        bam_string = " ".join(bam_list)
        cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAMS/'
        os.chdir(cur_dir)
        out_bam = sample + '.merged.transcriptome.bam'
        if n > 1:
            batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' --export=novosort="' \
                    + novosort + '",threads="' + threads + '",ram="' + ram + 'G",out_bam="' + out_bam \
                    + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' + novo_merge_rmdup_slurm
            log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
            subprocess.call(batch, shell=True)
        else:
            link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.transcriptome.bam;'
            log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n' + link_bam + '\n')
            subprocess.call(link_bam, shell=True)
    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
def preprocess_bams(config_file, sample_pairs):
    # create sample list
    sample_list = 'sample_list.txt'
    fh = open(sample_pairs, 'r')
    sl = open(sample_list, 'w')
    temp = {}
    for line in fh:
        cur = line.rstrip('\n').split('\t')
        # three columns means a tumor/normal pair; one column means a single sample
        if len(cur) == 3:
            if cur[1] not in temp:
                sl.write(cur[1] + '\n')
                temp[cur[1]] = 1
            if cur[2] not in temp:
                sl.write(cur[2] + '\n')
                temp[cur[2]] = 1
        else:
            if cur[0] not in temp:
                sl.write(cur[0] + '\n')
                temp[cur[0]] = 1
    sl.close()
    fh.close()
    miss_list = check_for_merged_bams(config_file, sample_list)
    if len(miss_list) > 0:
        sys.stderr.write(date_time() + 'Missing files detected, merging lane files\n')
        temp_fn = 'temp_samp_list.txt'
        temp_fh = open(temp_fn, 'w')
        temp_fh.write('\n'.join(miss_list))
        temp_fh.close()
        run_novosort(config_file, temp_fn)
    else:
        sys.stderr.write(date_time() + 'All bams found. Ready for next step!\n')
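# Input format sketch (inferred from the parsing above): sample_pairs is
# tab-separated, with either three columns (pair id, tumor id, normal id)
# or a lone sample id per line, e.g. (hypothetical ids):
# Pair1<tab>2016-1234<tab>2016-1235
# 2016-5678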
def organize_dirs(self):
    # check for existing BAM, QC and LOG dirs one level up
    try:
        if not os.path.isdir('../' + self.bam_dir):
            mk_bam_dir = 'mkdir ../' + self.bam_dir
            log(self.loc, date_time() + 'Making BAM directory ' + mk_bam_dir + '\n')
            call(mk_bam_dir, shell=True)
        if not os.path.isdir('../' + self.qc_dir):
            mk_qc_dir = 'mkdir ../' + self.qc_dir
            log(self.loc, date_time() + 'Making QC directory ' + mk_qc_dir + '\n')
            call(mk_qc_dir, shell=True)
        if not os.path.isdir('../' + self.log_dir):
            mk_log_dir = 'mkdir ../' + self.log_dir
            log(self.loc, date_time() + 'Making LOGS directory ' + mk_log_dir + '\n')
            call(mk_log_dir, shell=True)
        reloc_files = 'mv ' + self.bam_dir + '* ../' + self.bam_dir + '; mv ' + self.log_dir + '* ../' \
                      + self.log_dir + '; mv ' + self.qc_dir + '* ../' + self.qc_dir
        log(self.loc, date_time() + 'Relocating files ' + reloc_files + '\n')
        call(reloc_files, shell=True)
        # need to reassign log file location since it's being moved!
        self.loc = '../' + self.loc
        rm_old = 'rmdir ' + ' '.join((self.bam_dir, self.log_dir, self.qc_dir))
        log(self.loc, date_time() + 'Clearing out working dirs ' + rm_old + '\n')
        call(rm_old, shell=True)
        return 0
    except:
        return 1
def download_from_swift(cont, obj, lane_list):
    src_cmd = ". /home/ubuntu/.novarc;"
    lanes = open(lane_list, 'r')
    print('BID\tread group\ttotal starting read pairs(rp)\t% r1 w/ adapter\t% r2 w/ adapter\trp too short\t% rp passed'
          '\ttotal starting base pairs(bp)\tread1 bp trimmed\tread2 bp trimmed\t% bp written')
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = obj + '/' + bid + '/LOGS/' + bid + '_' + lane + '.cutadapt.log'
            swift_cmd = src_cmd + "swift download " + cont + " --skip-identical " + cur
            sys.stderr.write(date_time() + swift_cmd + "\n")
            try:
                check = check_output(swift_cmd, shell=True, stderr=subprocess.PIPE).decode()
            except:
                sys.stderr.write(date_time() + "Download of " + obj + " from " + cont + " failed\n")
                exit(1)
            temp = parseCUTADAPT(cur)
            print(bid + '\t' + lane + '\t' + '\t'.join(temp))
    lanes.close()
    return 0
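# Input format sketch (inferred from the split above): each lane_list line is
# <BID><tab><seqtype><tab><comma+space-separated lane names>, e.g. (hypothetical):
# 2016-1234<tab>capture<tab>160907_K00180_0177_AHF3GFBBXX_6, 160907_K00180_0177_AHF3GFBBXX_7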
def batch_qc(fn, cont, obj, t):
    fh = open(fn, 'r')
    src_cmd = '. ~/.novarc;'
    jobs = []
    for line in fh:
        line = line.rstrip('\n')
        # All files for current bnid to be stored in cwd
        swift_cmd = src_cmd + 'swift list ' + cont + ' --prefix ' + obj + '/' + line
        sys.stderr.write(date_time() + 'Checking for sequence files for sample ' + line + '\n' + swift_cmd + '\n')
        try:
            contents = subprocess.check_output(swift_cmd, shell=True).decode()
            if len(contents) < len(line):
                sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ' skipping!\n')
                continue
        except:
            sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ' skipping!\n')
            continue
        seqfile = re.findall(r'(\S+[sequence|f*q]*\.gz)', contents)
        sf1 = seqfile[0]
        end1 = os.path.basename(sf1)
        sf2 = seqfile[1]
        end2 = os.path.basename(sf2)
        swift_cmd = src_cmd + "swift download " + cont + " --skip-identical --prefix " + obj + '/' + line
        link_cmd = 'ln -s ' + sf1 + ' .;ln -s ' + sf2 + ' .'
        fastqc_cmd = 'mkdir -p PREQC/' + line + '; fastqc -t 2 -o PREQC/' + line + ' ' + sf1 + ' ' + sf2
        upload_cmd = src_cmd + 'swift upload ' + cont + ' PREQC/' + line
        cleanup_cmd = 'rm -rf RAW/' + line + ' PREQC/' + line + ' ' + end1 + ' ' + end2
        jobs.append(';'.join([swift_cmd, link_cmd, fastqc_cmd, upload_cmd, cleanup_cmd]))
    sys.stderr.write(date_time() + 'Job list created, running jobs!\n')
    job_manager(jobs, t)
    return 0
def gen_report(vcf, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    sample = parts[0]
    loc = 'LOGS/' + sample + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    out_fn = sample + '.indels.vep.prioritized_impact.report.xls'
    out = open(out_fn, 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}
    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
              '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                           record.alts[0], str(record.info['MINCOV']),
                                                           str(record.info['ALTCOV']), str(record.info['COVRATIO']))
        ann_list = [_.split('|') for _ in record.info['ANN']]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out, ref_flag)
    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def job_manager(cmd_list, max_t):
    x = len(cmd_list)
    # cur position in command list
    cur = 0
    # completed
    comp = 0
    # initialize process list
    p = {}
    sys.stderr.write(date_time() + 'Initializing run\n')
    n = int(max_t)
    if n > x:
        n = x
    for i in range(0, n, 1):
        p[i] = {}
        p[i]['job'] = subprocess.Popen(cmd_list[i], shell=True)
        p[i]['cmd'] = cmd_list[i]
        p[i]['status'] = 'Running'
        sys.stderr.write(cmd_list[i] + '\n')
        cur += 1
    # poll every j seconds, print a status summary every m seconds
    s = 0
    j = 30
    m = 30
    while comp < x:
        if s % m == 0:
            sys.stderr.write(date_time() + 'Checking job statuses. ' + str(comp) + ' of ' + str(x)
                             + ' completed. ' + str(s) + ' seconds have passed\n')
        for i in range(0, n, 1):
            check = p[i]['job'].poll()
            if str(check) == '1':
                sys.stderr.write(date_time() + 'Job returned an error while running ' + p[i]['cmd']
                                 + ' aborting!\n')
                for k in range(0, n, 1):
                    p[k]['job'].kill()
                    sys.stderr.write('Killing job ' + str(k) + '\n')
                exit(1)
            if str(check) == '0' and p[i]['status'] != str(check):
                # slot i finished cleanly - count it, then refill the slot with the next queued command
                comp += 1
                p[i]['status'] = str(check)
                if comp <= (x - n):
                    try:
                        p[i]['job'] = subprocess.Popen(cmd_list[cur], shell=True)
                        p[i]['cmd'] = cmd_list[cur]
                        p[i]['status'] = 'Running'
                        cur += 1
                    except:
                        sys.stderr.write(date_time() + "Tried to queue command " + p[i]['cmd'] + '\n was '
                                         + str(cur) + ' in command list, ' + str(i) + ' in queue list\n')
                        exit(1)
        s += j
        sleep_cmd = 'sleep ' + str(j) + 's'
        subprocess.call(sleep_cmd, shell=True)
    sys.stderr.write(date_time() + str(comp) + ' jobs completed\n')
    return 0
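# Usage sketch (hypothetical commands): run up to 4 shell jobs at a time,
# refilling slots as they finish.
# job_manager(['gzip a.fq', 'gzip b.fq', 'gzip c.fq', 'gzip d.fq', 'gzip e.fq'], 4)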
def annot_platypus(config_file, sample, skip):
    (vep_tool, vep_cache, plugin_dir, fasta, threads, java, cadd_snv, cadd_indel, tx_index, project_dir, project,
     analysis, annotation, user, group) = parse_config(config_file)
    src_env = '. /etc/environment'
    subprocess.call(src_env, shell=True)
    ana_dir = project_dir + project + '/' + analysis + '/' + sample
    if skip == 'n':
        pass_filter(ana_dir + '/' + sample)
        set_acls(ana_dir, user, group)
    in_vcf = ana_dir + '/' + sample + '.germline_pass.vcf'
    out_vcf = sample + '.germline.vep91.vcf'
    buffer_size = '5000'
    ann_dir = project_dir + project + '/' + annotation + '/' + sample
    if not os.path.isdir(ann_dir):
        mk_ann = 'mkdir -p ' + ann_dir
        sys.stderr.write('Creating annotation output directories ' + mk_ann + '\n')
        subprocess.call(mk_ann, shell=True)
    os.chdir(ann_dir)
    sys.stderr.write(date_time() + 'Changed to working directory ' + ann_dir + '\n')
    if int(threads) > 1:
        threads = str(int(threads) - 1)
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd_snv, cadd_indel, sample,
                      buffer_size, plugin_dir)
    sys.stderr.write(date_time() + 'Annotating sample ' + in_vcf + ' ' + run_cmd + '\n')
    # from stack overflow, to allow killing of spawned processes if the main process fails, for cleaner restart
    check = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    check_run = watch_mem(check, sample)
    if check_run != 0:
        # halve the buffer and retry once if VEP blew past the memory ceiling
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm \'' + out_vcf + '*\''
        sys.stderr.write(date_time() + 'VEP failed. Status of run was ' + str(check_run)
                         + ' Trying smaller buffer size of ' + buffer_size + '\n' + clean_up + '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except:
            sys.stderr.write(date_time() + 'Killing process failed. Might have already died for other reasons...\n')
        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd_snv, cadd_indel, sample,
                          buffer_size, plugin_dir)
        sys.stderr.write(date_time() + 'Annotating sample ' + sample + ' ' + in_vcf + '\n')
        check = subprocess.call(run_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'VEP failed for sample ' + sample + '\n')
            exit(1)
    else:
        sys.stderr.write(date_time() + 'VEP annotation of ' + in_vcf + ' successful!\n')
    check = gen_report(out_vcf, sample, tx_index)
    if check == 0:
        sys.stderr.write(date_time() + 'Summary table of germline calls completed!\n')
    else:
        sys.stderr.write(date_time() + 'Summary table for ' + out_vcf + ' FAILED!\n')
        return 1
    set_acls(ann_dir, user, group)
    sys.stderr.write(date_time() + 'VEP91 annotation of ' + sample + ' complete!\n')
    return 0
def vep(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source):
    from annotation.annot_vcf_vep import annot_vcf_vep_pipe
    check = annot_vcf_vep_pipe(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source)
    if check == 0:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output successful.\n')
    else:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output failed.\n')
        exit(1)
def run_novosort(config_file, sample_list):
    check = novosort_merge_pe(config_file, sample_list)
    if check == 0:
        sys.stderr.write(date_time() + 'File merge complete!\n')
    else:
        sys.stderr.write(date_time() + 'File download and merge failed.\n')
        exit(1)
def novosort_merge_pe(config_file, sample_list):
    fh = open(sample_list, 'r')
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, rmdup, novo_merge_rmdup_slurm,
     novo_picard_merge_rmdup_slurm) = parse_config(config_file)
    for sample in fh:
        sample = sample.rstrip('\n')
        loc = sample + '.novosort_merge.log'
        (bam_list, bai_list, n) = list_bam(project, align, sample)
        bam_string = " ".join(bam_list)
        cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAM/'
        os.chdir(cur_dir)
        out_bam = sample + '.merged.final.bam'
        if n > 1:
            if rmdup == 'Y':
                job_loc = sample + '.novosort_merge.log'
                job_name = sample + '_novosort_merge'
                batch = 'sbatch -c ' + threads + ' -J ' + job_name + ' --mem ' + ram + 'G -o ' + job_loc \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",out_bam="' + out_bam + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' \
                        + novo_merge_rmdup_slurm
                log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                subprocess.call(batch, shell=True)
            else:
                # run legacy pipe for removing dups using picard
                picard_tmp = 'picard_tmp'
                job_loc = sample + '.novosort_merge.picard_rmdup.log'
                job_name = sample + '_novosort_merge.picard_rmdup'
                # setting max records in ram to half of ram
                recs = str(int((int(ram) / 2) * (1000000000 / 200)))
                in_bam = sample + '.merged.bam'
                in_bai = sample + '.merged.bam.bai'
                mets = sample + '.rmdup.srt.metrics'
                batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' -J ' + job_name \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",in_bam="' + in_bam + '",bam_string="' + bam_string + '",loc="' + job_loc \
                        + '",java_tool="' + java_tool + '",picard_tool="' + picard_tool + '",tmp="' + picard_tmp \
                        + '",recs="' + recs + '",out_bam="' + out_bam + '",mets="' + mets + '",in_bai="' + in_bai \
                        + '" ' + novo_picard_merge_rmdup_slurm
                sys.stderr.write(date_time() + 'Merging with novosort and rmdup with picard for legacy reasons!\n'
                                 + batch + '\n')
                subprocess.call(batch, shell=True)
        else:
            link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.final.bam; ln -s ' + bai_list[0] + ' ' \
                       + sample + '.merged.final.bam.bai'
            log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n' + link_bam + '\n')
            subprocess.call(link_bam, shell=True)
    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
def run_novosort(config_file, sample_list, obj):
    check = novosort_merge_pe(config_file, sample_list)
    if check == 0:
        sys.stderr.write(date_time() + 'File download and merge complete!\n')
        # rm unmerged bams, no longer needed
        rm_bam = 'rm -rf ' + obj
        call(rm_bam, shell=True)
    else:
        sys.stderr.write(date_time() + 'File download and merge failed.\n')
        exit(1)
def find_project_files(file_dir, file_prefix):
    find_cmd = "find " + file_dir + " -name '" + file_prefix + "*'"
    sys.stderr.write(date_time() + find_cmd + "\n")
    try:
        results = check_output(find_cmd, shell=True, stderr=subprocess.PIPE).decode()
        return results
    except:
        sys.stderr.write(date_time() + "Search of " + file_prefix + " from " + file_dir + " failed\n")
        exit(1)
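# Usage sketch (hypothetical paths): returns newline-separated matches from find.
# hits = find_project_files('/cephfs/PROJECTS/PanCan', '2016-1234')
# for hit in hits.splitlines():
#     print(hit)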
def watch_mem(proc_obj, sample, loc):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample + '\n')
        if mem_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.\n')
            return 1
        sleep(30)
    return proc_obj.poll()
def watch_mem(proc_obj, sample):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        sys.stderr.write(date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample '
                         + sample + ' from platypus\n')
        if mem_pct >= 99:
            sys.stderr.write(date_time() + 'Memory exceeded while running VEP.\n')
            return 1
        sleep(30)
    return proc_obj.poll()
def vep(config_file, sample_pairs, in_suffix, out_suffix, in_mutect, source, vep_cache):
    if vep_cache == '84':
        from annotation.deprecated.annot_vcf_vep import annot_vcf_vep_pipe
    else:
        from annotation.annot_vcf_VEP91 import annot_vcf_vep_pipe
    check = annot_vcf_vep_pipe(config_file, sample_pairs, in_suffix, out_suffix, in_mutect, source)
    if check == 0:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output successful.\n')
    else:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output failed.\n')
        exit(1)
    return 0
def watch_mem(proc_obj, source, sample, loc):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample
            + ' from source ' + source + '\n')
        if mem_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.\n')
            return 1
        sleep(30)
    return proc_obj.poll()
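# Usage sketch: pair watch_mem with a process spawned in its own group so a
# memory bailout can be followed by os.killpg, as annot_platypus does above
# (sample name and log path are hypothetical).
# proc = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
# status = watch_mem(proc, 'mutect', 'SAMPLE-001', 'LOGS/SAMPLE-001.log')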
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " CollectInsertSizeMetrics I=" \
                             + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
                             + sample + ".insert_metrics.hist >> " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        call(picard_insert_size_cmd, shell=True)
        return 0
    except:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
def scalpel_indel(tumor_id, normal_id, log_dir, config_file):
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed, wg, project_dir, project, align) \
        = parse_config(config_file)
    sample_pair = tumor_id + '_' + normal_id
    loc = log_dir + sample_pair + '.scalpel.log'
    bam_dir = project_dir + project + '/' + align
    tumor_bam = bam_dir + '/' + tumor_id + '/BAM/' + tumor_id + '.merged.final.bam'
    normal_bam = bam_dir + '/' + normal_id + '/BAM/' + normal_id + '.merged.final.bam'
    if wg == 'n':
        scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
                      + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
        sys.stderr.write(date_time() + 'Starting indel calls for ' + sample_pair + '\n')
        log(loc, date_time() + 'Starting indel calls for ' + sample_pair + ' in capture mode with command:\n'
            + scalpel_cmd + '\n')
        check = call(scalpel_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'Indel calling failed for pair ' + sample_pair + ' with command:\n'
                             + scalpel_cmd + '\n')
            exit(1)
    else:
        check = wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus, sample_pair, config_file)
        if check[0] != 0:
            sys.stderr.write('Scalpel failed for ' + normal_id + ' at ' + tumor_id + '\n')
            exit(1)
    log(loc, date_time() + 'Indel calling complete for pair ' + sample_pair + ' moving output files\n')
    mv_cmd = 'mv outdir/main/* .; rmdir outdir/main;'
    log(loc, date_time() + mv_cmd + '\n')
    call(mv_cmd, shell=True)
    sys.stderr.write(date_time() + 'Completed indel calls for ' + sample_pair + '\n')
    if dustmask_flag == 'Y':
        log(loc, date_time() + 'Filter dustmask flag given\n')
        check = filter_indel(bedtools, dustmask_bed, sample_pair, loc)
        if check != 0:
            sys.stderr.write(date_time() + 'Dustmask failed for ' + sample_pair + '\n')
            exit(1)
        else:
            log(loc, date_time() + 'Dustmask complete for ' + sample_pair + '\n')
    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
def list_bam(project, align, sample):
    bam_dir = '/cephfs/PROJECTS/' + project + '/' + align + '/' + sample + '/BAMS/'
    find_bam_cmd = 'find ' + bam_dir + ' -name \'*.Aligned.toTranscriptome.out.bam\''
    sys.stderr.write(date_time() + find_bam_cmd + '\nGetting BAM list\n')
    try:
        bam_find = subprocess.check_output(find_bam_cmd, shell=True).decode().rstrip('\n')
        bam_list = bam_find.split('\n')
        ct = len(bam_list)
        return bam_list, ct
    except:
        sys.stderr.write(date_time() + 'No bams found for ' + sample + '\n')
        exit(1)
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir, threads, novosort, mem):
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    mk_srt_tmp = 'mkdir TMP'
    subprocess.call(mk_srt_tmp, shell=True)
    # split threads for star and novosort as well as memory
    nmem = 2
    ncpu = 2
    threads = int(threads)
    sthreads = threads
    if threads >= 10:
        if threads == 10:
            sthreads = 6
            ncpu = 4
        else:
            if threads % 2.0 == 0.0:
                sthreads = int(threads / 2)
                ncpu = int(threads / 2)
            else:
                sthreads = int(math.ceil(threads / 2.0))
                ncpu = int(math.floor(threads / 2.0))
    else:
        sthreads = int(sthreads) - 2
    mem = int(mem)
    if mem > 42:
        nmem = mem - 40
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
               + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref \
               + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
                 "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
                 "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
                 "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + " | " + novosort + " - -n -c " \
               + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample + ".mmu.nsrt.bam | python " \
               + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " + sample \
               + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"
    log(loc, date_time() + star_cmd + '\n')
    try:
        subprocess.call(star_cmd, shell=True)
    except:
        log(loc, date_time() + 'Star alignment and filter against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
def annot_platypus(config_file, sample):
    (vep_tool, vep_cache, plugin_dir, fasta, threads, java, cadd, tx_index, project_dir, project, analysis) \
        = parse_config(config_file)
    ana_dir = project_dir + project + '/' + analysis + '/' + sample
    pass_filter(ana_dir + '/' + sample)
    in_vcf = ana_dir + '/' + sample + '.germline_pass.vcf'
    out_vcf = sample + '.germline_pass.vep.vcf'
    buffer_size = '2000'
    if int(threads) > 1:
        threads = str(int(threads) - 1)
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd, sample, buffer_size, plugin_dir)
    sys.stderr.write(date_time() + 'Annotating sample ' + in_vcf + ' ' + run_cmd + '\n')
    # from stack overflow, to allow killing of spawned processes if the main process fails, for cleaner restart
    check = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    check_run = watch_mem(check, sample)
    if check_run != 0:
        # halve the buffer and retry once (integer division keeps it a valid int string)
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm \'' + out_vcf + '*\''
        sys.stderr.write(date_time() + 'VEP failed. Status of run was ' + str(check_run)
                         + ' Trying smaller buffer size of ' + buffer_size + '\n' + clean_up + '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except:
            sys.stderr.write(date_time() + 'Killing process failed. Might have already died for other reasons...\n')
        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd, sample, buffer_size, plugin_dir)
        sys.stderr.write(date_time() + 'Annotating sample ' + sample + ' ' + in_vcf + '\n')
        check = subprocess.call(run_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'VEP failed for sample ' + sample + '\n')
            exit(1)
    else:
        sys.stderr.write(date_time() + 'VEP annotation of ' + in_vcf + ' successful!\n')
    check = gen_report(out_vcf, sample, tx_index)
    if check == 0:
        sys.stderr.write(date_time() + 'Summary table of germline calls completed!\n')
    else:
        sys.stderr.write(date_time() + 'Summary table for ' + out_vcf + ' FAILED!\n')
        return 1
    return 0
def gen_report(vcf, out, c, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.substitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    out = open(parts[0] + '.substitutions.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}
    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
              'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
              'codon_change\tamino_acid_change\ton/off-target\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
        ann_list = [_.split('|') for _ in record.info['ANN']]
        tflag = 'NA'
        if c != 'n':
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue
        output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, out, ref_flag)
    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def cutadapter(sample, end1, end2, config_file):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    # designed to be run in a subdirectory, keep original file names
    sf1 = end1
    sf2 = end2
    end1 = os.path.basename(sf1)
    end2 = os.path.basename(sf2)
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual, mqual) = parse_config(config_file)
    cut_th = threads
    if int(cut_th) >= 4:
        cut_th = str(int(int(threads) / 2))
    if r1adapt == '' and r2adapt == '':
        # no adapter sequences given - quality/length trim only
        cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                       + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 \
                       + ' >> ' + loc + ' 2>> ' + loc
    else:
        cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                       + ' -a ' + r1adapt + ' -A ' + r2adapt + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 \
                       + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 + ' >> ' + loc + ' 2>> ' + loc
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
def list_bam(project, project_dir, align_dir, sample):
    bam = project_dir + project + '/' + align_dir + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
    check_file = os.path.isfile(bam)
    if not check_file:
        sys.stderr.write(date_time() + 'Merged bam ' + bam + ' not found.\n')
    return check_file
def mutect_merge_sort(config_file, sample_pair):
    # use fasta index to get sort order for file output
    (fai) = parse_config(config_file)
    fai_list = []
    fai_fh = open(fai, 'r')
    for line in fai_fh:
        line = line.rstrip('\n')
        data = line.split('\t')
        fai_list.append(data[0])
    fai_fh.close()
    # output files should be in directory named after sample-pairs
    dir_list = os.listdir('./')
    suffix_dict = {}
    for fn in dir_list:
        parts = fn.split('.')
        if len(parts) >= 3:
            if parts[2] == 'out' or parts[2] == 'vcf':
                suffix = '.'.join(parts[2:])
                if suffix not in suffix_dict:
                    suffix_dict[suffix] = []
                suffix_dict[suffix].append(fn)
    merge_sort(suffix_dict, sample_pair, fai_list)
    sys.stderr.write(date_time() + 'File merging completed\n')
    return 0
def snpeff_pipe(config_file, sample_pairs, ref_mnt, cflag):
    # edit to grab max thread count from config
    max_t = 8
    (java, snpeff, snpsift, report, dbsnp, intervals) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    intervals = ref_mnt + '/' + intervals
    fh = open(sample_pairs)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19 '
    for line in fh:
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        # run snpsift first, then snpeff
        run_report = report + ' -i ' + sample + '.out.keep.eff.vcf -c '
        if cflag == 'n':
            run_report += intervals
        else:
            run_report += 'n'
        run_report += ' > ' + sample + '.vcf.keep.eff.xls'
        run_snp = run_snpsift + ' ' + sample + '.out.keep > ' + sample + '.out.keep.sift.vcf 2> LOGS/' + sample \
                  + '.snpeff.log;' + run_snpeff + ' ' + sample + '.out.keep.sift.vcf -v > ' + sample \
                  + '.out.keep.eff.vcf 2>> LOGS/' + sample + '.snpeff.log;' + run_report
        cmd_list.append(run_snp)
    job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'SNP annotation completed!\n')
    return 0
def parse_config(json_config):
    config_data = json.loads(open(json_config, 'r').read())
    try:
        return config_data['tools']['slurm_wrap'], config_data['tools']['mojo_pipe'], \
               config_data['params']['threads'], config_data['params']['ram']
    except:
        try:
            sys.stderr.write(date_time() + 'Accessing keys failed. Attempting to output current keys:\n')
            for key in config_data:
                sys.stderr.write(key + '\n')
                for subkey in config_data[key]:
                    sys.stderr.write(key + ":" + subkey + ":" + config_data[key][subkey] + '\n')
            exit(1)
        except:
            sys.stderr.write(date_time() + 'Could not read config file ' + json_config + '\n')
            exit(1)
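# Config sketch matching the keys read above (paths/values are placeholders):
# {
#   "tools": {"slurm_wrap": "/path/to/slurm_wrap.sh", "mojo_pipe": "/path/to/mojo_pipe.py"},
#   "params": {"threads": "8", "ram": "32"}
# }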
def fastqc(fastqc_tool, sample, end1, end2, t):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2 + ' 2>> ' + loc
    log(loc, date_time() + fastqc_cmd + "\n")
    # call blocks until fastqc finishes, so a non-zero exit here means the run failed
    check = call(fastqc_cmd, shell=True)
    if check != 0:
        log(loc, date_time() + 'FastQC Failed for sample ' + sample + '\n')
        exit(1)
    return 0
def list_bam(project, project_dir, align_dir, sample):
    bam = project_dir + project + '/' + align_dir + '/' + sample + '/BAMS/' + sample + '.merged.transcriptome.bam'
    check_file = os.path.isfile(bam)
    if not check_file:
        sys.stderr.write(date_time() + 'Merged bam ' + bam + ' not found.\n')
    return check_file
def merge_filter_stats(project_dir, project, align_dir, lane_list):
    lanes = open(lane_list, 'r')
    print('BID\tread group\ttotal alignment pairs(ap)\t% unambiguous ap\t% ambiguous ap\t% total ap filtered'
          '\t% total ap kept')
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = project_dir + project + '/' + align_dir + '/' + bid + '/QC/' + bid + '_' + lane + '.runlog.txt'
            if os.path.isfile(cur):
                stat = open(cur, 'r')
                skip_lines(stat, 4)
                temp = []
                group = process_line(stat, 2)
                # may need to adjust or switch to regex in case a % sign is present
                unamb_pairs_pct = group[0][-1][:-1]
                amb_pairs_pct = group[1][-1][:-1]
                filt = str(100 - float(unamb_pairs_pct) - float(amb_pairs_pct))
                kept = str(float(unamb_pairs_pct) + float(amb_pairs_pct))
                temp.extend((group[0][6], unamb_pairs_pct, amb_pairs_pct, filt, kept))
                print(bid + '\t' + lane + '\t' + '\t'.join(temp))
                stat.close()
            else:
                sys.stderr.write(date_time() + 'Could not find ' + cur + ' SKIP!\n')
    lanes.close()
    return 0
def align_stats(sample):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.aln.log'
    log(loc, date_time() + "Converting to table summary format\n")
    fh = open(sample + '/' + 'align_summary.txt', 'r')
    fo = open(sample + '.align.txt', 'w')
    fo.write('Sample\tMean insert size estimate(10k reads)\tStd dev read insert size estimate(10k reads)'
             '\tStarting left reads\t% mapped\tmultimapped(mm)\tgt 20 mm\tStarting right reads\t% mapped\t% mm'
             '\tgt 20 mm\tOverall map rate\tAligned pairs\t% mm\t% discordant\t% concordant\n' + sample + '\t')
    # insert size mean and std dev come from the metrics histogram (skip 7 header lines, read the data line)
    fi = open(sample + '_subset.insert_metrics.hist')
    for i in range(0, 7, 1):
        skip = next(fi)
    stats = next(fi)
    fi.close()
    stat = stats.split('\t')
    fo.write('\t'.join([str(int(float(stat[4]))), str(int(float(stat[5])))]))
    # left reads
    next(fh)
    lstart = next(fh)
    m = re.search(r'(\d+)\n$', lstart)
    fo.write('\t' + m.group(1))
    pct = next(fh)
    m = re.search(r'\(\s*(\S+) of input\)\n', pct)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search(r'\(\s*(\S+)\).*\((\d+) have >20\)\n', mm)
    fo.write('\t' + m.group(1) + '\t' + m.group(2))
    # right reads
    next(fh)
    rstart = next(fh)
    m = re.search(r'(\d+)\n$', rstart)
    fo.write('\t' + m.group(1))
    pct = next(fh)
    m = re.search(r'\(\s*(\S+) of input\)\n', pct)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search(r'\(\s*(\S+)\).*\((\d+) have >20\)\n', mm)
    fo.write('\t' + m.group(1) + '\t' + m.group(2))
    ovr = next(fh)
    m = re.search(r'\s*(^\S+)', ovr)
    fo.write('\t' + m.group(1))
    # aligned pairs
    next(fh)
    aln = next(fh)
    m = re.search(r'(\d+)\n$', aln)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search(r'\(\s*(\S+)\) have', mm)
    fo.write('\t' + m.group(1))
    dc = next(fh)
    m = re.search(r'\(\s*(\S+)\) are', dc)
    fo.write('\t' + m.group(1))
    cc = next(fh)
    m = re.search(r'^\s*(\S+)', cc)
    fo.write('\t' + m.group(1) + '\n')
    fo.close()
    return 0
def flagstats(samtools_tool, sample):
    # test for sorted bam, otherwise use unsorted bam
    raw_bam = sample + ".srt.bam"
    res_file = sample + ".srt.bam.flagstats"
    if not os.path.isfile(raw_bam):
        raw_bam = sample + ".bam"
        res_file = sample + ".bam.flagstats"
    flagstats_cmd = samtools_tool + " flagstat " + raw_bam + " > " + res_file
    sys.stderr.write(date_time() + flagstats_cmd + "\n")
    Popen(flagstats_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
    flagstats_cmd = samtools_tool + " flagstat " + sample + ".rmdup.srt.bam > " + sample + ".rmdup.srt.bam.flagstats"
    sys.stderr.write(date_time() + flagstats_cmd + "\n")
    Popen(flagstats_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
def mojo_pipe(sample, config_file, fq1, fq2):
    (project_dir, project, align_dir, mojo, m_config, cores, mem, user, group) = parse_config(config_file)
    fq_dir = project_dir + project + '/' + align_dir + '/' + sample + '/TRIMMED_FQ/'
    out_dir = project_dir + project + '/' + align_dir + '/' + sample + '/MOJO/'
    os.mkdir(out_dir)
    loc = out_dir + sample + '.mojo_run.log'
    log(loc, date_time() + 'Made output directory ' + out_dir + '\n')
    log(loc, date_time() + 'Changing to fastq directory ' + fq_dir + '\n')
    os.chdir(fq_dir)
    run_mojo = mojo + ' --config ' + m_config + ' --sample_name ' + sample + ' --output_dir ' + out_dir + ' --fq1 ' \
               + fq1 + ' --fq2 ' + fq2 + ' --cores ' + cores + ' --mem ' + mem
    log(loc, date_time() + 'Running MOJO with command ' + run_mojo + '\n')
    try:
        subprocess.call(run_mojo, shell=True)
        log(loc, date_time() + 'MOJO complete! Setting acls\n')
        check = set_acls(out_dir, user, group)
        if check == 0:
            log(loc, date_time() + 'Setting acls complete. Pipeline complete!\n')
        else:
            log(loc, date_time() + 'Setting acls failed. Check logs!\n')
    except:
        sys.stderr.write(date_time() + 'MOJO failed! Check logs in ' + loc + '\n')
        return 1
    return 0
def pre_report(mode, bam, sample, pos, config_file, ref_mnt):
    (samtools_tool, samtools_ref) = parse_config(config_file)
    samtools_ref = ref_mnt + '/' + samtools_ref
    create_pos_ref(pos)
    sys.stderr.write(date_time() + 'Creating mpileup with samtools\n')
    pre_rpt_cmd = samtools_tool + ' mpileup -D -d 500000 -l pos_list.txt -f ' + samtools_ref
    out = ''
    if mode == 'b':
        out = 'batch_pileup.txt'
        pre_rpt_cmd += ' -b ' + bam + ' > ' + out
    else:
        out = sample + '_pileup.txt'
        pre_rpt_cmd += ' ' + bam + ' > ' + out
    sys.stderr.write(date_time() + pre_rpt_cmd + "\n")
    try:
        subprocess.call(pre_rpt_cmd, shell=True)
    except:
        sys.stderr.write(date_time() + 'Pileup failed\n')
    cov = {}
    index = {}
    sys.stderr.write(date_time() + 'Parsing mpileup output\n')
    samp_list = parse_pileup(out, pos, sample, mode, cov, index)
    sys.stderr.write(date_time() + 'Calculating means and standard deviations of base quality scores\n')
    calc_values(cov)
    sys.stderr.write(date_time() + 'Generating report\n')
    gen_report(cov, index, samp_list)
    sys.stderr.write(date_time() + 'Report complete\n')
    return 0
def downsample_pipe(bam_list, config_file, depth):
    (samtools, threads) = parse_config(config_file)
    for bam in open(bam_list):
        sys.stderr.write(date_time() + 'Setting up for ' + bam)
        bam = bam.rstrip('\n')
        bam_root = bam.replace('.bam', '')
        bam_dir = os.path.dirname(bam)
        # clean up sample name, sloppy i know!
        qc_root = bam_root.replace('BAMS', 'QC', 1)
        qc_root = qc_root.replace('.srt', '', 1)
        qc_root = qc_root.replace('.Aligned.toTranscriptome.out', '', 1)
        sys.stderr.write(date_time() + 'Calculating downsample fraction\n')
        frac = get_from_depth(qc_root, depth)
        # submit to job queue
        downsample_bam(samtools, bam, frac, bam_dir, threads)
        # cmd = ' '.join(('sbatch', '-c', threads, '--oversubscribe', downsample_bam, '-b ', bam, '-f', frac, '-o ',
        #                 bam_dir, '-t', threads, '-s', samtools))
        sys.stderr.write(date_time() + 'Submitting to queue ' + bam + '\n')
def picard_rmdup(java_tool, picard_tool, picard_tmp, sample, log_dir, ram):
    loc = log_dir + sample + ".picard.rmdup.pe.log"
    picard_rmdup_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " MarkDuplicates CREATE_INDEX=true " \
                       "TMP_DIR=" + picard_tmp + " REMOVE_DUPLICATES=true ASSUME_SORTED=true " \
                       "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 INPUT=" + sample + ".srt.bam OUTPUT=" + sample \
                       + ".rmdup.srt.bam METRICS_FILE=" + sample + ".rmdup.srt.metrics VALIDATION_STRINGENCY=LENIENT " \
                       "> " + loc + " 2>&1"
    log(loc, date_time() + picard_rmdup_cmd + "\n")
    call(picard_rmdup_cmd, shell=True)
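# Usage sketch (hypothetical paths): dedupe SAMPLE-001.srt.bam in the current dir.
# picard_rmdup('/usr/bin/java', '/opt/picard.jar', 'picard_tmp', 'SAMPLE-001', 'LOGS/', '8')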
def get_bam_name(bnid, project_dir, project, align_dir):
    bam_dir = project_dir + project + '/' + align_dir + '/' + bnid + '/BAM/'
    bam = bam_dir + bnid + '.merged.final.bam'
    bai = bam_dir + bnid + '.merged.final.bai'
    f = 0
    if not os.path.isfile(bam):
        sys.stderr.write(date_time() + 'Bam not found in ' + bam_dir + '\n')
        f = 1
        return f, bam, bai
    # accept either picard-style .bai or samtools-style .bam.bai index names
    if not os.path.isfile(bai):
        bai = bam_dir + bnid + '.merged.final.bam.bai'
        if not os.path.isfile(bai):
            sys.stderr.write(date_time() + 'Bam index file for ' + bnid + ' not found! Please index first\n')
            f = 1
            return f, bam, bai
    return f, bam, bai
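# Usage sketch (hypothetical layout): expects <project_dir><project>/<align_dir>/<bnid>/BAM/.
# flag, bam, bai = get_bam_name('2016-1234', '/cephfs/PROJECTS/', 'PanCan', 'ALIGN')
# if flag == 0:
#     print('Using', bam, 'indexed by', bai)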