def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    #from genomicode import parallel
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    # TODO: Merge with merge_variants_snp.py.
    #CALLERS = [
    #    "gatk", "platypus", "varscan",
    #    ]
    vcf_paths = [x.identifier for x in antecedents]
    nodes = [x.data for x in antecedents]
    CALLERS = [x.attributes["caller"] for x in nodes]
    assert len(CALLERS) == len(vcf_paths)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
    jobs = []
    for i, caller in enumerate(CALLERS):
        inpath = vcf_paths[i]
        caller_h = hashlib.hash_var(caller)
        vcf_files = filelib.list_files_in_path(
            inpath, endswith=".vcf", toplevel_only=True)
        for file_ in vcf_files:
            # IN_FILE:   <inpath>/<sample>.vcf
            # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
            p, sample, e = mlib.splitpath(file_)
            assert e == ".vcf"
            out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
            out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)
            x = filelib.GenericObject(
                sample=sample,
                caller=caller,
                out_vcf_path=out_vcf_path,
                in_vcf_file=file_,
                out_vcf_file=out_vcf_file)
            jobs.append(x)

    # Make sure the same samples are found in all callers.
    caller2samples = {}
    for j in jobs:
        if j.caller not in caller2samples:
            caller2samples[j.caller] = []
        caller2samples[j.caller].append(j.sample)
    comp_samples = None
    for caller, samples in caller2samples.iteritems():
        samples = sorted(samples)
        if comp_samples is None:
            comp_samples = samples
        assert comp_samples == samples, "%s %s" % (comp_samples, samples)

    for j in jobs:
        filelib.safe_mkdir(j.out_vcf_path)
        os.symlink(j.in_vcf_file, j.out_vcf_file)
    return metadata
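
# hashlib.hash_var (from genomicode) is used above to turn an arbitrary
# caller name into a string that is safe to use in a filename.  A minimal
# sketch of the idea, assuming it simply replaces unsafe characters with
# underscores (the actual implementation may differ):
def hash_var_sketch(name):
    import re
    # Keep letters, digits, and underscores; replace everything else.
    x = re.sub(r"[^a-zA-Z0-9_]", "_", name)
    # Identifiers should not start with a digit.
    if x and x[0].isdigit():
        x = "X" + x
    return x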
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, gene_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    assert bam_filenames, "No bam files found."
    metadata = {}

    # Make output filenames.
    p, r, e = mlib.splitpath(gtf_file)
    bed_file = "%s.bed" % r

    # Make bed file.
    alignlib.gtf_to_bed(gtf_file, bed_file)
    #bed_file = "/data/jchang/biocore/gtf02.txt"

    # Figure out the orientation.
    x = get_paired_stranded_rseqc(bed_file, bam_filenames[0])
    single_or_paired, stranded, frac_failed, frac_first, frac_second = x

    x = mlib.Stranded(
        single_or_paired, stranded, frac_failed, frac_first, frac_second)
    mlib.write_stranded(x, outfile)
    return metadata
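
# get_paired_stranded_rseqc is not shown in this module.  A minimal sketch
# of one way to implement it with RSeQC's infer_experiment.py, assuming
# that script is on the path.  The parsing of the fractions, the 0.7
# cutoff, and the mapping of RSeQC's patterns onto first/second strand are
# assumptions here:
def get_paired_stranded_rseqc_sketch(bed_file, bam_file):
    import subprocess
    # RSeQC prints lines like:
    #   This is PairEnd Data
    #   Fraction of reads failed to determine: 0.0172
    #   Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
    #   Fraction of reads explained by "1+-,1-+,2++,2--": 0.4925
    output = subprocess.check_output(
        ["infer_experiment.py", "-r", bed_file, "-i", bam_file])
    single_or_paired = None
    frac_failed = frac_first = frac_second = None
    for line in output.splitlines():
        if line.startswith("This is PairEnd"):
            single_or_paired = "paired"
        elif line.startswith("This is SingleEnd"):
            single_or_paired = "single"
        elif line.startswith("Fraction of reads failed"):
            frac_failed = float(line.split(":")[-1])
        elif line.startswith("Fraction of reads explained by"):
            frac = float(line.split(":")[-1])
            # dUTP-style ("firststrand") protocols show up as the
            # "1+-,1-+,2++,2--" (or single-end "+-,-+") pattern.
            if "1+-" in line or '"+-' in line:
                frac_first = frac
            else:
                frac_second = frac
    assert single_or_paired and frac_failed is not None
    # Call it stranded only if one orientation clearly dominates.
    stranded = "unstranded"
    if frac_first is not None and frac_first >= 0.7:
        stranded = "firststrand"
    elif frac_second is not None and frac_second >= 0.7:
        stranded = "secondstrand"
    return single_or_paired, stranded, frac_failed, frac_first, frac_second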
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    jobs = []
    for bam_filename in bam_filenames:
        x = count_duplicates, (bam_filename,), {}
        jobs.append(x)
    results = parallel.pyfun(jobs, num_procs=num_cores)
    metadata["num_cores"] = num_cores
    assert len(results) == len(bam_filenames)

    handle = open(outfile, 'w')
    header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
    print >>handle, "\t".join(header)
    for i in range(len(bam_filenames)):
        x, sample, x = mlib.splitpath(bam_filenames[i])
        total_reads, dup_reads = results[i]
        perc_dup = float(dup_reads) / total_reads * 100
        perc_dup = "%.2f" % perc_dup
        x = sample, dup_reads, total_reads, perc_dup
        print >>handle, "\t".join(map(str, x))
    handle.close()
    return metadata
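
# count_duplicates is defined elsewhere in this module.  A minimal sketch
# using samtools flag counts, assuming duplicates are marked with the
# standard 0x400 FLAG bit (the exact filtering flags are an assumption):
def count_duplicates_sketch(bam_filename):
    import subprocess
    # Total reads in the file.
    x = subprocess.check_output(["samtools", "view", "-c", bam_filename])
    total_reads = int(x.strip())
    # Reads flagged as PCR/optical duplicates (FLAG bit 0x400).
    x = subprocess.check_output(
        ["samtools", "view", "-c", "-f", "0x400", bam_filename])
    dup_reads = int(x.strip())
    return total_reads, dup_reads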
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib
    import merge_vcf_folder

    vcffolders_node = antecedents
    filelib.safe_mkdir(out_path)
    metadata = {}

    x = os.listdir(vcffolders_node.identifier)
    x = [x for x in x if x.endswith(".vcf")]
    assert x, "No VCF folders found: %s" % vcffolders_node.identifier
    x = [os.path.join(vcffolders_node.identifier, x) for x in x]
    vcf_folders = x

    jobs = []
    for folder in vcf_folders:
        path, root, ext = mlib.splitpath(folder)
        assert ext == ".vcf"
        caller = root
        vcf_filenames = filelib.list_files_in_path(
            folder, endswith=".vcf", toplevel_only=True)
        assert vcf_filenames, "No .vcf files: %s" % folder
        out_filename = os.path.join(out_path, "%s.vcf" % root)
        tmp_path = "%s.indexed.vcf" % caller
        x = filelib.GenericObject(
            caller=caller,
            vcf_filenames=vcf_filenames,
            out_filename=out_filename,
            tmp_path=tmp_path)
        jobs.append(x)

    for j in jobs:
        m = merge_vcf_folder.merge_vcf_files(
            j.vcf_filenames, j.out_filename, num_cores, j.tmp_path)
        if "commands" not in metadata:
            metadata["commands"] = []
        metadata["commands"].extend(m["commands"])

    x = [x.out_filename for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_vcf_node, bf_vcf_node = antecedents
    in_vcf_filenames = filelib.list_files_in_path(
        in_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    bf_vcf_filenames = filelib.list_files_in_path(
        bf_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    filelib.safe_mkdir(out_path)
    metadata = {}

    common_only = mlib.get_user_option(
        user_options, "backfill_common_only", allowed_values=["no", "yes"],
        not_empty=True)

    in_vcf_samples = [mlib.splitpath(x)[1] for x in in_vcf_filenames]
    bf_vcf_samples = [mlib.splitpath(x)[1] for x in bf_vcf_filenames]

    # Make sure there are no duplicate sample names.
    x1 = {}.fromkeys(in_vcf_samples).keys()
    x2 = {}.fromkeys(bf_vcf_samples).keys()
    assert len(in_vcf_samples) == len(x1), "Duplicate samples"
    assert len(bf_vcf_samples) == len(x2), "Duplicate samples"

    # Find the samples.
    common = [x for x in in_vcf_samples if x in bf_vcf_samples]
    in_only = [x for x in in_vcf_samples if x not in common]
    bf_only = [x for x in bf_vcf_samples if x not in common]
    assert common, "No common samples."

    pretty_in = parselib.pretty_list(in_only, max_items=5)
    pretty_bf = parselib.pretty_list(bf_only, max_items=5)
    if common_only == "no":
        assert not (in_only and bf_only), \
            "Extra samples in both sets:\n%s\n%s" % (pretty_in, pretty_bf)
        assert not in_only, \
            "Target VCF file has extra samples: %s" % pretty_in
        assert not bf_only, \
            "Source VCF file has extra samples: %s." % pretty_bf
    SAMPLES = common

    # list of (sample, in_vcf_filename, bf_vcf_filename, out_filename)
    jobs = []
    for sample in SAMPLES:
        assert sample in in_vcf_samples
        assert sample in bf_vcf_samples
        i = in_vcf_samples.index(sample)
        j = bf_vcf_samples.index(sample)
        in_filename = in_vcf_filenames[i]
        bf_filename = bf_vcf_filenames[j]
        out_filename = os.path.join(out_path, "%s.vcf" % sample)
        x = sample, in_filename, bf_filename, out_filename
        jobs.append(x)

    jobs2 = []
    for x in jobs:
        sample, in_filename, bf_filename, out_filename = x
        fn = backfill_vcf
        args = in_filename, bf_filename, out_filename
        keywds = {}
        jobs2.append((fn, args, keywds))
    #num_cores = 1
    parallel.pyfun(jobs2, num_procs=num_cores)
    metadata["num_cores"] = num_cores
    return metadata
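
# backfill_vcf is defined elsewhere in this module.  A minimal sketch of
# the idea -- append records from the backfill VCF at coordinates the
# target VCF does not already call.  Real backfilling (merging FORMAT
# fields, keeping the output sorted) is more involved; this only shows the
# coordinate bookkeeping:
def backfill_vcf_sketch(in_filename, bf_filename, out_filename):
    seen = {}  # (chrom, pos, ref, alt) -> 1
    out = open(out_filename, 'w')
    for line in open(in_filename):
        out.write(line)
        if line.startswith("#"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        seen[(cols[0], cols[1], cols[3], cols[4])] = 1
    for line in open(bf_filename):
        if line.startswith("#"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        if (cols[0], cols[1], cols[3], cols[4]) in seen:
            continue
        out.write(line)
    out.close()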
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # Figure out whether the user wants SNPs or INDELs.
    #assert "vartype" in out_attributes
    #vartype = out_attributes["vartype"]
    #assert vartype in ["all", "snp", "indel"]

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (cancer_sample, normal_bamfile, cancer_bamfile,
    #   orig_outfile, fix_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        orig_outfile = opj(out_path, "%s.raw" % sample)
        fix_outfile = opj(out_path, "%s.vcf" % sample)
        #filter_outfile = opj(out_path, "%s.vcf" % sample)
        x = filelib.GenericObject(
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            orig_outfile=orig_outfile,
            fix_outfile=fix_outfile)
        jobs.append(x)

    # python /usr/local/museq/classify.py \
    #   normal:test31/normal.bam tumour:test31/tumor.bam \
    #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   model:/usr/local/museq/model_v4.1.2.npz \
    #   --config /usr/local/museq/metadata.config \
    #   -o test51.vcf
    museq = mlib.get_config("museq", assert_exists=True)
    classify_py = opj(museq, "classify.py")
    model_file = opj(museq, "model_v4.1.2.npz")
    config_file = opj(museq, "metadata.config")
    filelib.assert_exists_nz(classify_py)
    filelib.assert_exists_nz(model_file)
    filelib.assert_exists_nz(config_file)

    # museq's config file generates a broken VCF file.  Fix it.
    fixed_config_file = "fixed.config"
    fix_config_file(config_file, fixed_config_file)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        x = [
            "python",   # should allow user to specify python
            sq(classify_py),
            sq("normal:%s" % j.normal_bamfile),
            sq("tumour:%s" % j.cancer_bamfile),
            sq("reference:%s" % ref.fasta_file_full),
            sq("model:%s" % model_file),
            "--config", sq(fixed_config_file),
            "-o", sq(j.orig_outfile),
            ]
        x = " ".join(map(str, x))
        commands.append(x)

    # Not sure how much RAM this takes.  On Thunderbolts test,
    # took < 1 Gb.
    nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # muSeq produces non-standard VCF files.  Fix this so it
    # will work with other programs downstream.
    for j in jobs:
        fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

    # Filter each of the VCF files.
    #for j in jobs:
    #    filter_by_vartype(vartype, j.fix_outfile, j.vcf_outfile)
    #metadata["filter"] = vartype

    x = [j.fix_outfile for j in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    dbsnp_file = mlib.get_user_option(
        user_options, "muse_dbsnp_vcf", not_empty=True, check_file=True)

    # Make sure dbsnp_file is compressed and indexed.
    assert dbsnp_file.endswith(".vcf.gz"), \
        "muse_dbsnp_vcf must be bgzip compressed."
    x = "%s.tbi" % dbsnp_file
    assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
    #   muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
    #   log_outfile1, log_outfile2)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
        muse_call_file = "%s.MuSE.txt" % muse_call_stem
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
        vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
        log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
        log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2
        jobs.append(x)

    # Generate the commands.
    # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   bam04/196B-MG.bam bam04/PIM001_G.bam
    # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
    #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz
    MuSE = mlib.findbin("muse")

    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x
        x = [
            sq(MuSE), "call",
            "-O", muse_call_stem,
            "-f", sq(ref.fasta_file_full),
            cancer_bamfile,
            normal_bamfile,
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, log_outfile1)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[8] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the call files are created and not empty.
    call_files = [x[5] for x in jobs]
    filelib.assert_exists_nz_many(call_files)

    # Run the "sump" step.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x
        x = [
            sq(MuSE), "sump",
            "-I", sq(muse_call_file),
            ]
        assert wgs_or_wes in ["wgs", "wes"]
        if wgs_or_wes == "wgs":
            x += ["-G"]
        else:
            x += ["-E"]
        x += [
            "-O", sq(raw_vcf_outfile),
            "-D", sq(dbsnp_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, log_outfile2)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = metadata["commands"] + commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[9] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the raw files are created and not empty.
    vcf_files = [x[6] for x in jobs]
    filelib.assert_exists_nz_many(vcf_files)

    # Fix the files.
    commands = []  # Should be python commands.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x
        args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
        x = alignlib.clean_muse_vcf, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)

    # Delete the log_outfiles if empty.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2 = x
        if os.path.exists(log_outfile1):
            os.unlink(log_outfile1)
        if os.path.exists(log_outfile2):
            os.unlink(log_outfile2)

    # Make sure output VCF files exist.
    x = [x[7] for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            vcf_outfile=vcf_outfile,
            log_outfile=log_outfile)
        jobs.append(x)

    # java -jar GenomeAnalysisTK.jar \
    #   -T MuTect2 \
    #   -R reference.fasta \
    #   -I:tumor tumor.bam \
    #   -I:normal normal.bam \
    #   [--dbsnp dbSNP.vcf] \
    #   [--cosmic COSMIC.vcf] \
    #   [-L targets.interval_list] \
    #   -o output.vcf

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        UNHASHABLE = [
            ("I:normal", sq(j.normal_bamfile)),
            ("I:tumor", sq(j.cancer_bamfile)),
            # --dbsnp and --cosmic use two dashes, for some
            # reason.  Since make_GATK_command only uses one dash,
            # add one manually.
            ("-dbsnp", sq(dbsnp_file)),
            ("-cosmic", sq(cosmic_file)),
            ]
        x = alignlib.make_GATK_command(
            T="MuTect2",
            R=sq(ref.fasta_file_full),
            L=sq(interval_node.identifier),
            o=sq(j.vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        x = "%s >& %s" % (x, j.log_outfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, j in enumerate(jobs):
        # Pull out the error lines.
        x = [x for x in open(j.log_outfile)]
        x = [x for x in x if x.startswith("##### ERROR")]
        x = "".join(x)
        msg = "MuTect2 error [%s]:\n%s\n%s" % (
            j.cancer_sample, commands[i], x)
        assert not x, msg

    # Make sure output VCF files exist.
    x = [x.vcf_outfile for x in jobs]
    filelib.assert_exists_many(x)

    # MuTect2 names the samples "NORMAL" and "TUMOR".  Replace
    # them with the actual names.
    for j in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            j.vcf_outfile, j.normal_sample, j.cancer_sample)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import parselib
    from Betsy import module_utils as mlib

    mpileup_node, nc_node = antecedents
    mpileup_filenames = filelib.list_files_in_path(
        mpileup_node.identifier, endswith=".pileup")
    assert mpileup_filenames, "No .pileup files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    #ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Figure out whether the purpose is to get coverage.  Change
    # the parameters if it is.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]

    sample2pufile = {}  # sample -> mpileup filename
    for filename in mpileup_filenames:
        path, sample, ext = mlib.splitpath(filename)
        sample2pufile[sample] = filename

    # Make sure files exist for all the samples.
    all_samples = []
    for (normal_sample, cancer_sample) in nc_match:
        if normal_sample not in all_samples:
            all_samples.append(normal_sample)
        if cancer_sample not in all_samples:
            all_samples.append(cancer_sample)
    missing = [x for x in all_samples if x not in sample2pufile]
    x = parselib.pretty_list(missing, max_items=5)
    assert not missing, "Missing BAM files for samples: %s" % x

    # list of (sample, normal_sample, cancer_sample,
    #   normal_pileup, cancer_pileup,
    #   tmp1_normal, tmp1_cancer, log_filename, out_filename)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_pileup = sample2pufile[normal_sample]
        cancer_pileup = sample2pufile[cancer_sample]
        p, sample, ext = mlib.splitpath(cancer_pileup)
        tmp1_normal = opj(out_path, "%s.normal.tmp1" % sample)
        tmp1_cancer = opj(out_path, "%s.cancer.tmp1" % sample)
        log_filename = opj(out_path, "%s.log" % sample)
        out_filename = opj(out_path, "%s.vcf" % sample)
        x = sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename
        jobs.append(x)

    # VarScan will generate a "Parsing Exception" if there are 0
    # reads in a location.  Will be either "0" or blank.  Filter
    # those lines out.
    sq = parallel.quote
    commands = []
    for x in jobs:
        sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename = x
        x1 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (
            normal_pileup, tmp1_normal)
        x2 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (
            cancer_pileup, tmp1_cancer)
        commands.extend([x1, x2])
    parallel.pshell(commands, max_procs=num_cores)
    x = [x[5] for x in jobs] + [x[6] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # java -jar VarScan.jar somatic [normal_pileup] [tumor_pileup]
    #   [output] OPTIONS
    varscan = mlib.findbin("varscan_jar")

    # Use parameters from:
    # Using VarScan 2 for Germline Variant Calling and Somatic
    # Mutation Detection

    # Make a list of commands.
    commands = []
    for x in jobs:
        sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename = x
        x = [
            "java", "-jar", sq(varscan), "somatic",
            sq(tmp1_normal), sq(tmp1_cancer), sample,
            "--min-coverage", 10,
            "--min-avg-qual", 15,
            "--min-normal-coverage", 10,
            "--min-tumor-coverage", 10,
            "--min-var-freq", 0.05,
            "--somatic-p-value", 0.05,
            "--output-vcf", 1,
            ]
        x = " ".join(map(str, x))
        x = "%s >& %s" % (x, log_filename)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    x = [x[7] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Files in out_path can get very big.  Clean them up.
    # <sample>.normal.tmp1    Very big (10's Gb).
    # <sample>.cancer.tmp1    Very big (10's to 100 Gb).
    for x in jobs:
        sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename = x
        if os.path.exists(tmp1_normal):
            os.unlink(tmp1_normal)
        if os.path.exists(tmp1_cancer):
            os.unlink(tmp1_cancer)

    # Copy the final file to the right place.
    for x in jobs:
        sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename = x
        # Will be written in current directory.
        varscan_out = "%s.snp.vcf" % sample
        if vartype == "indel":
            varscan_out = "%s.indel.vcf" % sample
        filelib.assert_exists(varscan_out)
        shutil.copy2(varscan_out, out_filename)

    # VarScan names the samples "NORMAL" and "TUMOR".  Replace
    # them with the actual names.
    for x in jobs:
        sample, normal_sample, cancer_sample, \
            normal_pileup, cancer_pileup, \
            tmp1_normal, tmp1_cancer, log_filename, out_filename = x
        _fix_normal_cancer_names(out_filename, normal_sample, cancer_sample)
    return metadata
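
# _fix_normal_cancer_names is defined elsewhere in this module.  A minimal
# sketch of the idea -- rewrite the sample columns of the VCF "#CHROM"
# header line, which VarScan (and several other callers) hard-code as
# NORMAL and TUMOR:
def _fix_normal_cancer_names_sketch(vcf_filename, normal_sample,
                                    cancer_sample):
    lines = open(vcf_filename).readlines()
    for i, line in enumerate(lines):
        if not line.startswith("#CHROM"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        cols = [normal_sample if x == "NORMAL" else x for x in cols]
        cols = [cancer_sample if x == "TUMOR" else x for x in cols]
        lines[i] = "\t".join(cols) + "\n"
        break
    open(vcf_filename, 'w').writelines(lines)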
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #   cancer_bamfile, vcf_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile
        jobs.append(x)

    # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   test31/tumor.bam test31/normal.bam test41.vcf
    somaticsniper = mlib.get_config("somaticsniper", which_assert_file=True)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile = x
        x = [
            sq(somaticsniper),
            "-q", 1,
            "-Q", 15,
            "-G",
            "-L",
            "-F", "vcf",
            "-f", sq(ref.fasta_file_full),
            sq(cancer_bamfile),
            sq(normal_bamfile),
            sq(vcf_outfile),
            ]
        x = " ".join(map(str, x))
        commands.append(x)

    # Not sure how much RAM this takes.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # SomaticSniper names the samples "NORMAL" and "TUMOR".
    # Replace them with the actual names.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile = x
        call_somatic_varscan._fix_normal_cancer_names(
            vcf_outfile, normal_sample, cancer_sample)

    x = [x[-1] for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import parallel
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_files, "I could not find any FASTQ files."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

    # Figure out whether to align to genome or transcriptome.
    x = out_attributes["align_to"]
    assert x in ["genome", "transcriptome"]
    align_to_genome = (x == "genome")

    # RSEM makes files:
    # <sample_name>.genome.bam
    # <sample_name>.transcript.bam
    # <sample_name>.genes.results
    # <sample_name>.isoforms.results
    # <sample_name>.stat
    #
    # Does not work right if there is a space in the sample name.
    # Therefore, give a hashed sample name, and then re-name
    # later.

    # Make a list of the jobs to run.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        sample_h = hashlib.hash_var(sample)
        x1, x2, x3 = mlib.splitpath(pair1)
        x = "%s%s" % (hashlib.hash_var(x2), x3)
        pair1_h = os.path.join(out_path, x)
        pair2_h = None
        if pair2:
            x1, x2, x3 = mlib.splitpath(pair2)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair2_h = os.path.join(out_path, x)
        results_filename = os.path.join(out_path, "%s.genes.results" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample,
            sample_h=sample_h,
            pair1=pair1,
            pair2=pair2,
            pair1_h=pair1_h,
            pair2_h=pair2_h,
            results_filename=results_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Make sure hashed samples are unique.
    seen = {}
    for j in jobs:
        assert j.sample_h not in seen, \
            "Dup (%d): %s" % (len(jobs), j.sample_h)
        assert j.pair1_h not in seen
        assert not j.pair2_h or j.pair2_h not in seen
        seen[j.sample_h] = 1
        seen[j.pair1_h] = 1
        if j.pair2_h:
            seen[j.pair2_h] = 1

    # Symlink the fastq files.
    for j in jobs:
        os.symlink(j.pair1, j.pair1_h)
        if j.pair2:
            os.symlink(j.pair2, j.pair2_h)

    s2fprob = {
        "unstranded": None,
        "firststrand": 0.0,
        "secondstrand": 1.0,
        }
    assert stranded.stranded in s2fprob, \
        "Unknown stranded: %s" % stranded.stranded
    forward_prob = s2fprob[stranded.stranded]

    # How much memory for bowtie.  May need to increase this if
    # there are lots of memory warnings in the log files:
    #   Warning: Exhausted best-first chunk memory for read
    #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
    #   (patid 2076693); skipping read
    # Default is 64.
    # Seems like too high a value can cause problems.
    #chunkmbs = 4*1024   # Generates warnings.
    chunkmbs = 512

    # Get lots of warnings with bowtie:
    # Warning: Detected a read pair whose two mates have different names
    # Use STAR aligner instead.
    use_STAR = True

    sq = parallel.quote
    commands = []
    for j in jobs:
        # Debug: If the results file exists, don't run it again.
        if filelib.exists_nz(j.results_filename) and \
               filelib.exists(j.log_filename):
            continue
        # If using the STAR aligner, then the most memory efficient
        # way is to let STAR take care of the multiprocessing.
        nc = max(1, num_cores / len(jobs))
        if use_STAR:
            nc = num_cores
        keywds = {}
        if use_STAR:
            keywds["align_with_star"] = True
        else:
            keywds["align_with_bowtie2"] = True
        x = alignlib.make_rsem_command(
            ref.fasta_file_full, j.sample_h, j.pair1_h,
            fastq_file2=j.pair2_h, forward_prob=forward_prob,
            output_genome_bam=align_to_genome, bowtie_chunkmbs=chunkmbs,
            num_threads=nc, **keywds)
        x = "%s >& %s" % (x, sq(j.log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # Need to run in out_path.  Otherwise, files will be everywhere.
    nc = num_cores
    if use_STAR:
        nc = 1
    parallel.pshell(commands, max_procs=nc, path=out_path)

    # Rename the hashed sample names back to the original unhashed
    # ones.
    files = os.listdir(out_path)
    rename_files = []  # list of (src, dst)
    for j in jobs:
        if j.sample == j.sample_h:
            continue
        for f in files:
            if not f.startswith(j.sample_h):
                continue
            src = os.path.join(out_path, f)
            x = j.sample + f[len(j.sample_h):]
            dst = os.path.join(out_path, x)
            rename_files.append((src, dst))
    for src, dst in rename_files:
        filelib.assert_exists(src)
        os.rename(src, dst)

    # Delete the symlinked fastq files.
    for j in jobs:
        filelib.safe_unlink(j.pair1_h)
        if j.pair2_h:
            filelib.safe_unlink(j.pair2_h)

    # Make sure the analysis completed successfully.
    x1 = [x.results_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    svm_node, vcf_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", not_empty=True)
    metadata = {}

    # 1.  vcf_filenames
    # 2.  parsed_snpeff_files   one for each VCF file
    # 3.  merged_snpeff_file    just one file
    # 4.  clean_snpeff_file     clean up the annotations to final form
    # 5.  outfile
    merged_snpeff_file = "snpeff.merged.txt"
    cleaned_snpeff_file = "snpeff.clean.txt"

    jobs = []
    for vcf_filename in vcf_filenames:
        path, caller, ext = mlib.splitpath(vcf_filename)
        parsed_snpeff_file = "%s.parsed.txt" % caller
        j = filelib.GenericObject(
            caller=caller,
            vcf_filename=vcf_filename,
            parsed_snpeff_file=parsed_snpeff_file,
            )
        jobs.append(j)

    # Parse each of the snpeff files.
    commands = []
    for j in jobs:
        args = j.vcf_filename, j.parsed_snpeff_file
        # Debugging.  If this file exists, do not generate it
        # again.
        if os.path.exists(j.parsed_snpeff_file):
            continue
        x = parse_snpeff_file, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    # Merge the parsed files.
    x = [j.parsed_snpeff_file for j in jobs]
    x = [x for x in x if os.path.exists(x)]
    parsed_files = x
    # For debugging, don't regenerate if I don't need to.
    if not filelib.exists_nz(merged_snpeff_file):
        merge_parsed_files(parsed_files, merged_snpeff_file)

    # Clean up the snpEff file.  Coordinates should be unique.
    # For debugging, don't regenerate if I don't need to.
    if not filelib.exists_nz(cleaned_snpeff_file):
        clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

    # Merge the snpEff annotations into the SimpleVariantMatrix.
    add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)
    return metadata
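
# parse_snpeff_file is defined elsewhere in this module.  A minimal sketch
# of pulling apart snpEff "ANN=" annotations into a table.  The subfield
# layout follows the snpEff ANN specification (Allele | Annotation |
# Impact | Gene_Name | ...); exactly which columns BETSY keeps is an
# assumption:
def parse_snpeff_file_sketch(vcf_filename, out_filename):
    handle = open(out_filename, 'w')
    header = ["Chrom", "Pos", "Ref", "Alt", "Allele", "Annotation",
              "Impact", "Gene"]
    print >>handle, "\t".join(header)
    for line in open(vcf_filename):
        if line.startswith("#"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        chrom, pos, ref, alt, info = \
            cols[0], cols[1], cols[3], cols[4], cols[7]
        # The ANN INFO field holds comma-separated annotations, each
        # with pipe-delimited subfields.
        anns = [x[4:] for x in info.split(";") if x.startswith("ANN=")]
        for ann in anns:
            for entry in ann.split(","):
                f = entry.split("|")
                x = [chrom, pos, ref, alt, f[0], f[1], f[2], f[3]]
                print >>handle, "\t".join(x)
    handle.close()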
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out Strelka version.

    skip_depth_filter = False
    x = mlib.get_user_option(
        user_options, "strelka_skip_depth_filter",
        allowed_values=["no", "yes"], not_empty=True)
    if x == "yes":
        skip_depth_filter = True

    assert "vartype" in out_attributes, "Missing attribute: vartype"
    x = out_attributes["vartype"]
    assert x in ["snp", "indel"]
    vartype = x

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # Make sure each cancer sample is unique.  Otherwise, the
    # analysis directories will conflict.
    tumor_samples = [x[-1] for x in nc_match]
    dups = {}
    for i in range(1, len(tumor_samples)):
        if tumor_samples[i] in tumor_samples[:i]:
            dups[tumor_samples[i]] = 1
    assert not dups, "NormalCancerFile contains multiple instances of: %s" \
        % ", ".join(sorted(dups))

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #   cancer_bamfile, config_file, analysis_path)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        config_file = opj(out_path, "config.%s.ini" % cancer_sample)
        analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path
        jobs.append(x)

    # Make each of the config files.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        _make_config_file(config_file, skip_depth_filter=skip_depth_filter)

    # Make the analysis directories.
    jobs2 = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        fn = _make_analysis_directory
        args = (analysis_path, config_file, ref.fasta_file_full,
                normal_bamfile, cancer_bamfile)
        keywds = None
        jobs2.append((fn, args, keywds))
    parallel.pyfun(jobs2, num_procs=num_cores)

    # Run the analysis.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        cmd = "make -j %d" % num_cores
        parallel.sshell(cmd, path=analysis_path)
    metadata["num_cores"] = num_cores

    # Make sure files exist.
    x = [x[-1] for x in jobs]
    x = [os.path.join(x, "results", "all.somatic.snvs.vcf") for x in x]
    filelib.assert_exists_nz_many(x)

    # Clean the VCF files and save into the out_path.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        # <analysis_path>/results/all.somatic.snvs.vcf
        # <analysis_path>/results/all.somatic.indels.vcf
        vartype2file = {
            "snp": "all.somatic.snvs.vcf",
            "indel": "all.somatic.indels.vcf",
            }
        assert vartype in vartype2file
        x = vartype2file[vartype]
        src_file = os.path.join(analysis_path, "results", x)
        dst_file = os.path.join(out_path, "%s.vcf" % cancer_sample)
        alignlib.clean_strelka_vcf(
            normal_sample, cancer_sample, src_file, dst_file)

    #metadata["commands"] = commands
    return metadata
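
# _make_config_file is defined elsewhere in this module.  A minimal
# sketch, assuming a Strelka v1-style ini where the depth filter is
# toggled with isSkipDepthFilters; the option name follows Strelka's
# example config, but the full set of parameters a real config needs is
# not shown:
def _make_config_file_sketch(config_filename, skip_depth_filter=False):
    handle = open(config_filename, 'w')
    print >>handle, "[user]"
    # 1 skips the depth filter (recommended for exomes), 0 keeps it.
    print >>handle, "isSkipDepthFilters = %d" % int(skip_depth_filter)
    handle.close()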
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sai_node, orient_node, sample_node, reference_node = \
        antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    sai_path = sai_node.identifier
    assert filelib.dir_exists(sai_path)
    orient = mlib.read_orientation(orient_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Technically, doesn't need the SampleGroupFile, since that's
    # already reflected in the sai data.  But better, because the
    # sai data might not always be generated by BETSY.

    # Find the sai files.
    sai_filenames = filelib.list_files_in_path(
        sai_path, endswith=".sai", case_insensitive=True)
    assert sai_filenames, "No .sai files."

    bwa = mlib.findbin("bwa")

    # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
    # bwa sampe -f <output.sam> <reference.fa> <input_1.sai> <input_2.sai>
    #   <input_1.fq> <input_2.fq>

    # list of (pair1.fq, pair1.sai, pair2.fq, pair2.sai, output.sam)
    # all full paths
    jobs = []
    for x in fastq_files:
        sample, pair1_fq, pair2_fq = x

        # The sai file should be in the format:
        # <sai_path>/<sample>.sai      Single end read
        # <sai_path>/<sample>_1.sai    Paired end read
        # <sai_path>/<sample>_2.sai    Paired end read
        # Look for pair1_sai and pair2_sai.
        pair1_sai = pair2_sai = None
        for sai_filename in sai_filenames:
            p, s, e = mlib.splitpath(sai_filename)
            assert e == ".sai"
            if s == sample:
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_1" % (sample):
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_2" % (sample):
                assert not pair2_sai
                pair2_sai = sai_filename
        assert pair1_sai, "Missing .sai file: %s" % sample
        if pair2_fq:
            assert pair2_sai, "Missing .sai file 2: %s" % sample
        if pair2_sai:
            assert pair2_fq, "Missing .fq file 2: %s" % sample

        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)

        x = sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
            sam_filename, log_filename
        jobs.append(x)

    orientation = orient.orientation
    #orientation = sample_node.data.attributes["orientation"]
    assert orientation in ["single", "paired_fr", "paired_rf"]

    # Make a list of bwa commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
            sam_filename, log_filename = x
        if orientation == "single":
            assert not pair2_fq
            assert not pair2_sai

        samse = "samse"
        if orientation.startswith("paired"):
            samse = "sampe"
        x = [
            sq(bwa),
            samse,
            "-f", sq(sam_filename),
            sq(ref.fasta_file_full),
            ]
        if orientation == "single":
            x += [
                sq(pair1_sai),
                sq(pair1_fq),
                ]
        else:
            y = [
                sq(pair1_sai),
                sq(pair2_sai),
                sq(pair1_fq),
                sq(pair2_fq),
                ]
            if orientation == "paired_rf":
                y = [
                    sq(pair2_sai),
                    sq(pair1_sai),
                    sq(pair2_fq),
                    sq(pair1_fq),
                    ]
            x += y
        x += [">&", sq(log_filename)]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import ngslib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    features_bed = mlib.get_user_option(
        user_options, "features_bed", check_file=True)
    if features_bed:
        metadata["features_bed"] = features_bed

    # Applies to genomecov.
    min_coverage = user_options.get("ignore_coverage_below")
    if min_coverage == "":
        min_coverage = None
    if min_coverage is not None:
        min_coverage = int(min_coverage)
        assert min_coverage >= 0

    metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
    metadata["num_cores"] = num_cores
    metadata["commands"] = []

    # Set up the filenames.
    # list of (
    #   sample,
    #   orig_bam_filename     Original bam filename.
    #   bam_filename          bam file, after filtering out unmapped reads.
    #   genomecov_filename    Generated by genomecov.  Histogram.
    #   histo_datafile        Data file to generate histogram (from cov).
    #   histo_plotfile        Histogram plot.
    #   histo_prismfile       To make histogram in PRISM.
    #
    #   ONLY USED IF features_bed
    #   intervallist_file     Made from BED file.
    #   cov_filename          Generated by Picard.
    #   targetcov_filename    Generated by Picard.  Per target coverage.
    #   log_filename          Output from Picard.
    #   )
    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        # <in_path>/<sample>.bam
        in_path, sample, ext = mlib.splitpath(bam_filename)
        assert ext == ".bam"
        clean_bam_filename = opj(out_path, "%s.bam" % sample)
        assert clean_bam_filename != bam_filename
        genomecov_filename = opj(out_path, "%s.genomecov.txt" % sample)
        histo_datafile = opj(out_path, "%s.histo.txt" % sample)
        histo_plotfile = opj(out_path, "%s.histo.png" % sample)
        histo_prismfile = opj(out_path, "%s.prism.txt" % sample)
        intervallist_file = opj(out_path, "%s.interval.txt" % sample)
        cov_filename = opj(out_path, "%s.coverage.txt" % sample)
        targetcov_filename = opj(out_path, "%s.targetcov.txt" % sample)
        log_filename = opj(out_path, "%s.picard.log" % sample)
        x = filelib.GenericObject(
            sample=sample,
            orig_bam_filename=bam_filename,
            bam_filename=clean_bam_filename,
            genomecov_filename=genomecov_filename,
            histo_datafile=histo_datafile,
            histo_plotfile=histo_plotfile,
            histo_prismfile=histo_prismfile,
            intervallist_file=intervallist_file,
            cov_filename=cov_filename,
            targetcov_filename=targetcov_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Remove unmapped reads from the BAM files.
    # Need to remove the unmapped reads or Picard might complain:
    #   Exception in thread "main"
    #   htsjdk.samtools.SAMFormatException: SAM validation error:
    #   ERROR: Record 154286082, Read name
    #   DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0
    #   for unmapped read.
    #
    # This can happen with BWA generated alignments.
    cmds = []
    for x in jobs:
        x = _make_samtools_filter_cmd(x.orig_bam_filename, x.bam_filename)
        cmds.append(x)
    parallel.pshell(cmds, max_procs=num_cores)
    x = [x.bam_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Generate the intervallist_file(s).
    if features_bed:
        cmds = []
        for x in jobs:
            args = x.intervallist_file, features_bed, x.bam_filename
            x = _make_intervallist_file, args, {}
            cmds.append(x)
        parallel.pyfun(cmds, num_procs=num_cores)

    # Make the commands to run picard.
    if features_bed:
        commands = []
        for x in jobs:
            x = _make_calculatehsmetrics_command(
                x.intervallist_file, x.bam_filename, x.cov_filename,
                x.targetcov_filename, ref.fasta_file_full, x.log_filename)
            commands.append(x)
        metadata["commands"].extend(commands)
        parallel.pshell(commands, max_procs=num_cores)
        x1 = [x.cov_filename for x in jobs]
        x2 = [x.targetcov_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

    # Use genomecov to count read depth.
    x = _run_genomecov(jobs, ref_node.identifier, num_cores)
    metadata["commands"].append(x)

    # Summarize the average read depth.
    summary_file = opj(out_path, "summary.xls")
    _summarize_average_read_depth(jobs, min_coverage, summary_file)

    # Make histograms of the distribution of the read depth for
    # each sample.
    for x in jobs:
        _make_histo_file(x.genomecov_filename, x.histo_datafile)

    # Delete the filtered BAM files to save space.
    for x in jobs:
        filelib.assert_exists_nz(x.bam_filename)
        os.unlink(x.bam_filename)
    return metadata
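
# _make_samtools_filter_cmd is defined elsewhere in this module.  A
# minimal sketch of the command it likely builds -- drop unmapped reads
# (FLAG bit 0x4) so Picard does not choke on BWA's unmapped records with
# nonzero MAPQ:
def _make_samtools_filter_cmd_sketch(in_bam, out_bam):
    from genomicode import parallel
    sq = parallel.quote
    return "samtools view -b -F 4 %s > %s" % (sq(in_bam), sq(out_bam))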
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import genomelib
    from genomicode import config
    from Betsy import module_utils as mlib

    fasta_node, bam_node, sample_node, orient_node = antecedents
    fasta_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fasta_node.identifier, find_fasta=True)
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)

    # TODO: Try to figure out version.
    metadata = {}
    metadata["tool"] = "RSeQC (unknown version)"

    pyrseqc = mlib.findbin("pyrseqc")

    gene_model = mlib.get_user_option(
        user_options, "gene_model", not_empty=True, allowed_values=["hg19"])
    if gene_model == "hg19":
        gene_path = config.rseqc_hg19
    else:
        raise AssertionError, "Unhandled: %s" % gene_model
    assert filelib.dir_exists(gene_path)
    gene_model_bed = os.path.join(gene_path, "RefSeq.bed12")
    housekeeping_model_bed = os.path.join(gene_path, "HouseKeepingGenes.bed")

    sample2fastadata = {}
    for x in fasta_data:
        sample, f1, f2 = x
        sample2fastadata[sample] = x

    is_paired = orient.orientation.startswith("paired")

    # Guess the read length.  Read the first fasta.
    assert sample2fastadata
    x = sample2fastadata.keys()[0]
    filename = sample2fastadata[x][1]
    lengths = {}  # length -> count
    for i, x in enumerate(genomelib.read_fasta_many(filename)):
        if i >= 100:
            break
        title, sequence = x
        l = len(sequence)
        lengths[l] = lengths.get(l, 0) + 1
    # Use the most common length.
    c_length = c_count = None
    for (l, c) in lengths.iteritems():
        if c_count is None or c > c_count:
            c_length, c_count = l, c
    assert c_length
    read_length = c_length

    jobs = []  # sample, bam_filename, fasta_file1, fasta_file2, outdir
    for bam_filename in bam_filenames:
        # <path>/<sample>.bam
        p, sample, e = mlib.splitpath(bam_filename)
        assert sample in sample2fastadata
        x, f1, f2 = sample2fastadata[sample]
        outdir = os.path.join(out_path, sample)
        x = sample, bam_filename, f1, f2, outdir
        jobs.append(x)

    # Some of the modules of RSeQC use a lot of memory.  Have
    # seen a Python process take 33 Gb, and an R process take 200
    # Gb.  However, most of the modules use much less memory.  So
    # run one pyrseqc at a time, and run each one of those
    # processes in parallel.  Is probably slower than running
    # multiple pyrseqc, but takes less memory.
    commands = []
    for x in jobs:
        sample, bam_filename, fasta_filename1, fasta_filename2, outdir = x
        # pyrseqc.py -j 20 --paired_end rqc11.bam rqc14.fa 76 \
        #   mod07.txt hg19.HouseKeepingGenes.bed rqc21 --dry_run
        x = [
            mlib.sq(pyrseqc),
            "-j", str(num_cores),
            ]
        if is_paired:
            x += ["--paired_end"]
        x += [
            mlib.sq(bam_filename),
            mlib.sq(fasta_filename1),
            str(read_length),
            mlib.sq(gene_model_bed),
            mlib.sq(housekeeping_model_bed),
            mlib.sq(outdir),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # pyrseqc takes up to ~40 Gb per process.
    # read_distribution.py takes 33 Gb.
    # read_quality.py spins off an R process that takes ~200 Gb.
    # Make sure we don't use up more memory than is available on
    # the machine.
    #nc = mlib.calc_max_procs_from_ram(60, upper_max=num_cores)
    #metadata["num cores"] = nc
    #x = parallel.pshell(commands, max_procs=nc)

    # Because of memory, just run one at a time, but each one, use
    # multiple cores.
    for cmd in commands:
        x = parallel.sshell(cmd)
        assert x.find("Traceback") < 0, x

    filelib.assert_exists_nz(out_path)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MAX_CORES = 4  # I/O intensive.

    fastq_node, sample_node, bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    sample2fastq = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier, as_dict=True)
    metadata = {}

    jobs = []  # list of (sample, bam_file, fastq_file)
    for filename in bam_filenames:
        path, sample, ext = mlib.splitpath(filename)
        assert sample in sample2fastq, "Missing fastq: %s" % sample
        fastq1, fastq2 = sample2fastq[sample]
        x = sample, filename, fastq1
        jobs.append(x)

    funcalls = []
    for x in jobs:
        sample, bam_filename, fastq_filename = x
        # Count the number of reads.
        x1 = count_reads, (fastq_filename,), {}
        # Count the number of alignments.
        x2 = count_alignments, (bam_filename,), {}
        funcalls.append(x1)
        funcalls.append(x2)
    assert len(funcalls) == len(jobs) * 2

    nc = min(num_cores, MAX_CORES)
    results = parallel.pyfun(funcalls, num_procs=nc)
    metadata["num_cores"] = nc

    # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
    results2 = []
    for i, x in enumerate(jobs):
        sample, bam_filename, fastq_filename = x
        x1 = results[i*2]
        x2 = results[i*2+1]
        total_reads = x1
        aligned_reads, alignments = x2
        perc_aligned = float(aligned_reads) / total_reads
        x = sample, alignments, aligned_reads, total_reads, perc_aligned
        results2.append(x)
    results = results2

    # Sort by sample name.
    results.sort()

    # Make table where the rows are the samples and the columns
    # are the statistics.
    table = []
    header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
              "Perc Aligned")
    table.append(header)
    for x in results:
        sample, alignments, aligned_reads, total_reads, perc_aligned = x
        x1 = parselib.pretty_int(alignments)
        x2 = parselib.pretty_int(aligned_reads)
        x3 = parselib.pretty_int(total_reads)
        x4 = "%.2f%%" % (perc_aligned * 100)
        x = sample, x1, x2, x3, x4
        assert len(x) == len(header)
        table.append(x)

    # Write out the table as text file.
    TXT_FILE = "summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >>handle, "\t".join(x)
    handle.close()

    txt2xls = mlib.findbin("txt2xls", quote=True)
    parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))
    return metadata
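
# count_reads is defined elsewhere in this module.  A minimal sketch,
# assuming plain (uncompressed) FASTQ where every record is exactly 4
# lines; gzipped input would need gzip.open instead:
def count_reads_sketch(fastq_filename):
    num_lines = 0
    for line in open(fastq_filename):
        num_lines += 1
    assert num_lines % 4 == 0, "Truncated FASTQ: %s" % fastq_filename
    return num_lines / 4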
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_filenames = mlib.find_bam_files(in_data.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bam2fastx (unknown version)"

    # Somehow bam2fastx doesn't work if there are spaces in the
    # filename.  Make a temporary filename with no spaces, and
    # then rename it later.
    # Actually, may not be bam2fastx's fault.

    jobs = []
    for i, bam_filename in enumerate(bam_filenames):
        p, f, e = mlib.splitpath(bam_filename)
        #bai_filename = alignlib.find_bai_file(bam_filename)
        #assert bai_filename, "Missing index for: %s" % bam_filename
        #temp_bam_filename = "%d.bam" % i
        #temp_bai_filename = "%d.bam.bai" % i
        #temp_fa_filename = "%d.fa" % i
        fa_filename = os.path.join(out_path, "%s.fa" % f)
        x = filelib.GenericObject(
            bam_filename=bam_filename,
            #bai_filename=bai_filename,
            #temp_bam_filename=temp_bam_filename,
            #temp_bai_filename=temp_bai_filename,
            #temp_fa_filename=temp_fa_filename,
            fa_filename=fa_filename)
        jobs.append(x)

    bam2fastx = mlib.findbin("bam2fastx")

    # Link all the bam files.
    #for j in jobs:
    #    assert not os.path.exists(j.temp_bam_filename)
    #    #assert not os.path.exists(j.temp_bai_filename)
    #    os.symlink(j.bam_filename, j.temp_bam_filename)
    #    #os.symlink(j.bai_filename, j.temp_bai_filename)

    commands = []
    for j in jobs:
        # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
        x = [
            mlib.sq(bam2fastx),
            "-A",
            "--fasta",
            #"-o", mlib.sq(j.temp_fa_filename),
            #mlib.sq(j.temp_bam_filename),
            "-o", mlib.sq(j.fa_filename),
            mlib.sq(j.bam_filename),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    #for j in jobs:
    #    # Move the temporary files to the final location.
    #    shutil.move(j.temp_fa_filename, j.fa_filename)
    #    # Remove the link to the BAM file.
    #    os.unlink(j.temp_bam_filename)

    x = [j.fa_filename for j in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    #import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    summary_node = in_data
    summary_filename = summary_node.identifier
    metadata = {}

    buildver = mlib.get_user_option(
        user_options, "annovar_buildver", allowed_values=["hg19"],
        not_empty=True)

    # Name files.
    p, root, ext = mlib.splitpath(summary_filename)
    annovar_infile = "pos.txt"
    log_filename = "annovar.log"
    # Annovar takes a filestem, without the ".vcf".
    annovar_outstem = "annotations"
    # Produces file:
    # <annovar_outstem>.hg19_multianno.txt
    multianno_file = "%s.hg19_multianno.txt" % annovar_outstem
    #temp_file = "temp.txt"

    # Make the infile for Annovar.
    # <chrom> <start> <end> <ref> <alt>
    handle = open(annovar_infile, 'w')
    for d in filelib.read_row(summary_filename, skip=2, header=1):
        x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
        print >>handle, "\t".join(x)
    handle.close()

    cmd = alignlib.make_annovar_command(
        annovar_infile, log_filename, annovar_outstem, buildver,
        vcf_input=False)
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(log_filename)
    filelib.assert_exists_nz(multianno_file)

    matrix = SimpleVariantMatrix.read(summary_filename)
    annot_matrix = matrix.annot_matrix
    #headers = annot_matrix.headers + anno_header[5:]
    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # Read in the multianno output file.
    pos2d = {}  # (chrom, start, ref, alt) -> d
    anno_header = None
    for d in filelib.read_row(multianno_file, header=1):
        key = d.Chr, int(d.Start), d.Ref, d.Alt
        assert key not in pos2d, "Duplicate pos: %s" % str(key)
        pos2d[key] = d
        if not anno_header:
            anno_header = d._header
    assert anno_header
    # Multianno starts with:
    # Chr Start End Ref Alt
    # Ignore these.
    assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]

    headers = anno_header[5:]
    all_annots = []
    #for h in annot_matrix.headers_h:
    #    x = annot_matrix.header2annots[h]
    #    all_annots.append(x)
    for i in range(5, len(anno_header)):
        annots = []
        for coord in zip(chrom, pos, ref, alt):
            d = pos2d.get(coord)
            x = ""
            if d:
                x = d._cols[i]
            annots.append(x)
        all_annots.append(annots)

    x = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.named_matrices.insert(0, ("Annovar", x))

    SimpleVariantMatrix.write(out_filename, matrix)

    ## cols_to_add = len(anno_header) - 5
    ## assert cols_to_add > 0

    ## # Merge the multianno file with the simple call summary.  Add
    ## # these columns before the <Sample>.
    ## # Sample             <Sample>
    ## # Caller             <Caller>
    ## # Chrom Pos Ref Alt  Ref/Alt/VAF
    ## handle = open(temp_file, 'w')
    ## it = filelib.read_cols(summary_filename)
    ## header1 = it.next()
    ## header2 = it.next()
    ## header3 = it.next()
    ## assert len(header1) == len(header2), "%d %d %d %s" % (
    ##     len(header1), len(header2), len(header3), summary_filename)
    ## assert len(header1) == len(header3), "%d %d %d %s" % (
    ##     len(header1), len(header2), len(header3), summary_filename)
    ## assert header1[0] == "Sample"
    ## assert header2[0] == "Caller"
    ## assert header3[:4] == ["Chrom", "Pos", "Ref", "Alt"]
    ## header1 = header1[:4] + [""]*cols_to_add + header1[4:]
    ## header2 = header2[:4] + [""]*cols_to_add + header2[4:]
    ## header3 = header3[:4] + anno_header[5:] + header3[4:]
    ## print >>handle, "\t".join(header1)
    ## print >>handle, "\t".join(header2)
    ## print >>handle, "\t".join(header3)
    ## for cols in it:
    ##     chrom, pos, ref, alt = cols[:4]
    ##     pos = int(pos)
    ##     d = pos2d.get((chrom, pos))
    ##     if not d:
    ##         cols = cols[:4] + [""]*cols_to_add + cols[4:]
    ##         continue
    ##     assert ref == d.Ref, "%s %s %s %s %s %s" % (
    ##         chrom, pos, ref, alt, d.Ref, d.Alt)
    ##     assert alt == d.Alt, "%s %s %s %s %s %s" % (
    ##         chrom, pos, ref, alt, d.Ref, d.Alt)
    ##     x = d._cols[5:]
    ##     assert len(x) == cols_to_add
    ##     cols = cols[:4] + x + cols[4:]
    ##     print >>handle, "\t".join(cols)
    ## handle.close()
    ## shutil.move(temp_file, out_filename)
    return metadata
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    # Put indexed files in tmp_path.
    import os
    import stat
    import shutil
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # TODO: find the version number of these tools.
    bgzip = mlib.findbin("bgzip")
    tabix = mlib.findbin("tabix")
    bcftools = mlib.findbin("bcftools")
    sq = parallel.quote

    tmp_path = os.path.realpath(tmp_path)
    filelib.safe_mkdir(tmp_path)

    # Keep track of all commands run.
    metadata = {}
    metadata["commands"] = []

    # Ignore VCF files that don't have any variants.
    vcf_filenames = [
        x for x in vcf_filenames if os.stat(x)[stat.ST_SIZE] > 0]

    # If there are no VCF files with any variants, then just create an
    # empty outfile and return.
    if not vcf_filenames:
        open(out_filename, 'w').close()
        return metadata

    # 1.  Copy VCF files to temporary directory.  tmp_filename
    # 2.  Fix VCF files (e.g. NextGENe, JointSNVMix broken).
    # 3.  Sort the VCF files (needed for tabix).
    # 4.  Compress (bgzip).
    # 5.  Index (tabix).
    # 6.  Merge.

    jobs = []
    for in_filename in vcf_filenames:
        path, root, ext = mlib.splitpath(in_filename)
        sample = root
        x = "%s%s" % (hashlib.hash_var(root), ext)
        tmp_filename = os.path.join(tmp_path, x)
        x = filelib.GenericObject(
            sample=sample,
            in_filename=in_filename,
            tmp_filename=tmp_filename,
            )
        jobs.append(x)

    # Make sure temporary files are unique.
    seen = {}
    for j in jobs:
        assert j.tmp_filename not in seen
        seen[j.tmp_filename] = 1

    # Merge them in order of sample.  The germline sample will be
    # duplicated, and we will know the order of the germline sample.
    schwartz = [(x.sample, x) for x in jobs]
    schwartz.sort()
    jobs = [x[-1] for x in schwartz]

    # Copy all the VCF files to a temporary directory.
    for j in jobs:
        shutil.copy2(j.in_filename, j.tmp_filename)

    for j in jobs:
        # NextGENe creates broken VCF files.  Fix them.
        fix_nextgene_vcf(j.tmp_filename)
        # JointSNVMix creates broken VCF files.  Fix them.
        fix_jointsnvmix_vcf(j.tmp_filename)

    for j in jobs:
        sort_vcf_file(j.tmp_filename)

    # Compress the VCF files.
    # bgzip file.vcf
    commands = []
    for j in jobs:
        x = "%s %s" % (sq(bgzip), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    metadata["num_cores"] = num_cores
    x = ["%s.gz" % x.tmp_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Index the VCF files.
    # tabix -p vcf file.vcf.gz
    commands = []
    for j in jobs:
        x = "%s -p vcf %s.gz" % (sq(tabix), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    x = ["%s.gz.tbi" % j.tmp_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    # Run bcftools to do the merge.
    # For VCF files from somatic calls, the germline sample will be
    # duplicated.  Add --force-samples to make sure this is still
    # merged.
    #
    # Since we need to append all the VCF files to the command line,
    # it's easy to run into the error:
    #   OSError: [Errno 7] Argument list too long
    # To reduce the chance of this, figure out the path of the
    # tmp_filenames and run the analysis in that path, so we can use
    # relative filenames.  (All tmp_filenames live in the same
    # directory; recompute it here.)
    tmp_path = None
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        if tmp_path is None:
            tmp_path = path
        assert path == tmp_path

    cmd = [
        sq(bcftools),
        "merge",
        "-o %s" % sq(out_filename),
        "-O v",
        "--force-samples",
        ]
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        assert path == tmp_path
        cmd.append("%s.gz" % file_)
    x = " ".join(cmd)
    parallel.sshell(x, path=tmp_path)
    metadata["commands"].append(x)

    return metadata
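# sort_vcf_file is defined elsewhere in this module.  tabix requires the
# header lines first and the records ordered by chromosome and position;
# a minimal sketch of that behavior follows.  The in-place rewrite and
# the simple lexicographic chromosome ordering are assumptions (the real
# implementation may sort chromosomes in reference order instead).
def sort_vcf_file_sketch(filename):
    header = []   # lines starting with "#"
    records = []  # (chrom, pos, line), for sorting
    for line in open(filename):
        if line.startswith("#"):
            header.append(line)
        else:
            cols = line.split("\t")
            records.append((cols[0], int(cols[1]), line))
    records.sort()
    handle = open(filename, 'w')
    for line in header:
        handle.write(line)
    for chrom, pos, line in records:
        handle.write(line)
    handle.close()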
def run(
        self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # java -jar picard.jar CollectAlignmentSummaryMetrics \
    #   R=reference_sequence.fasta \
    #   I=input.bam \
    #   O=output.txt
    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        # <in_path>/<sample>.bam
        in_path, sample, ext = mlib.splitpath(bam_filename)
        assert ext == ".bam"
        out_filename = opj(out_path, "%s.alignment_metrics.txt" % sample)
        log_filename = opj(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample,
            bam_filename=bam_filename,
            out_filename=out_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Make the commands to run picard.
    picard_jar = alignlib.find_picard_jar("picard")
    sq = parallel.quote
    commands = []
    for j in jobs:
        # Should have better way of getting java path.
        cmd = [
            "java", "-Xmx10g", "-jar", sq(picard_jar),
            "CollectAlignmentSummaryMetrics",
            "I=%s" % sq(j.bam_filename),
            "R=%s" % sq(ref.fasta_file_full),
            "O=%s" % sq(j.out_filename),
            ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, sq(j.log_filename))
        commands.append(cmd)
    metadata["commands"] = commands

    parallel.pshell(commands, max_procs=num_cores)
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Summarize the alignment summary metrics files.
    outfile = opj(out_path, "summary.txt")
    _summarize_alignment_summary_metrics(jobs, outfile)
    filelib.assert_exists_nz(outfile)

    return metadata
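# _summarize_alignment_summary_metrics is defined elsewhere in this
# module.  Picard's CollectAlignmentSummaryMetrics output is a few "##"
# comment lines, then a header row, then one row per CATEGORY (e.g.
# PAIR, UNPAIRED).  A minimal sketch of a summarizer that pulls one row
# per sample into a single table; the choice of CATEGORY and keeping
# every column are assumptions, not the real implementation.
def _summarize_alignment_summary_metrics_sketch(jobs, outfile):
    handle = open(outfile, 'w')
    header_written = False
    for j in jobs:
        lines = [x.rstrip("\r\n") for x in open(j.out_filename)]
        lines = [x for x in lines if x and not x.startswith("#")]
        header, rows = lines[0].split("\t"), lines[1:]
        for row in rows:
            cols = row.split("\t")
            if cols[0] not in ["PAIR", "UNPAIRED"]:
                continue
            if not header_written:
                handle.write("\t".join(["Sample"] + header) + "\n")
                header_written = True
            handle.write("\t".join([j.sample] + cols) + "\n")
    handle.close()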
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import vcflib
    from Betsy import module_utils as mlib

    vcf_node, nc_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Filenames:
    # <caller>.vcf
    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    genome = mlib.get_user_option(
        user_options, "snpeff_genome", not_empty=True)
    databases = list_snpeff_databases()
    assert genome in databases, "Unknown genome database: %s" % genome

    # For each caller, do the SnpEff calls.  Some callers include the
    # somatic information, others do not.  If germline samples are
    # present, then run with -cancer.  Otherwise, do not.
    #
    # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
    #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log
    #
    # Don't bother annotating positions that do not pass the filter.
    # Filter them out first based on the FILTER column.
    opj = os.path.join
    jobs = []
    for in_filename in vcf_filenames:
        path, stem, ext = mlib.splitpath(in_filename)
        samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
        filtered_filename = opj(out_path, "%s.filtered_input" % stem)
        out_filename = opj(out_path, "%s.vcf" % stem)
        log_filename = opj(out_path, "%s.log" % stem)
        # Keep stem around for the error messages below.
        x = filelib.GenericObject(
            stem=stem,
            in_filename=in_filename,
            samples_file=samples_file,
            filtered_filename=filtered_filename,
            out_filename=out_filename,
            log_filename=log_filename)
        jobs.append(x)

    # First, filter each of the VCF files.
    commands = []
    for j in jobs:
        # For debugging.  If this file exists, don't filter it again.
        if os.path.exists(j.filtered_filename):
            continue
        args = j.in_filename, j.filtered_filename, wgs_or_wes
        x = vcflib.filter_vcf_file, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)

    # Make the cancer_samples files.
    for j in jobs:
        # Will generate this file only if there are cancer samples.
        make_cancer_samples_file(
            j.filtered_filename, nc_match, j.samples_file)

    # Make a list of commands.
    commands = []
    for j in jobs:
        cancer = False
        if os.path.exists(j.samples_file):
            cancer = True
        x = make_snpeff_command(
            j.filtered_filename, genome, j.out_filename, j.log_filename,
            is_cancer=cancer, cancer_samples_file=j.samples_file)
        commands.append(x)

    # SnpEff runs with -Xmx16g; don't start more processes than the
    # RAM can support.
    nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_cores"] = nc

    # Make sure the analysis completed successfully.
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_nz_many(x)
    # Log files should be empty.
    for j in jobs:
        filelib.assert_exists(j.log_filename)
        assert not filelib.exists_nz(j.log_filename), \
            "Error with %s.\n%s" % (j.stem, j.log_filename)
        filelib.safe_unlink(j.log_filename)

    return metadata
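# make_snpeff_command is defined elsewhere in this module.  Based on the
# command line shown in the comment inside run() above, a minimal sketch
# might look like the following.  The location of the snpEff jar and the
# lack of shell quoting are simplifying assumptions.
def make_snpeff_command_sketch(in_filename, genome, out_filename,
                               log_filename, is_cancer=False,
                               cancer_samples_file=None):
    cmd = ["java", "-Xmx16g", "-jar", "snpEff.jar", "-v"]
    if is_cancer:
        cmd += ["-cancer", "-cancerSamples", cancer_samples_file]
    cmd += [genome, in_filename]
    return "%s 1> %s 2> %s" % (" ".join(cmd), out_filename, log_filename)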
def run(
        self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out MuTect version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True,
        check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True,
        check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #   cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
    #   vcf_outfile, log_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        call_outfile = opj(out_path, "%s.call_stats.out" % sample)
        cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile
        jobs.append(x)

    # java -Xmx2g -jar muTect.jar
    #   --analysis_type MuTect
    #   --reference_sequence <reference>
    #   --cosmic <cosmic.vcf>
    #   --dbsnp <dbsnp.vcf>
    #   --intervals <intervals_to_process>
    #   --input_file:normal <normal.bam>
    #   --input_file:tumor <tumor.bam>
    #   --out <call_stats.out>
    #   --coverage_file <coverage.wig.txt>

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x

        # "input_file:normal" and "input_file:tumor" are not valid
        # Python keywords, so pass them separately.
        UNHASHABLE = [
            ("input_file:normal", sq(normal_bamfile)),
            ("input_file:tumor", sq(cancer_bamfile)),
            ]
        x = alignlib.make_MuTect_command(
            analysis_type="MuTect",
            reference_sequence=sq(ref.fasta_file_full),
            cosmic=sq(cosmic_file),
            dbsnp=sq(dbsnp_file),
            intervals=sq(interval_node.identifier),
            out=sq(call_outfile),
            coverage_file=sq(cov_outfile),
            vcf=sq(raw_vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        x = "%s >& %s" % (x, log_outfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files may
    # not be created.  Errors look like:
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        # Pull out the error lines.
        x = [x for x in open(log_outfile)]
        x = [x for x in x if x.startswith("##### ERROR")]
        x = "".join(x)
        msg = "MuTect error [%s]:\n%s\n%s" % (
            cancer_sample, commands[i], x)
        assert not x, msg

    # Make sure the raw output VCF files exist.
    x = [x[6] for x in jobs]  # raw_vcf_outfile
    filelib.assert_exists_many(x)

    # Fix the files.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        alignlib.clean_mutect_vcf(
            normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
            raw_vcf_outfile, vcf_outfile)

    return metadata
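# mlib.calc_max_procs_from_ram (used above with 15 Gb per MuTect
# process, and 16 Gb per SnpEff process) caps the parallelism by
# available memory.  A minimal sketch of the idea; the genomicode
# implementation may differ, and the os.sysconf calls are POSIX-only.
def calc_max_procs_from_ram_sketch(gb_per_proc, upper_max=None):
    import os
    # Total physical RAM, in Gb.
    total = os.sysconf("SC_PHYS_PAGES") * os.sysconf("SC_PAGE_SIZE")
    total_gb = total / (1024.0**3)
    n = max(1, int(total_gb / gb_per_proc))
    if upper_max is not None:
        n = min(n, upper_max)
    return n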
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import config
    from Betsy import module_utils as mlib

    mpileup_node = in_data
    mpileup_filenames = filelib.list_files_in_path(
        mpileup_node.identifier, endswith=".pileup")
    assert mpileup_filenames, "No .pileup files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Figure out whether to call SNPs or indels, and pick the VarScan
    # tool accordingly.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]
    tool = "mpileup2snp"
    if vartype == "indel":
        tool = "mpileup2indel"

    # list of (sample, in_filename, tmp1_filename, tmp2_filename,
    #   out_filename)
    jobs = []
    for in_filename in mpileup_filenames:
        p, sample, ext = mlib.splitpath(in_filename)
        tmp1_filename = os.path.join(out_path, "%s.tmp1" % sample)
        tmp2_filename = os.path.join(out_path, "%s.tmp2" % sample)
        out_filename = os.path.join(out_path, "%s.vcf" % sample)
        x = sample, in_filename, tmp1_filename, tmp2_filename, out_filename
        jobs.append(x)

    # VarScan will generate a "Parsing Exception" if there are 0 reads
    # in a location.  Filter those out.  (Column 4 of the pileup file
    # is the read depth.)
    sq = parallel.quote
    commands = []
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        x = "awk -F'\t' '$4 != 0 {print}' %s > %s" % (
            in_filename, tmp1_filename)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    x = [x[2] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # java -jar /usr/local/bin/VarScan.jar <tool> $i --output-vcf 1 > $j
    varscan = filelib.which_assert(config.varscan_jar)

    # Make a list of commands.
    commands = []
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        x = [
            "java", "-jar", sq(varscan),
            tool,
            tmp1_filename,
            "--p-value", 0.05,
            "--output-vcf", 1,
            ]
        x = " ".join(map(str, x))
        x = "%s >& %s" % (x, tmp2_filename)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    x = [x[3] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Clean up the VCF files.  VarScan leaves extraneous lines there.
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        alignlib.clean_varscan_vcf(sample, tmp2_filename, out_filename)
    x = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # The tmp files are really big.  Don't save those.
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        filelib.safe_unlink(tmp1_filename)
        filelib.safe_unlink(tmp2_filename)

    return metadata
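# alignlib.clean_varscan_vcf strips VarScan's extraneous output and
# names the sample.  A minimal sketch of that cleanup, assuming the junk
# is anything printed before the first "##" header line and that
# VarScan's default "Sample1" column should be renamed; the real
# implementation may differ.
def clean_varscan_vcf_sketch(sample, in_filename, out_filename):
    handle = open(out_filename, 'w')
    seen_header = False
    for line in open(in_filename):
        if line.startswith("##"):
            seen_header = True
        if not seen_header:
            continue  # progress messages printed before the VCF proper
        if line.startswith("#CHROM"):
            cols = line.rstrip("\r\n").split("\t")
            cols[-1] = sample  # replace VarScan's "Sample1"
            line = "\t".join(cols) + "\n"
        handle.write(line)
    handle.close()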