def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Summarize every BAM file found under ``in_data.identifier``.

    Each BAM file ``<name>.<ext>`` is handed to ``summarize_bam_file``
    together with the output path ``<out_path>/<name>.matches.txt``.  The
    calls are dispatched through ``parallel.pyfun`` using ``num_cores``
    processes.  Afterwards ``filelib.assert_exists_nz_many`` is run over
    the expected output files.

    Asserts that at least one BAM file was found.  Returns nothing.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils

    bam_filenames = module_utils.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)

    # Pair each input BAM file with the summary file it should produce.
    io_pairs = []
    for bam_filename in bam_filenames:
        stem = os.path.splitext(os.path.basename(bam_filename))[0]
        summary_filename = os.path.join(out_path, "%s.matches.txt" % stem)
        io_pairs.append((bam_filename, summary_filename))

    # parallel.pyfun takes a list of (function, args, keywds).
    calls = [(summarize_bam_file, pair, None) for pair in io_pairs]
    parallel.pyfun(calls, num_procs=num_cores, DELAY=0.1)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([pair[-1] for pair in io_pairs])
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Determine pairing/strandedness from the first BAM file.

    ``antecedents`` unpacks to ``(bam_node, gene_node)``.  The GTF file
    named by ``gene_node.identifier`` is converted with
    ``alignlib.gtf_to_bed`` into ``<root>.bed`` (``<root>`` being the
    second element returned by ``mlib.splitpath``; no directory is
    prepended).  ``get_paired_stranded_rseqc`` is then run on that BED
    file and on ``bam_filenames[0]`` only; its five results are wrapped in
    ``mlib.Stranded`` and saved to ``outfile`` via ``mlib.write_stranded``.

    Asserts that the GTF file passes ``filelib.assert_exists_nz`` and that
    at least one BAM file was found.  Returns an empty metadata dict.
    """
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, gene_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    assert bam_filenames, "No bam files found."
    metadata = {}

    # Derive the BED file name from the root of the GTF file name, then
    # build the BED file.
    _, gtf_root, _ = mlib.splitpath(gtf_file)
    bed_file = "%s.bed" % gtf_root
    alignlib.gtf_to_bed(gtf_file, bed_file)

    # Figure out the orientation, looking at the first BAM file only.
    first_bam = bam_filenames[0]
    (single_or_paired, stranded, frac_failed, frac_first,
     frac_second) = get_paired_stranded_rseqc(bed_file, first_bam)
    strandedness = mlib.Stranded(
        single_or_paired, stranded, frac_failed, frac_first, frac_second)
    mlib.write_stranded(strandedness, outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Write a tab-delimited table of duplicated-read counts per BAM file.

    ``antecedents`` is the BAM node itself.  ``count_duplicates`` is run
    on every BAM file through ``parallel.pyfun`` (``num_cores``
    processes); each result is unpacked as ``(total_reads, dup_reads)``.
    One row per BAM file is written to ``outfile`` with the columns
    Sample, Duplicated Reads, Total Reads and % Duplicated, where the
    sample name is the second element of ``mlib.splitpath``.

    A BAM file with zero total reads is reported as 0.00% duplicated
    rather than aborting the whole table with a ZeroDivisionError.

    Returns a metadata dict with the keys "tool" and "num_cores".
    """
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # parallel.pyfun takes a list of (function, args, keywds).
    jobs = [(count_duplicates, (bam_filename,), {})
            for bam_filename in bam_filenames]
    results = parallel.pyfun(jobs, num_procs=num_cores)
    metadata["num_cores"] = num_cores
    assert len(results) == len(bam_filenames)

    header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
    lines = ["\t".join(header)]
    for bam_filename, counts in zip(bam_filenames, results):
        _, sample, _ = mlib.splitpath(bam_filename)
        total_reads, dup_reads = counts
        # Guard against an empty BAM file (no reads at all).
        if total_reads:
            perc_dup = float(dup_reads) / total_reads * 100
        else:
            perc_dup = 0.0
        row = sample, dup_reads, total_reads, "%.2f" % perc_dup
        lines.append("\t".join(map(str, row)))

    # Use a context manager so the handle is flushed and closed even if
    # the write fails; the original never closed it.
    with open(outfile, 'w') as handle:
        handle.write("\n".join(lines) + "\n")
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run the commands from ``make_commands`` on each BAM, retrying with more RAM.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  For every BAM
    file a job ``(in_filename, log_filename, out_filename)`` is created;
    the output keeps the input's file name inside ``out_path`` and the log
    is ``<name>.log``.  Commands are built by ``make_commands`` with 5 Gb
    of RAM and run with ``parallel.pshell``.  Any job whose output fails
    ``filelib.exists_nz`` is re-run once with ``MAX_RAM`` (64) Gb.

    The example command kept in the body suggests this is GATK
    SplitNCigarReads, but the real command comes from ``make_commands``
    (not visible here) -- TODO confirm.

    Asserts that BAM files were found; finishes with
    ``filelib.assert_exists_nz_many`` over all outputs.  Returns a
    metadata dict with "commands" (both rounds) and "num_procs" (from the
    first round).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    INITIAL_RAM = 5  # Gb of RAM for the first attempt.
    MAX_RAM = 64  # maximum amount of ram to use in Gb.

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        basename = os.path.basename(in_filename)
        stem = os.path.splitext(basename)[0]
        jobs.append((
            in_filename,
            os.path.join(out_path, "%s.log" % stem),
            os.path.join(out_path, basename),
        ))

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
    #   -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
    #   -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
    #   -U ALLOW_N_CIGAR_READS

    # First pass with a small amount of RAM.
    commands = make_commands(jobs, ref.fasta_file_full, INITIAL_RAM)
    nc = mlib.calc_max_procs_from_ram(INITIAL_RAM, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_procs"] = nc

    # Second pass, with more RAM, for anything that did not finish.
    unfinished = [job for job in jobs if not filelib.exists_nz(job[-1])]
    if unfinished:
        commands = make_commands(unfinished, ref.fasta_file_full, MAX_RAM)
        nc = mlib.calc_max_procs_from_ram(MAX_RAM, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] += commands

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job[-1] for job in jobs])
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run ``samtools mpileup`` on every BAM file.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  For each BAM
    file ``<sample>.<ext>`` the command

        samtools mpileup -f <reference fasta> <bam>

    is run with stdout redirected to ``<out_path>/<sample>.pileup`` and
    stderr to ``<out_path>/<sample>.log``.  Commands are executed with
    ``parallel.pshell`` using up to ``num_cores`` processes.

    The redirect targets are shell-quoted like every other path in the
    command, so that an ``out_path`` containing spaces or shell
    metacharacters does not break the redirection.

    Asserts that BAM files were found; finishes with
    ``filelib.assert_exists_nz_many`` over the pileup files.  Returns a
    metadata dict with "tool", "num_cores" and "commands".
    """
    import os
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # list of (in_filename, err_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        jobs.append((in_filename, err_filename, out_filename))

    # samtools mpileup -f [reference sequence] [BAM file(s)]
    #   > myData.mpileup
    samtools = mlib.findbin("samtools")
    sq = mlib.sq
    commands = []
    for in_filename, err_filename, out_filename in jobs:
        x = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            sq(in_filename),
        ]
        x = " ".join(map(str, x))
        # Quote the redirect targets too; they live under out_path, which
        # is not guaranteed to be free of spaces.
        x = "%s 2> %s 1> %s" % (x, sq(err_filename), sq(out_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job[-1] for job in jobs])
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run Picard ReorderSam on every BAM file.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  Each BAM file is
    reordered against the reference FASTA and written under ``out_path``
    with the same file name as the input.  Commands are executed with
    ``parallel.pshell`` using up to ``num_cores`` processes, after which
    every output is checked with ``filelib.assert_exists_nz``.

    The ``REFERENCE=`` path is shell-quoted in the same way as the ``I=``
    and ``O=`` paths, so a reference located in a directory with spaces
    no longer produces a broken command line.

    Returns nothing.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    in_filenames = module_utils.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
    #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
    picard_jar = alignlib.find_picard_jar("picard")

    # list of (in_filename, out_filename)
    jobs = []
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        jobs.append((in_filename, out_filename))

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for in_filename, out_filename in jobs:
        x = [
            "java", "-Xmx5g",
            "-jar", sq(picard_jar),
            "ReorderSam",
            "I=%s" % sq(in_filename),
            "O=%s" % sq(out_filename),
            # Quoted for consistency with I= and O= above.
            "REFERENCE=%s" % sq(ref.fasta_file_full),
        ]
        commands.append(" ".join(x))

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for in_filename, out_filename in jobs:
        filelib.assert_exists_nz(out_filename)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Symlink TopHat ``accepted_hits.bam`` files as ``<sample>.bam``.

    Only BAM files whose path ends with ``accepted_hits.bam`` are used.
    Each must be laid out as ``<path>/<sample>.tophat/accepted_hits.bam``;
    it is symlinked to ``<out_path>/<sample>.bam``.

    Asserts that at least one such file exists, that every file follows
    the layout above, that no input equals its output path, and that the
    sample names are unique.  Finishes with
    ``filelib.assert_exists_nz_many`` over the links.  Returns nothing.
    """
    import os
    from genomicode import filelib
    from Betsy import module_utils

    align_node = in_data
    bam_filenames = [
        filename
        for filename in module_utils.find_bam_files(align_node.identifier)
        if filename.endswith("accepted_hits.bam")]
    assert bam_filenames, "No accepted_hits.bam files."
    filelib.safe_mkdir(out_path)

    # list of (in_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        # Names must be in the format:
        #   <path>/<sample>.tophat/accepted_hits.bam
        # tophat_path  <path>/<sample>.tophat
        # tophat_dir   <sample>.tophat
        # file_        accepted_hits.bam
        tophat_path, file_ = os.path.split(in_filename)
        tophat_dir = os.path.split(tophat_path)[1]
        assert file_ == "accepted_hits.bam"
        assert tophat_dir.endswith(".tophat")
        # Strip the 7-character ".tophat" suffix to get the sample name.
        sample = tophat_dir[:-7]
        out_filename = os.path.join(out_path, "%s.bam" % sample)
        assert in_filename != out_filename
        jobs.append((in_filename, out_filename))

    # Make sure outfiles are unique.
    out_filenames = [job[-1] for job in jobs]
    assert len(jobs) == len(set(out_filenames)), "Duplicate sample names."

    for source, link_name in jobs:
        os.symlink(source, link_name)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants on every BAM file with GATK HaplotypeCaller.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  For each BAM
    file ``<sample>.<ext>`` a command is built by
    ``alignlib.make_GATK_command`` that writes ``<out_path>/<sample>.vcf``
    and logs to ``<out_path>/<sample>.log``.  Samples whose VCF already
    passes ``filelib.exists_nz`` are skipped (a debugging convenience).
    Commands run through ``parallel.pshell`` with up to ``num_cores``
    processes.

    Asserts that BAM files were found; finishes with
    ``filelib.assert_exists_nz_many`` over all VCF files.  Returns an
    empty metadata dict.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # One job object per BAM file, carrying its input and output paths.
    jobs = []
    for bam_filename in bam_filenames:
        sample = os.path.splitext(os.path.basename(bam_filename))[0]
        jobs.append(filelib.GenericObject(
            bam_filename=bam_filename,
            vcf_outfile=os.path.join(out_path, "%s.vcf" % sample),
            log_filename=os.path.join(out_path, "%s.log" % sample)))

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
    #   -T HaplotypeCaller -R ucsc.hg19.fasta
    #   -dontUseSoftClippedBases -stand_call_conf 20.0
    #   -stand_emit_conf 20.0 -I $i -o $j

    # Make a list of commands.
    commands = []
    for job in jobs:
        # For debugging.  If the output exists, don't do it again.
        if not filelib.exists_nz(job.vcf_outfile):
            gatk_cmd = alignlib.make_GATK_command(
                T="HaplotypeCaller",
                R=ref.fasta_file_full,
                dontUseSoftClippedBases=None,
                stand_call_conf=20.0,
                stand_emit_conf=20.0,
                I=job.bam_filename,
                o=job.vcf_outfile)
            commands.append("%s >& %s" % (gatk_cmd, job.log_filename))

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job.vcf_outfile for job in jobs])
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with GATK MuTect2 for each normal/cancer pair.

    ``antecedents`` unpacks to ``(bam_node, nc_node, ref_node,
    interval_node)``.  The normal/cancer pairs are read with
    ``mlib.read_normal_cancer_file``.  For each pair one MuTect2 command
    is built; it writes ``<out_path>/<sample>.vcf`` and logs to
    ``<out_path>/<sample>.log``, where ``<sample>`` is the root of the
    cancer BAM file name.  The user options "mutect_cosmic_vcf" and
    "mutect_dbsnp_vcf" supply the COSMIC and dbSNP files.

    After the commands have run, every log file is scanned for lines
    starting with "##### ERROR" (an AssertionError reports them), the VCF
    files are checked with ``filelib.assert_exists_many``, and the
    "NORMAL"/"TUMOR" sample names are replaced with the real ones.

    Fixes relative to the previous version:
      * each command uses its own job's normal and tumor BAM files.
        Previously the variables left over from the job-building loop
        were used, so every command received the last pair's BAM files;
      * the error message names the failing job's cancer sample rather
        than the last one;
      * log files are closed after being read;
      * the log redirect target is shell-quoted like the other paths.

    Asserts that BAM files were found and that the intervals file has one
    of the extensions .bed, .list, .picard, .interval_list or .intervals.
    Returns a metadata dict with "num_cores" and "commands".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"
    ]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            normal_sample=normal_sample, cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile, cancer_bamfile=cancer_bamfile,
            vcf_outfile=vcf_outfile, log_outfile=log_outfile)
        jobs.append(x)

    # java -jar GenomeAnalysisTK.jar \
    #   -T MuTect2 \
    #   -R reference.fasta \
    #   -I:tumor tumor.bam \
    #   -I:normal normal.bam \
    #   [--dbsnp dbSNP.vcf] \
    #   [--cosmic COSMIC.vcf] \
    #   [-L targets.interval_list] \
    #   -o output.vcf

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        UNHASHABLE = [
            # Use this job's own BAM files, not the variables left over
            # from the loop above.
            ("I:normal", sq(j.normal_bamfile)),
            ("I:tumor", sq(j.cancer_bamfile)),
            # --dbsnp and --cosmic use two dashes, for some
            # reason.  Since make_GATK_command only uses one dash,
            # add one manually.
            ("-dbsnp", sq(dbsnp_file)),
            ("-cosmic", sq(cosmic_file)),
        ]
        x = alignlib.make_GATK_command(
            T="MuTect2",
            R=sq(ref.fasta_file_full),
            L=sq(interval_node.identifier),
            o=sq(j.vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
        )
        x = "%s >& %s" % (x, sq(j.log_outfile))
        commands.append(x)
    assert len(commands) == len(jobs)

    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for j, command in zip(jobs, commands):
        # Pull out the error lines.
        with open(j.log_outfile) as handle:
            error_lines = [
                line for line in handle if line.startswith("##### ERROR")]
        errors = "".join(error_lines)
        msg = "MuTect2 error [%s]:\n%s\n%s" % (
            j.cancer_sample, command, errors)
        assert not errors, msg

    # Make sure output VCF files exist.
    filelib.assert_exists_many([j.vcf_outfile for j in jobs])

    # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
    # them with the actual names.
    for j in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            j.vcf_outfile, j.normal_sample, j.cancer_sample)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run GATK RealignerTargetCreator on every BAM file.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  For each BAM
    file ``<name>.<ext>`` a command is built that writes
    ``<out_path>/<name>.intervals`` and logs to ``<out_path>/<name>.log``.
    Known-sites files come from the user options "realign_known_sites1",
    "realign_known_sites2" and "realign_known_sites3" (empty ones are
    dropped; at least one is required).  When the user option
    "filter_reads_with_N_cigar" is "yes", ``-filter_reads_with_N_cigar``
    is added to the command.

    The number of processes is limited by
    ``mlib.calc_max_procs_from_ram(12, upper_max=num_cores)``, and each
    command gets ``-nt n`` with ``n = max(1, nc // len(jobs))``.  Floor
    division keeps the thread count an integer even under true division
    (the previous ``/`` could yield e.g. ``-nt 2.5``); ``n`` does not
    depend on the job, so it is computed once.

    Asserts that BAM files were found; finishes with
    ``filelib.assert_exists_nz_many`` over the interval files.  Returns a
    metadata dict with "num_procs" and "commands".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    in_filenames = mlib.find_bam_files(bam_node.identifier)
    assert in_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        f, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % f)
        out_filename = os.path.join(out_path, "%s.intervals" % f)
        jobs.append((in_filename, log_filename, out_filename))

    filter_reads_with_N_cigar = mlib.get_user_option(
        user_options, "filter_reads_with_N_cigar",
        allowed_values=["no", "yes"])

    # Collect whichever of the three known-sites options were given.
    x1 = mlib.get_user_option(
        user_options, "realign_known_sites1", check_file=True)
    x2 = mlib.get_user_option(
        user_options, "realign_known_sites2", check_file=True)
    x3 = mlib.get_user_option(
        user_options, "realign_known_sites3", check_file=True)
    known_sites = [x for x in [x1, x2, x3] if x]
    assert known_sites, "No known sites files given."

    # I/O bound, so not likely to get a big speedup with nt.

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
    #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
    #   --known <known_vcf_file>

    # RealignerTargetCreator takes ~10Gb per process.  Each thread
    # takes the full amount of memory.
    nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

    # Threads per command.  jobs is non-empty (asserted above), so the
    # division is safe; "//" keeps the result an int.
    n = max(1, nc // len(jobs))

    # Make a list of commands.
    commands = []
    for in_filename, log_filename, out_filename in jobs:
        x = [("-known", site) for site in known_sites]
        if filter_reads_with_N_cigar == "yes":
            x.append(("-filter_reads_with_N_cigar", None))
        x = alignlib.make_GATK_command(
            nt=n, T="RealignerTargetCreator", R=ref.fasta_file_full,
            I=in_filename, o=out_filename, _UNHASHABLE=x)
        x = "%s >& %s" % (x, log_filename)
        commands.append(x)

    parallel.pshell(commands, max_procs=nc)
    metadata["num_procs"] = nc
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.
    filelib.assert_exists_nz_many([job[-1] for job in jobs])
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with SomaticSniper for each normal/cancer pair.

    ``antecedents`` unpacks to ``(bam_node, nc_node, ref_node)``.  The
    normal/cancer pairs are read with ``mlib.read_normal_cancer_file``.
    For each pair the command

        <somaticsniper> -q 1 -Q 15 -G -L -F vcf -f <ref> <tumor> <normal> <vcf>

    is run, writing ``<out_path>/<sample>.vcf`` where ``<sample>`` is the
    root of the cancer BAM file name.  The "NORMAL"/"TUMOR" sample names
    in each VCF are then replaced with the real ones.

    The VCF files are checked with ``filelib.assert_exists_many`` before
    their sample names are rewritten (previously the check came last), so
    a failed SomaticSniper run is reported as a missing output rather
    than as an error from inside the name fixer.  This matches the order
    used by the MuTect2 module.

    Asserts that BAM files were found.  Returns a metadata dict with
    "num_cores" and "commands".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #          vcf_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        jobs.append((
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
            vcf_outfile))

    # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   test31/tumor.bam test31/normal.bam test41.vcf
    somaticsniper = mlib.get_config("somaticsniper", which_assert_file=True)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for job in jobs:
        (normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
         vcf_outfile) = job
        x = [
            sq(somaticsniper),
            "-q", 1,
            "-Q", 15,
            "-G",
            "-L",
            "-F", "vcf",
            "-f", sq(ref.fasta_file_full),
            sq(cancer_bamfile),
            sq(normal_bamfile),
            sq(vcf_outfile),
        ]
        commands.append(" ".join(map(str, x)))

    # Not sure how much RAM this takes.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the VCF files were created before touching them.
    filelib.assert_exists_many([job[-1] for job in jobs])

    # SomaticSniper names the samples "NORMAL" and "TUMOR".
    # Replace them with the actual names.
    for job in jobs:
        (normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
         vcf_outfile) = job
        call_somatic_varscan._fix_normal_cancer_names(
            vcf_outfile, normal_sample, cancer_sample)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run Picard CollectAlignmentSummaryMetrics on every BAM file.

    ``antecedents`` unpacks to ``(bam_node, ref_node)``.  For each
    ``<sample>.bam`` the metrics are written to
    ``<out_path>/<sample>.alignment_metrics.txt`` and the command output
    is logged to ``<out_path>/<sample>.log``.  Commands run through
    ``parallel.pshell`` with up to ``num_cores`` processes.  Afterwards
    ``_summarize_alignment_summary_metrics`` combines the per-sample
    results into ``<out_path>/summary.txt``.

    Asserts that BAM files were found and that each has the extension
    ".bam"; the per-sample outputs and the summary are checked with
    ``filelib.assert_exists_nz_many`` / ``assert_exists_nz``.  Returns a
    metadata dict with "commands".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # java -jar picard.jar CollectAlignmentSummaryMetrics \
    #   R=reference_sequence.fasta \
    #   I=input.bam \
    #   O=output.txt

    # One filelib.GenericObject per BAM file (<in_path>/<sample>.bam).
    jobs = []
    for bam_filename in bam_filenames:
        in_path, sample, ext = mlib.splitpath(bam_filename)
        assert ext == ".bam"
        jobs.append(filelib.GenericObject(
            sample=sample,
            bam_filename=bam_filename,
            out_filename=os.path.join(
                out_path, "%s.alignment_metrics.txt" % sample),
            log_filename=os.path.join(out_path, "%s.log" % sample)))

    # Make the commands to run picard.
    picard_jar = alignlib.find_picard_jar("picard")
    sq = parallel.quote
    commands = []
    for job in jobs:
        # Should have better way of getting java path.
        picard_cmd = " ".join([
            "java", "-Xmx10g",
            "-jar", sq(picard_jar),
            "CollectAlignmentSummaryMetrics",
            "I=%s" % sq(job.bam_filename),
            "R=%s" % sq(ref.fasta_file_full),
            "O=%s" % sq(job.out_filename),
        ])
        commands.append("%s >& %s" % (picard_cmd, sq(job.log_filename)))
    metadata["commands"] = commands

    parallel.pshell(commands, max_procs=num_cores)
    filelib.assert_exists_nz_many([job.out_filename for job in jobs])

    # Summarize the alignment metrics files.
    outfile = os.path.join(out_path, "summary.txt")
    _summarize_alignment_summary_metrics(jobs, outfile)
    filelib.assert_exists_nz(outfile)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with Radia from matched DNA and RNA BAM files.

    ``antecedents`` unpacks to ``(dna_bam_node, rna_bam_node, nc_node,
    ref_node)``.  For every normal/cancer pair and every usable
    chromosome, the pipeline runs three Radia scripts and a clean-up:

      1. radia.py        -> radia1.tmp/<cancer_sample>_chr<chrom>.vcf
      2. filterRadia.py  -> radia2.tmp/<cancer_sample>_chr<chrom>.vcf
      3. mergeChroms.py  -> radia3.tmp/<cancer_sample>.vcf
      4. alignlib.clean_radia_vcf -> <out_path>/<cancer_sample>.vcf

    The three ``radiaN.tmp`` directories and the ``normal.bam``,
    ``dna.bam`` and ``rna.bam`` symlink directories are created relative
    to the current working directory.  ``os.environ["PATH"]`` is extended
    when "blat" is not already findable.

    Requires the user options "radia_genome_assembly" (must be "hg19")
    and "snp_eff_genome", and the config entries "radia_path",
    "snp_eff_path", "python" and, if needed, "blat".

    Asserts that DNA and RNA BAM files were found, that at least one
    chromosome is common to all BAM files, that the log files of steps
    1-3 pass ``filelib.assert_exists_z_many``, and that the filtered and
    final VCF files pass ``filelib.assert_exists_nz_many``.

    Returns a metadata dict with "tool", "num_cores" and "commands" (the
    shell commands of steps 1-3).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # For debugging.  Setting a flag to False skips running that step;
    # the checks on its outputs are still performed.
    RUN_VARIANT_CALLING = True
    FILTER_CALLS = True
    MERGE_CALLS = True
    FIX_VCF_FILES = True

    dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
    dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
    assert dna_bam_filenames, "No DNA .bam files."
    rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
    assert rna_bam_filenames, "No RNA .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

    ## Make sure the BAM files do not contain spaces in the
    ## filenames.  Radia doesn't work well with spaces.
    ## (Disabled: the BAM files are symlinked to space-free names below.)
    #filenames = dna_bam_filenames + rna_bam_filenames
    #has_spaces = []
    #for filename in filenames:
    #    if filename.find(" ") >= 0:
    #        has_spaces.append(filename)
    #x = has_spaces
    #if len(x) > 5:
    #    x = x[:5] + ["..."]
    #x = ", ".join(x)
    #msg = "Radia breaks if there are spaces in filenames: %s" % x
    #assert not has_spaces, msg

    # sample -> bam filename
    dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
    rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
    # Make sure files exist for all the samples.  The DNA-Seq
    # should have both normal and cancer.  RNA is not needed for
    # normal sample.
    mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
    mlib.assert_normal_cancer_samples(
        nc_match, rnasample2bamfile, ignore_normal_sample=True)

    # Make sure Radia and snpEff are configured.
    radia_genome_assembly = mlib.get_user_option(
        user_options, "radia_genome_assembly", not_empty=True)
    assert radia_genome_assembly == "hg19", "Only hg19 handled."
    snp_eff_genome = mlib.get_user_option(
        user_options, "snp_eff_genome", not_empty=True)

    radia_path = mlib.get_config("radia_path", assert_exists=True)
    snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
    radia_files = get_radia_files(radia_path, radia_genome_assembly)

    # Make a list of the chromosomes to use.  Look at only the
    # chromosomes that are present in all files.
    # NOTE(review): list concatenation of .values() relies on Python 2,
    # where dict.values() returns a list.
    all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
    chroms = list_common_chromosomes(all_bamfiles)
    assert chroms, "No chromosomes found in all files."
    # Only use the chromosomes that can be filtered by Radia.
    chroms = filter_radia_chromosomes(chroms, radia_files)

    # Make output directories (relative to the current directory).
    radia_outpath = "radia1.tmp"
    filter_outpath = "radia2.tmp"
    merge_outpath = "radia3.tmp"

    if not os.path.exists(radia_outpath):
        os.mkdir(radia_outpath)
    if not os.path.exists(filter_outpath):
        os.mkdir(filter_outpath)
    if not os.path.exists(merge_outpath):
        os.mkdir(merge_outpath)

    # Steps:
    # 1.  Call variants (radia.py)
    #     -o <file.vcf>
    # 2.  Filter variants (filterRadia.py)
    #     <outpath>
    #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
    # 3.  Merge (mergeChroms.py)
    #     Takes as input: <filter_outpath>
    #     Produces: <merge_outpath>/<patient_id>.vcf

    # list of (normal_sample, cancer_sample, chrom,
    #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
    #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
    #   final_vcf_outfile,
    #   radia_logfile, filter_logfile, merge_logfile)
    # The tuple positions matter: x[7], x[9], x[10], x[11] and x[12] are
    # used by index further down.
    opj = os.path.join
    jobs = []
    for i, (normal_sample, cancer_sample) in enumerate(nc_match):
        normal_bamfile = dnasample2bamfile[normal_sample]
        dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
        rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

        # The merge and final files are per sample, so they are shared by
        # all of that sample's per-chromosome jobs.
        merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
        merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
        final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

        for chrom in chroms:
            radia_vcf_outfile = opj(
                radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            filter_vcf_outfile = opj(
                filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            radia_logfile = opj(
                radia_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            filter_logfile = opj(
                filter_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            x = normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs.append(x)

    # Since Radia doesn't work well if there are spaces in the
    # filenames, symlink these files here to guarantee that there
    # are no spaces.
    normal_path = "normal.bam"
    dna_path = "dna.bam"
    rna_path = "rna.bam"
    if not os.path.exists(normal_path):
        os.mkdir(normal_path)
    if not os.path.exists(dna_path):
        os.mkdir(dna_path)
    if not os.path.exists(rna_path):
        os.mkdir(rna_path)
    # Replace the three BAM paths in every job with their symlinked
    # counterparts; all other fields are carried over unchanged.
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
        x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
        x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
        clean_normal, clean_dna, clean_rna = x1, x2, x3
        x = normal_sample, cancer_sample, chrom, \
            clean_normal, clean_dna, clean_rna, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile
        jobs[i] = x

    # Generate the commands for doing variant calling.
    python = mlib.get_config("python", which_assert_file=True)

    # filterRadia.py calls the "blat" command, and there's no way
    # to set the path.  Make sure "blat" is executable.
    if not filelib.which("blat"):
        # Find "blat" in the configuration and add it to the path.
        x = mlib.get_config("blat", which_assert_file=True)
        path, x = os.path.split(x)
        # Append blat's directory to any existing PATH.
        if os.environ["PATH"]:
            path = "%s:%s" % (os.environ["PATH"], path)
        os.environ["PATH"] = path
        # Make sure it's findable now.
        filelib.which_assert("blat")

    # STEP 1.  Call variants with radia.py.
    # python radia.py test31 5 \
    #   -n bam04/PIM001_G.bam \
    #   -t bam04/196B-MG.bam \
    #   -r bam34/196B-MG.bam \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   -o test32.vcf
    #   --dnaTumorMitochon MT \
    #   --rnaTumorMitochon MT \
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.radia_py),
            cancer_sample,
            chrom,
            "-n", sq(normal_bamfile),
            "-t", sq(dna_tumor_bamfile),
            "-r", sq(rna_tumor_bamfile),
            "-f", sq(ref.fasta_file_full),
            "-o", radia_vcf_outfile,
        ]
        # The mitochondrion options are added to every command whenever
        # "MT" is among the chromosomes being processed.
        if "MT" in chroms:
            x += [
                "--dnaNormalMitochon MT",
                "--dnaTumorMitochon MT",
                "--rnaTumorMitochon MT",
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, radia_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Only uses ~200 Mb of ram.
    if RUN_VARIANT_CALLING:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure log files are empty.
    logfiles = [x[10] for x in jobs]  # radia_logfile
    filelib.assert_exists_z_many(logfiles)

    # STEP 2.  Filter variants with filterRadia.py.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.filterRadia_py),
            cancer_sample,
            chrom,
            sq(radia_vcf_outfile),
            sq(filter_outpath),
            sq(radia_files.scripts_dir),
            "-b", sq(radia_files.blacklist_dir),
            "-d", sq(radia_files.snp_dir),
            "-r", sq(radia_files.retro_dir),
            "-p", sq(radia_files.pseudo_dir),
            "-c", sq(radia_files.cosmic_dir),
            "-t", sq(radia_files.target_dir),
            "-s", sq(snp_eff_path),
            "-e", snp_eff_genome,
            "--rnaGeneBlckFile", sq(radia_files.rnageneblck_file),
            "--rnaGeneFamilyBlckFile", sq(
                radia_files.rnagenefamilyblck_file),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, filter_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Sometimes samtools crashes in the middle of a run.  Detect
    # this case, and re-run the analysis if needed.
    # Each shell command is therefore run through the Python wrapper
    # _run_filterRadia_with_restart instead of directly by pshell.
    assert len(commands) == len(jobs)
    py_commands = []
    for x, cmd in zip(jobs, commands):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = cmd, cancer_sample, chrom, filter_logfile
        x = _run_filterRadia_with_restart, args, {}
        py_commands.append(x)
    # Takes ~10 Gb each.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    if FILTER_CALLS:
        parallel.pyfun(py_commands, num_procs=nc)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[11] for x in jobs]  # filter_logfile
    filelib.assert_exists_z_many(logfiles)

    # Make sure filter_vcf_outfile exists.
    outfiles = [x[7] for x in jobs]
    filelib.assert_exists_nz_many(outfiles)

    # STEP 3.  Merge the results.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
        #   radia2.tmp/ radia3.tmp
        # The "/" after radia2.tmp is important.  If not given,
        # will generate some files with only newlines.

        fo = filter_outpath
        if not fo.endswith("/"):
            fo = "%s/" % fo
        x = [
            sq(python),
            sq(radia_files.mergeChroms_py),
            cancer_sample,
            fo,
            merge_outpath,
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, merge_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Since the chromosomes were separated for the previous steps,
    # this will generate one merge for each chromosome.  This is
    # unnecessary, since we only need to merge once per sample.
    # Get rid of duplicates.
    commands = sorted({}.fromkeys(commands))
    if MERGE_CALLS:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[12] for x in jobs]  # merge_logfile
    logfiles = sorted({}.fromkeys(logfiles))
    filelib.assert_exists_z_many(logfiles)

    # Fix the VCF files.
    # NOTE(review): unlike the merge commands, these calls are not
    # de-duplicated, so clean_radia_vcf is queued once per chromosome for
    # the same (merge, final) file pair -- confirm this is intended.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = normal_sample, cancer_sample, \
            merge_vcf_outfile, final_vcf_outfile
        x = alignlib.clean_radia_vcf, args, {}
        commands.append(x)
    if FIX_VCF_FILES:
        parallel.pyfun(commands, num_procs=num_cores)

    # Make sure output VCF files exist.
    x = [x[9] for x in jobs]  # final_vcf_outfile
    filelib.assert_exists_nz_many(x)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Summarize, per sample, how many reads were aligned.

    antecedents unpacks to (fastq_node, sample_node, bam_node).  For
    each BAM file found under bam_node, the reads in the first FASTQ
    file of the matching sample are counted with count_reads, and the
    alignments in the BAM file are counted with count_alignments.

    The table is written as tab-delimited text to "summary.txt" in the
    current directory, and then converted to outfile with txt2xls.

    Returns a metadata dict with the number of cores used.
    """
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MAX_CORES = 4  # I/O intensive.
    TXT_FILE = "summary.txt"

    fastq_node, sample_node, bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    sample2fastq = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier, as_dict=True)
    metadata = {}

    jobs = []  # list of (sample, bam_file, fastq_file)
    for filename in bam_filenames:
        _path, sample, _ext = mlib.splitpath(filename)
        assert sample in sample2fastq, "Missing fastq: %s" % sample
        # Only the first FASTQ file is used to count the reads.
        fastq1, _fastq2 = sample2fastq[sample]
        jobs.append((sample, filename, fastq1))

    # Two function calls per job, in this order:
    # 1.  Count the number of reads.
    # 2.  Count the number of alignments.
    funcalls = []
    for sample, bam_filename, fastq_filename in jobs:
        funcalls.append((count_reads, (fastq_filename, ), {}))
        funcalls.append((count_alignments, (bam_filename, ), {}))
    assert len(funcalls) == len(jobs) * 2

    nc = min(num_cores, MAX_CORES)
    raw_results = parallel.pyfun(funcalls, num_procs=nc)
    metadata["num_cores"] = nc

    # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
    results = []
    for i, (sample, bam_filename, fastq_filename) in enumerate(jobs):
        total_reads = raw_results[i * 2]
        aligned_reads, alignments = raw_results[i * 2 + 1]
        # A sample with no reads would otherwise raise a
        # ZeroDivisionError and abort the whole summary.
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads
        else:
            perc_aligned = 0.0
        x = sample, alignments, aligned_reads, total_reads, perc_aligned
        results.append(x)
    # Sort by sample name.
    results.sort()

    # Make table where the rows are the samples and the columns
    # are the statistics.
    header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
              "Perc Aligned")
    table = [header]
    for x in results:
        sample, alignments, aligned_reads, total_reads, perc_aligned = x
        row = (
            sample,
            parselib.pretty_int(alignments),
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % (perc_aligned * 100),
        )
        assert len(row) == len(header)
        table.append(row)

    # Write out the table as text file.  Use a context manager so the
    # file is closed (and flushed) even if a row fails to format.
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    # Quote outfile so that paths with spaces survive the shell.
    txt2xls = mlib.findbin("txt2xls", quote=True)
    cmd = "%s -b %s > %s" % (txt2xls, TXT_FILE, mlib.sq(outfile))
    parallel.sshell(cmd)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run pindel on every BAM file and write the results to out_path.

    antecedents unpacks to (bam_node, ref_node, insert_size_node,
    alignment_node).  The insert sizes and read lengths are read from
    the "summary.txt" files under insert_size_node and alignment_node.
    Each BAM file (and its .bai index) is symlinked into a local "bam"
    directory under a name with no spaces, a pindel config file is
    written per sample, and the pindel commands are run in parallel.

    Returns a metadata dict with the commands and number of cores.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node, insert_size_node, alignment_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # ./pindel -f <reference.fa> -i <bam_configuration_file>
    #   -c <chromosome_name> -o <out_prefix>
    #   -T <num threads>
    #
    # Creates files:
    # <out_prefix>_D     Deletion
    # <out_prefix>_SI    Short insertion
    # <out_prefix>_LI    Long insertion
    # <out_prefix>_INV   Inversion
    # <out_prefix>_TD    Tandem deletion
    # <out_prefix>_BP    Breakpoint
    # <out_prefix>_RP    ??? read pair???
    # <out_prefix>_CloseEndMapped   Only on end could be mapped.

    # Pindel cannot handle spaces in the BAM filenames (because of
    # the config file).  Symlink the file to a local directory to make
    # sure there are no spaces.
    bam_path = "bam"

    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for bam_filename in bam_filenames:
        p, f = os.path.split(bam_filename)
        sample, ext = os.path.splitext(f)
        bai_filename = "%s.bai" % bam_filename
        filelib.assert_exists_nz(bai_filename)
        safe_sample = sample.replace(" ", "_")
        x = filelib.GenericObject(
            sample=sample,
            bam_filename=bam_filename,
            bai_filename=bai_filename,
            local_bam=opj(bam_path, "%s.bam" % safe_sample),
            local_bai=opj(bam_path, "%s.bam.bai" % safe_sample),
            config_filename=opj(out_path, "%s.config.txt" % sample),
            out_prefix=opj(out_path, sample),
            log_filename=opj(out_path, "%s.log" % sample))
        jobs.append(x)

    filelib.safe_mkdir(bam_path)
    for j in jobs:
        assert " " not in j.local_bam
        filelib.assert_exists_nz(j.bam_filename)
        filelib.assert_exists_nz(j.bai_filename)
        if not os.path.exists(j.local_bam):
            os.symlink(j.bam_filename, j.local_bam)
        if not os.path.exists(j.local_bai):
            os.symlink(j.bai_filename, j.local_bai)

    # Read the insert sizes.
    summary_file = opj(insert_size_node.identifier, "summary.txt")
    filelib.assert_exists_nz(summary_file)
    sample2size = _read_insert_sizes(summary_file)
    # Make sure all the samples have inserts.
    for j in jobs:
        assert j.sample in sample2size, \
            "Missing in insert size file: %s" % j.sample

    # Read the fragment sizes.
    summary_file = opj(alignment_node.identifier, "summary.txt")
    filelib.assert_exists_nz(summary_file)
    sample2readlen = _read_fragment_sizes(summary_file)
    # Make sure all the samples have read lengths.
    for j in jobs:
        assert j.sample in sample2readlen, \
            "Missing in alignment summary file: %s" % j.sample

    # Make the config file.
    for j in jobs:
        # <insert size> is the whole length to be sequenced, including
        # the length of the pair of reads.  Picard only counts the
        # sequence between the reads.
        size = sample2size[j.sample]
        read_length = sample2readlen[j.sample]
        insert_size = size + read_length * 2
        # Use a context manager so the config file is always closed
        # before pindel reads it.
        with open(j.config_filename, 'w') as handle:
            handle.write(
                "%s %s %s\n" % (j.local_bam, insert_size, j.sample))

    # Make a list of commands.
    pindel = mlib.get_config("pindel", which_assert_file=True)
    sq = parallel.quote
    commands = []
    for j in jobs:
        cmd = [
            sq(pindel),
            "-f", sq(ref.fasta_file_full),
            "-i", sq(j.config_filename),
            "-c", "ALL",
            "-T", 1,
            "-o", sq(j.out_prefix),
            ]
        cmd = " ".join(map(str, cmd))
        # The log filename is derived from the sample name, which may
        # contain spaces.  Quote it like the other paths.
        cmd = "%s >& %s" % (cmd, sq(j.log_filename))
        commands.append(cmd)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.  If not, try
    # to diagnose.
    log_filenames = [j.log_filename for j in jobs]
    filelib.assert_exists_nz_many(log_filenames)

    out_filenames = []
    for suffix in ["_D", "_SI", "_LI", "_INV", "_TD", "_BP"]:
        out_filenames.extend(
            ["%s%s" % (j.out_prefix, suffix) for j in jobs])
    filelib.assert_exists_many(out_filenames)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with MuSE for each normal/cancer pair.

    antecedents unpacks to (bam_node, nc_node, ref_node).  For every
    (normal_sample, cancer_sample) pair read from nc_node, runs
    "MuSE call" followed by "MuSE sump" (with -G for wgs or -E for
    wes), and then cleans the raw VCF file with
    alignlib.clean_muse_vcf.  Results are saved in out_path as
    <cancer_sample>.vcf.

    Requires the user options "wgs_or_wes" and "muse_dbsnp_vcf".

    Returns a metadata dict with the tool, commands, and number of
    cores.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    dbsnp_file = mlib.get_user_option(
        user_options, "muse_dbsnp_vcf", not_empty=True, check_file=True)
    # Make sure dbsnp_file is compressed and indexed.
    assert dbsnp_file.endswith(".vcf.gz"), \
        "muse_dbsnp_vcf must be bgzip compressed."
    x = "%s.tbi" % dbsnp_file
    assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for (normal_sample, cancer_sample) in nc_match:
        muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
        x = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=sample2bamfile[normal_sample],
            cancer_bamfile=sample2bamfile[cancer_sample],
            muse_call_stem=muse_call_stem,
            muse_call_file="%s.MuSE.txt" % muse_call_stem,
            raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % cancer_sample),
            vcf_outfile=opj(out_path, "%s.vcf" % cancer_sample),
            log_outfile1=opj(out_path, "%s.call.log" % cancer_sample),
            log_outfile2=opj(out_path, "%s.sump.log" % cancer_sample))
        jobs.append(x)

    # Generate the commands.
    # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
    #   bam04/196B-MG.bam bam04/PIM001_G.bam
    # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
    #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz
    MuSE = mlib.findbin("muse")

    sq = mlib.sq
    commands = []
    for j in jobs:
        # Quote every path so that sample names or directories with
        # spaces do not break the shell command.
        x = [
            sq(MuSE),
            "call",
            "-O", sq(j.muse_call_stem),
            "-f", sq(ref.fasta_file_full),
            sq(j.cancer_bamfile),
            sq(j.normal_bamfile),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(j.log_outfile1))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    filelib.assert_exists_z_many([j.log_outfile1 for j in jobs])
    # Make sure the call files are created and not empty.
    filelib.assert_exists_nz_many([j.muse_call_file for j in jobs])

    # Run the "sump" step.
    assert wgs_or_wes in ["wgs", "wes"]
    if wgs_or_wes == "wgs":
        data_type_flag = "-G"
    else:
        data_type_flag = "-E"
    commands = []
    for j in jobs:
        x = [
            sq(MuSE),
            "sump",
            "-I", sq(j.muse_call_file),
            data_type_flag,
            "-O", sq(j.raw_vcf_outfile),
            "-D", sq(dbsnp_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(j.log_outfile2))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = metadata["commands"] + commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    filelib.assert_exists_z_many([j.log_outfile2 for j in jobs])
    # Make sure the raw files are created and not empty.
    filelib.assert_exists_nz_many([j.raw_vcf_outfile for j in jobs])

    # Fix the files.
    funcalls = []  # list of (function, args, keywds)
    for j in jobs:
        args = (j.normal_sample, j.cancer_sample, j.raw_vcf_outfile,
                j.vcf_outfile)
        funcalls.append((alignlib.clean_muse_vcf, args, {}))
    parallel.pyfun(funcalls, num_procs=num_cores)

    # Delete the log files.  They were verified to be empty above.
    for j in jobs:
        for log_outfile in (j.log_outfile1, j.log_outfile2):
            if os.path.exists(log_outfile):
                os.unlink(log_outfile)

    # Make sure output VCF files exist.
    filelib.assert_exists_many([j.vcf_outfile for j in jobs])

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Calculate the read depth of each BAM file.

    antecedents unpacks to (bam_node, ref_node).  Each BAM file is
    first filtered with the command from _make_samtools_filter_cmd.
    If the user option "features_bed" is given, an interval list is
    made per sample and the commands from
    _make_calculatehsmetrics_command are run.  Then _run_genomecov is
    run, the average read depth is summarized into "summary.xls" in
    out_path, and a histogram data file is made per sample.  The
    filtered BAM files are deleted at the end.

    Returns a metadata dict with the tool, commands, and number of
    cores.
    """
    import os
    from genomicode import filelib
    from genomicode import ngslib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    features_bed = mlib.get_user_option(
        user_options, "features_bed", check_file=True)
    if features_bed:
        metadata["features_bed"] = features_bed

    # Minimum coverage applies to genomecov.  An empty string means
    # that no minimum was given.
    min_coverage = user_options.get("ignore_coverage_below")
    if min_coverage == "":
        min_coverage = None
    if min_coverage is not None:
        min_coverage = int(min_coverage)
        assert min_coverage >= 0

    metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
    metadata["num_cores"] = num_cores
    metadata["commands"] = []

    # One GenericObject per sample, with members:
    #   sample
    #   orig_bam_filename    Original bam filename.
    #   bam_filename         bam file, after filtering out unmapped reads.
    #   genomecov_filename   Generated by genomecov.  Histogram.
    #   histo_datafile       Data file to generate histogram (from cov).
    #   histo_plotfile       Histogram plot.
    #   histo_prismfile      To make histogram in PRISM.
    #   ONLY USED IF features_bed:
    #   intervallist_file    Made from BED file.
    #   cov_filename         Generated by Picard.
    #   targetcov_filename   Generated by Picard.  Per target coverage.
    #   log_filename         Output from Picard.
    jobs = []
    for orig_bam in bam_filenames:
        # <in_path>/<sample>.bam
        in_path, sample, ext = mlib.splitpath(orig_bam)
        assert ext == ".bam"
        filtered_bam = os.path.join(out_path, "%s.bam" % sample)
        assert filtered_bam != orig_bam
        job = filelib.GenericObject(
            sample=sample,
            orig_bam_filename=orig_bam,
            bam_filename=filtered_bam,
            genomecov_filename=os.path.join(
                out_path, "%s.genomecov.txt" % sample),
            histo_datafile=os.path.join(out_path, "%s.histo.txt" % sample),
            histo_plotfile=os.path.join(out_path, "%s.histo.png" % sample),
            histo_prismfile=os.path.join(out_path, "%s.prism.txt" % sample),
            intervallist_file=os.path.join(
                out_path, "%s.interval.txt" % sample),
            cov_filename=os.path.join(out_path, "%s.coverage.txt" % sample),
            targetcov_filename=os.path.join(
                out_path, "%s.targetcov.txt" % sample),
            log_filename=os.path.join(out_path, "%s.picard.log" % sample))
        jobs.append(job)

    # Remove unmapped reads from the BAM files.  Otherwise, Picard
    # might complain:
    # Exception in thread "main"
    # htsjdk.samtools.SAMFormatException: SAM validation error:
    # ERROR: Record 154286082, Read name
    # DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0
    # for unmapped read.
    #
    # This can happen with BWA generated alignments.
    filter_cmds = [
        _make_samtools_filter_cmd(job.orig_bam_filename, job.bam_filename)
        for job in jobs]
    parallel.pshell(filter_cmds, max_procs=num_cores)
    filelib.assert_exists_nz_many([job.bam_filename for job in jobs])

    if features_bed:
        # Generate the intervallist_file(s).
        funcalls = []
        for job in jobs:
            args = job.intervallist_file, features_bed, job.bam_filename
            funcalls.append((_make_intervallist_file, args, {}))
        parallel.pyfun(funcalls, num_procs=num_cores)

        # Make the commands to run picard.
        picard_cmds = [
            _make_calculatehsmetrics_command(
                job.intervallist_file, job.bam_filename, job.cov_filename,
                job.targetcov_filename, ref.fasta_file_full,
                job.log_filename)
            for job in jobs]
        metadata["commands"].append(picard_cmds)
        parallel.pshell(picard_cmds, max_procs=num_cores)
        picard_outfiles = [job.cov_filename for job in jobs] + \
            [job.targetcov_filename for job in jobs]
        filelib.assert_exists_nz_many(picard_outfiles)

    # Use genomecov to count read depth.
    genomecov_result = _run_genomecov(jobs, ref_node.identifier, num_cores)
    metadata["commands"].append(genomecov_result)

    # Summarize the average read depth.
    summary_file = os.path.join(out_path, "summary.xls")
    _summarize_average_read_depth(jobs, min_coverage, summary_file)

    # Make histograms of the distribution of the read depth for
    # each sample.
    for job in jobs:
        _make_histo_file(job.genomecov_filename, job.histo_datafile)

    # Delete the filtered BAM files to save space.
    for job in jobs:
        filelib.assert_exists_nz(job.bam_filename)
        os.unlink(job.bam_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run pyrseqc on each BAM file and save the results in out_path.

    antecedents unpacks to (fasta_node, bam_node, sample_node,
    orient_node).  The read length is guessed as the most common
    sequence length among the first 100 records of one FASTA file.
    The pyrseqc commands are run one at a time, each with num_cores
    jobs, and the results for each sample are written to
    <out_path>/<sample>.

    Requires the user option "gene_model" (only "hg19" is handled).

    Returns a metadata dict with the tool, commands, and number of
    cores.
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import genomelib
    from genomicode import config
    from Betsy import module_utils as mlib

    fasta_node, bam_node, sample_node, orient_node = antecedents
    fasta_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fasta_node.identifier, find_fasta=True)
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)

    # TODO: Try to figure out version.
    metadata = {}
    metadata["tool"] = "RSeQC (unknown version)"

    pyrseqc = mlib.findbin("pyrseqc")

    gene_model = mlib.get_user_option(
        user_options, "gene_model", not_empty=True, allowed_values=["hg19"])
    if gene_model == "hg19":
        gene_path = config.rseqc_hg19
    else:
        raise AssertionError("Unhandled: %s" % gene_model)

    # Actually enforce that the gene model directory is there.  The
    # previous check discarded its result, so a missing directory
    # was only discovered when pyrseqc failed.
    assert os.path.isdir(gene_path), \
        "RSeQC gene model directory not found: %s" % gene_path
    gene_model_bed = os.path.join(gene_path, "RefSeq.bed12")
    housekeeping_model_bed = os.path.join(
        gene_path, "HouseKeepingGenes.bed")

    sample2fastadata = {}
    for x in fasta_data:
        sample, f1, f2 = x
        sample2fastadata[sample] = x

    is_paired = orient.orientation.startswith("paired")

    # Guess the read length.  Read the first fasta.
    assert sample2fastadata
    first_sample = next(iter(sample2fastadata))
    filename = sample2fastadata[first_sample][1]
    lengths = {}  # length -> count
    for i, x in enumerate(genomelib.read_fasta_many(filename)):
        if i >= 100:
            break
        title, sequence = x
        l = len(sequence)
        lengths[l] = lengths.get(l, 0) + 1
    assert lengths, "No sequences found: %s" % filename
    # Use the most common length.
    read_length = max(lengths, key=lengths.get)
    assert read_length

    jobs = []  # sample, bam_filename, fasta_file1, fasta_file2, outdir
    for bam_filename in bam_filenames:
        # <path>/<sample>.bam
        p, sample, e = mlib.splitpath(bam_filename)
        assert sample in sample2fastadata
        x, f1, f2 = sample2fastadata[sample]
        outdir = os.path.join(out_path, sample)
        x = sample, bam_filename, f1, f2, outdir
        jobs.append(x)

    # Some of the modules of RSeQC uses a lot of memory.  Have
    # seen a Python process take 33 Gb, and an R process take 200
    # Gb.  However, most of the modules use much less memory.  So
    # run one pyrseqc at a time, and run each one of those
    # processes in parallel.  Is probably slower than running
    # multiple pyrseqc, but takes less memory.
    commands = []
    for x in jobs:
        sample, bam_filename, fasta_filename1, fasta_filename2, outdir = x

        # pyrseqc.py -j 20 --paired_end rqc11.bam rqc14.fa 76 \
        #   mod07.txt hg19.HouseKeepingGenes.bed rqc21 --dry_run
        x = [
            mlib.sq(pyrseqc),
            "-j", str(num_cores),
            ]
        if is_paired:
            x += ["--paired_end"]
        x += [
            mlib.sq(bam_filename),
            mlib.sq(fasta_filename1),
            str(read_length),
            mlib.sq(gene_model_bed),
            mlib.sq(housekeeping_model_bed),
            mlib.sq(outdir),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # pyrseqc takes up to ~40 Gb per process.
    # read_distribution.py takes 33 Gb.
    # read_quality.py spins off an R process that takes ~200 Gb.
    # Because of memory, just run one at a time, but each one, use
    # multiple cores.
    for cmd in commands:
        x = parallel.sshell(cmd)
        assert x.find("Traceback") < 0, x

    filelib.assert_exists_nz(out_path)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants with museq for each normal/cancer pair.

    antecedents unpacks to (bam_node, nc_node, ref_node).  For every
    (normal_sample, cancer_sample) pair read from nc_node, runs
    museq's classify.py with a fixed copy of its config file, saving
    the raw output as <out_path>/<sample>.raw.  Each raw file is then
    passed through fix_vcf_file to make <out_path>/<sample>.vcf, where
    <sample> is taken from the name of the cancer BAM file.

    Returns a metadata dict with the commands and number of cores.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib
    from genomicode import alignlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    opj = os.path.join
    jobs = []  # list of filelib.GenericObject
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        x = filelib.GenericObject(
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            orig_outfile=opj(out_path, "%s.raw" % sample),
            fix_outfile=opj(out_path, "%s.vcf" % sample))
        jobs.append(x)

    # python /usr/local/museq/classify.py \
    #   normal:test31/normal.bam tumour:test31/tumor.bam \
    #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   model:/usr/local/museq/model_v4.1.2.npz \
    #   --config /usr/local/museq/metadata.config \
    #   -o test51.vcf
    museq = mlib.get_config("museq", assert_exists=True)
    classify_py = opj(museq, "classify.py")
    model_file = opj(museq, "model_v4.1.2.npz")
    config_file = opj(museq, "metadata.config")
    filelib.assert_exists_nz(classify_py)
    filelib.assert_exists_nz(model_file)
    filelib.assert_exists_nz(config_file)

    # museq's config file generates a broken VCF file.  Fix it.
    fixed_config_file = "fixed.config"
    fix_config_file(config_file, fixed_config_file)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        x = [
            "python",  # should allow user to specify python
            sq(classify_py),
            sq("normal:%s" % j.normal_bamfile),
            sq("tumour:%s" % j.cancer_bamfile),
            sq("reference:%s" % ref.fasta_file_full),
            sq("model:%s" % model_file),
            "--config", sq(fixed_config_file),
            "-o", sq(j.orig_outfile),
            ]
        x = " ".join(map(str, x))
        commands.append(x)
    # Not sure how much RAM this takes.  On Thunderbolts test,
    # took < 1 Gb.
    nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # museq produces non-standard VCF files.  Fix this so it
    # will work with other programs downstream.
    for j in jobs:
        fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

    # Make sure the output of every job exists.  (This used to read
    # the leftover loop variable, and checked only the last job.)
    fixed_outfiles = [j.fix_outfile for j in jobs]
    filelib.assert_exists_many(fixed_outfiles)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Mark duplicate reads in each BAM file with Picard MarkDuplicates.

    For every BAM file found under in_data, writes a BAM file with
    the same name into out_path, along with a <sample>.log file with
    the output from Picard and a <sample>.metrics.txt file with the
    duplication metrics.  Duplicates are marked, not removed.

    Returns a metadata dict with the commands and number of cores.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_filenames = mlib.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    # list of (in_filename, log_filename, metrics_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % s)
        # Each job needs its own metrics file.  The jobs run in
        # parallel, and would overwrite each other if they shared one.
        metrics_filename = os.path.join(out_path, "%s.metrics.txt" % s)
        out_filename = os.path.join(out_path, f)
        x = in_filename, log_filename, metrics_filename, out_filename
        jobs.append(x)

    # java -Xmx5g -jar MarkDuplicates.jar
    #   I=<input.sam or .bam> O=<output.bam>
    #   METRICS_FILE=metricsFile CREATE_INDEX=true
    #   VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true
    picard_jar = alignlib.find_picard_jar("picard")

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for x in jobs:
        in_filename, log_filename, metrics_filename, out_filename = x

        x = [
            "java", "-Xmx20g",
            "-jar", sq(picard_jar), "MarkDuplicates",
            "I=%s" % sq(in_filename),
            "O=%s" % sq(out_filename),
            "METRICS_FILE=%s" % sq(metrics_filename),
            "VALIDATION_STRINGENCY=LENIENT",
            "REMOVE_DUPLICATES=false",
            # BAM files should be sorted, but ASSUME_SORTED is
            # deprecated, so it is not given.
            "TMP_DIR=%s" % sq(out_path),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)

    # java may use additional threads for garbage collection.
    # https://sourceforge.net/p/picard/wiki/Main_Page/

    # Takes ~10 Gb per process (with -Xmx5g).  Increased the RAM
    # used to 20 Gb because 5 Gb was not enough for some files.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_cores"] = nc

    # Make sure the analysis completed successfully.
    out_filenames = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(out_filenames)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Realign each BAM file around indels with GATK IndelRealigner.

    antecedents unpacks to (bam_node, ref_node, target_node).  Each
    BAM file must have a .intervals file with the same root name
    under target_node.  At least one of the user options
    realign_known_sites1, realign_known_sites2, or
    realign_known_sites3 must be given.  The realigned BAM files and
    the log files are written to out_path.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node, target_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    target_filenames = filelib.list_files_in_path(
        target_node.identifier, endswith=".intervals")
    assert target_filenames, "No .intervals files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    assert len(bam_filenames) == len(target_filenames), \
        "Should have an .intervals file for each bam file."
    sample2bamfilename = {}
    for filename in bam_filenames:
        p, f = os.path.split(filename)
        sample, ext = os.path.splitext(f)
        assert sample not in sample2bamfilename
        sample2bamfilename[sample] = filename
    sample2targetfilename = {}
    for filename in target_filenames:
        p, f = os.path.split(filename)
        sample, ext = os.path.splitext(f)
        assert sample not in sample2targetfilename
        sample2targetfilename[sample] = filename
    assert len(sample2bamfilename) == len(sample2targetfilename)

    missing = [
        x for x in sample2bamfilename if x not in sample2targetfilename]
    assert not missing, "Missing interval files for %d bam files." % \
        len(missing)

    # list of (bam_filename, target_filename, log_filename, out_filename)
    jobs = []
    for sample in sample2bamfilename:
        bam_filename = sample2bamfilename[sample]
        target_filename = sample2targetfilename[sample]
        out_filename = os.path.join(out_path, "%s.bam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = bam_filename, target_filename, log_filename, out_filename
        jobs.append(x)

    x1 = module_utils.get_user_option(
        user_options, "realign_known_sites1", check_file=True)
    x2 = module_utils.get_user_option(
        user_options, "realign_known_sites2", check_file=True)
    x3 = module_utils.get_user_option(
        user_options, "realign_known_sites3", check_file=True)
    known_sites = [x for x in [x1, x2, x3] if x]
    assert known_sites
    # The same known sites are given to every job.
    known_args = [("known", x) for x in known_sites]

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar \
    #   -T IndelRealigner -R <ref.fa> \
    #   -I <bam_file> -targetIntervals <target_file> -o <bam_file>

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for x in jobs:
        bam_filename, target_filename, log_filename, out_filename = x

        x = alignlib.make_GATK_command(
            T="IndelRealigner", R=ref.fasta_file_full,
            I=bam_filename, targetIntervals=target_filename,
            o=out_filename, _UNHASHABLE=known_args)
        # Quote the log file so that paths with spaces do not break
        # the redirect.
        x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    out_filenames = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with Strelka for each normal/cancer pair.

    antecedents unpacks to (bam_node, nc_node, ref_node).  For every
    (normal_sample, cancer_sample) pair read from nc_node, makes a
    config file and an analysis directory in out_path, and runs
    "make" in the analysis directory.  The result file for the
    requested out_attributes["vartype"] ("snp" or "indel") is cleaned
    with alignlib.clean_strelka_vcf and saved as
    <out_path>/<cancer_sample>.vcf.

    Requires the user option "strelka_skip_depth_filter".

    Returns a metadata dict with the number of cores.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out Strelka version.

    x = mlib.get_user_option(
        user_options, "strelka_skip_depth_filter",
        allowed_values=["no", "yes"], not_empty=True)
    skip_depth_filter = (x == "yes")

    assert "vartype" in out_attributes, "Missing attribute: vartype"
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]

    # <analysis_path>/results/all.somatic.snvs.vcf
    # <analysis_path>/results/all.somatic.indels.vcf
    SNV_FILE = "all.somatic.snvs.vcf"
    vartype2file = {
        "snp": SNV_FILE,
        "indel": "all.somatic.indels.vcf",
        }
    assert vartype in vartype2file
    result_file = vartype2file[vartype]

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # Make sure each cancer sample is unique.  Otherwise, the
    # analysis directories will conflict.
    seen = set()
    dups = set()
    for x in nc_match:
        tumor_sample = x[-1]
        if tumor_sample in seen:
            dups.add(tumor_sample)
        seen.add(tumor_sample)
    assert not dups, "NormalCancerFile contains multiple instances of: %s"\
        % ", ".join(sorted(dups))

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #          config_file, output_dir)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        config_file = opj(out_path, "config.%s.ini" % cancer_sample)
        analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path
        jobs.append(x)

    # Make each of the config files.
    for x in jobs:
        config_file = x[4]
        _make_config_file(config_file, skip_depth_filter=skip_depth_filter)

    # Make the analysis directories.
    jobs2 = []  # list of (function, args, keywds)
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        args = (analysis_path, config_file, ref.fasta_file_full,
                normal_bamfile, cancer_bamfile)
        jobs2.append((_make_analysis_directory, args, None))
    parallel.pyfun(jobs2, num_procs=num_cores)

    # Run the analysis.  The samples are run one at a time, each
    # using num_cores make jobs.
    cmd = "make -j %d" % num_cores
    for x in jobs:
        analysis_path = x[-1]
        parallel.sshell(cmd, path=analysis_path)
    metadata["num_cores"] = num_cores

    # Make sure the result files exist.  Check the SNV file, and also
    # the file for the requested vartype, since that is the one
    # that is actually read below.
    files_to_check = sorted(set([SNV_FILE, result_file]))
    x = []
    for job in jobs:
        analysis_path = job[-1]
        for f in files_to_check:
            x.append(opj(analysis_path, "results", f))
    filelib.assert_exists_nz_many(x)

    # Clean the VCF files and save into the out_path.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            config_file, analysis_path = x
        src_file = opj(analysis_path, "results", result_file)
        dst_file = opj(out_path, "%s.vcf" % cancer_sample)
        alignlib.clean_strelka_vcf(
            normal_sample, cancer_sample, src_file, dst_file)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run `samtools calmd -b` on each BAM file against the reference.

    antecedents is unpacked as (bam_node, ref_node).  For every BAM
    file found under bam_node.identifier, the shell command redirects
    stdout to a file of the same name inside out_path and stderr to
    <out_path>/<stem>.log.  The commands are handed to parallel.pshell
    with max_procs=num_cores.

    Asserts that at least one BAM file was found and that no output
    path equals its input path; the outputs are then checked with
    filelib.assert_exists_nz_many.  Returns None.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # One (bam_in, stderr_log, bam_out) triple per input file.
    triples = []
    for bam_in in bam_filenames:
        basename = os.path.basename(bam_in)
        stem = os.path.splitext(basename)[0]
        stderr_log = os.path.join(out_path, "%s.log" % stem)
        bam_out = os.path.join(out_path, basename)
        assert bam_in != bam_out
        triples.append((bam_in, stderr_log, bam_out))

    # Files that already carry MD tags are deliberately NOT skipped:
    # MD, NM and NH are needed in summarize_alignment_cigar, so the
    # tags are always recomputed.

    # Takes ~200 Mb per process, so should not be a big issue.
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    commands = []
    for bam_in, stderr_log, bam_out in triples:
        # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>
        # samtools may complain on stderr, e.g.
        #   [bam_fillmd1] different NM for read '...': 0 -> 19
        # so stderr is piped to its own file.
        pieces = (
            samtools, "calmd", "-b", sq(bam_in), sq(ref.fasta_file_full))
        calmd = " ".join(pieces)
        commands.append(
            "%s 2> %s 1> %s" % (calmd, sq(stderr_log), sq(bam_out)))
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    produced = [triple[-1] for triple in triples]
    filelib.assert_exists_nz_many(produced)
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Run RNA-SeQC once per BAM file and save the results in out_path.

    antecedents is unpacked as (bam_node, ref_node).  The GTF file is
    taken from the required user option "rna_seqc_gtf_file" and must
    exist.  For each BAM file a sample name is derived from the file
    stem with hashlib.hash_var; RNA-SeQC writes into
    <out_path>/<sample> and its combined stdout/stderr goes to
    <out_path>/<sample>.log.  Whatever parallel.pshell returns is
    written to <out_path>/run.log.

    Asserts that the GTF file exists and (via filelib.assert_exists_nz)
    that every per-sample output path was created.  Returns None.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
    #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA"
    #   -t <gtf_file> >& <log_filename>"
    # <out_path>          Output directory.  Will be created if not exists.
    # <in_filename>       BAM file
    # <reference_file>    /data/biocore/genomes/UCSC/mm10.fa
    # <gtf_file>   /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
    #
    # <reference_file> must be indexed and have a dict file.

    rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

    GTF = module_utils.get_user_option(
        user_options, "rna_seqc_gtf_file", not_empty=True)
    assert os.path.exists(GTF), "File not found: %s" % GTF

    # list of infile, out_path, ref_file, gtf_file, sample, log_file
    jobs = []
    for in_filename in bam_filenames:
        p, file_ = os.path.split(in_filename)
        f, e = os.path.splitext(file_)
        sample = hashlib.hash_var(f)
        out_path_rna_seqc = os.path.join(out_path, sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
            sample, log_filename
        jobs.append(x)

    sq = parallel.quote
    commands = []
    for x in jobs:
        (in_filename, out_path_rna_seqc, ref_filename, gtf_filename,
         sample, log_filename) = x

        x = [sample, in_filename, "NA"]
        x = "|".join(x)
        # Every path goes through sq so that a jar, GTF or output
        # directory containing spaces does not split the command.
        x = [
            'java',
            '-jar', sq(rna_seqc_jar),
            '-o', sq(out_path_rna_seqc),
            '-r', sq(ref_filename),
            '-s', "'%s'" % x,
            '-t', sq(gtf_filename),
            ]
        x = " ".join(x)
        cmd = "%s >& %s" % (x, sq(log_filename))
        commands.append(cmd)

    # Gets lots of errors.
    x = parallel.pshell(commands, max_procs=num_cores)
    run_log = os.path.join(out_path, "run.log")
    # Close the handle explicitly so the log is flushed to disk.
    with open(run_log, 'w') as handle:
        handle.write(x)

    # Make sure the analysis completed successfully.
    for x in jobs:
        (in_filename, out_path_rna_seqc, ref_filename, gtf_filename,
         sample, log_filename) = x
        filelib.assert_exists_nz(out_path_rna_seqc)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Convert each BAM file to FASTA with bam2fastx.

    BAM files are located under in_data.identifier; for each one the
    command `bam2fastx -A --fasta -o <out_path>/<root>.fa <bam>` is
    built (root is the middle element returned by mlib.splitpath) and
    all commands are handed to parallel.pshell with
    max_procs=num_cores.

    Every expected .fa file is then checked with
    filelib.assert_exists_nz_many.

    Returns a metadata dict with the keys "tool", "commands" and
    "num_cores".
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    bam_filenames = mlib.find_bam_files(in_data.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bam2fastx (unknown version)"

    # An earlier version worked on space-free temporary copies of the
    # BAM files because bam2fastx appeared to fail on file names with
    # spaces.  That may not have been bam2fastx's fault; the real
    # paths are now quoted and used directly.
    jobs = []
    for bam_filename in bam_filenames:
        p, f, e = mlib.splitpath(bam_filename)
        fa_filename = os.path.join(out_path, "%s.fa" % f)
        x = filelib.GenericObject(
            bam_filename=bam_filename,
            fa_filename=fa_filename)
        jobs.append(x)

    bam2fastx = mlib.findbin("bam2fastx")

    commands = []
    for j in jobs:
        # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
        x = [
            mlib.sq(bam2fastx),
            "-A", "--fasta",
            "-o", mlib.sq(j.fa_filename),
            mlib.sq(j.bam_filename),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Check every job's output.  (Previously the comprehension reused
    # the stale loop variable and only verified the last job's file.)
    x = [j.fa_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Sort each BAM file with `samtools sort` into out_path.

    BAM files are located under in_data.identifier.  Each is sorted
    to a file of the same name inside out_path, using "temp_<name>"
    as the samtools temporary-file prefix and "-m 4G".  The process
    count nc comes from module_utils.calc_max_procs_from_ram(4,
    upper_max=num_cores); when nc / len(jobs) exceeds 1 that quotient
    is passed to samtools as "-@".

    Asserts that at least one BAM file was found; outputs are checked
    with filelib.assert_exists_nz_many.

    Returns a metadata dict with the keys "tool", "commands" and
    "num_cores" (the latter holds nc, not the num_cores argument).
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    in_filenames = module_utils.find_bam_files(in_data.identifier)
    assert in_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {"tool": "samtools %s" % alignlib.get_samtools_version()}

    # One record per BAM file: input, temp prefix and destination.
    jobs = []
    for bam_path in in_filenames:
        bam_name = os.path.basename(bam_path)
        job = filelib.GenericObject(
            in_filename=bam_path,
            temp_prefix="temp_%s" % bam_name,
            out_filename=os.path.join(out_path, bam_name))
        jobs.append(job)

    samtools = filelib.which_assert(config.samtools)

    # Spread the available processes over the jobs to get the number
    # of threads each samtools process may use.
    nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
    num_threads = max(nc / len(jobs), 1)

    # Current samtools usage: the output is named with -o.  (The old
    # "samtools sort <in> <out_filestem>" form is no longer valid.)
    # Without -m, takes ~1 Gb per process.
    sq = parallel.quote
    commands = []
    for job in jobs:
        argv = [
            sq(samtools), "sort",
            "-O", "bam",
            "-T", sq(job.temp_prefix),
            "-m", "4G",  # Crashing, so try increasing memory.
            sq(job.in_filename),
            "-o", sq(job.out_filename),
            ]
        if num_threads > 1:
            argv.extend(["-@", num_threads])
        commands.append(" ".join([str(arg) for arg in argv]))
    metadata["commands"] = commands
    metadata["num_cores"] = nc
    parallel.pshell(commands, max_procs=nc)

    # Make sure the analysis completed successfully.
    sorted_files = [job.out_filename for job in jobs]
    filelib.assert_exists_nz_many(sorted_files)

    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Call somatic variants with MuTect for each normal/cancer pair.

    antecedents is unpacked as (bam_node, nc_node, ref_node,
    interval_node).  The normal/cancer pairing is read from
    nc_node.identifier; the intervals file must have one of the
    extensions .bed, .list, .picard, .interval_list or .intervals.
    The user options "mutect_cosmic_vcf" and "mutect_dbsnp_vcf" are
    required.

    For each pair, MuTect writes <sample>.call_stats.out,
    <sample>.coverage.wig.txt and <sample>.vcf.raw into out_path, with
    combined stdout/stderr in <sample>.log, where <sample> is the root
    of the cancer BAM file name.  The logs are scanned for lines
    starting with "##### ERROR" before the raw VCF files are checked,
    and each raw VCF is then passed through alignlib.clean_mutect_vcf
    to produce <sample>.vcf.

    Returns a metadata dict with the keys "num_cores" and "commands".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out MuTect version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #          cancer_bamfile, call_outfile, cov_outfile,
    #          raw_vcf_outfile, vcf_outfile, log_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        call_outfile = opj(out_path, "%s.call_stats.out" % sample)
        cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile
        jobs.append(x)

    # java -Xmx2g -jar muTect.jar
    #   --analysis_type MuTect
    #   --reference_sequence <reference>
    #   --cosmic <cosmic.vcf>
    #   --dbsnp <dbsnp.vcf>
    #   --intervals <intervals_to_process>
    #   --input_file:normal <normal.bam>
    #   --input_file:tumor <tumor.bam>
    #   --out <call_stats.out>
    #   --coverage_file <coverage.wig.txt>

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x

        # These option names contain ":" and so cannot be passed as
        # Python keyword arguments.
        UNHASHABLE = [
            ("input_file:normal", sq(normal_bamfile)),
            ("input_file:tumor", sq(cancer_bamfile)),
            ]
        x = alignlib.make_MuTect_command(
            analysis_type="MuTect",
            reference_sequence=sq(ref.fasta_file_full),
            cosmic=sq(cosmic_file),
            dbsnp=sq(dbsnp_file),
            intervals=sq(interval_node.identifier),
            out=sq(call_outfile),
            coverage_file=sq(cov_outfile),
            vcf=sq(raw_vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        # Quote the log file like every other path in the command so a
        # sample name with spaces does not break the redirect.
        x = "%s >& %s" % (x, sq(log_outfile))
        commands.append(x)
    assert len(commands) == len(jobs)
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        # Pull out the error lines.
        with open(log_outfile) as handle:
            x = [line for line in handle if line.startswith("##### ERROR")]
        x = "".join(x)
        msg = "MuTect error [%s]:\n%s\n%s" % (
            cancer_sample, commands[i], x)
        assert not x, msg

    # Make sure the raw VCF files (element 6 of each job) exist.
    x = [x[6] for x in jobs]
    filelib.assert_exists_many(x)

    # Fix the files.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        alignlib.clean_mutect_vcf(
            normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
            raw_vcf_outfile, vcf_outfile)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run `samtools mpileup` on each BAM file at the given positions.

    antecedents is unpacked as (bam_node, ref_node, pos_node).  The
    two-column (chrom, pos) file at pos_node.identifier is rewritten
    to "positions.txt" in the current directory with 1 added to every
    position, and that file is passed to samtools with -l.  For each
    BAM file the pileup is written to <out_path>/<sample>.pileup and
    stderr to <out_path>/<sample>.log.

    out_attributes must contain "vartype" with one of "all", "snp",
    "indel" or "consensus"; the value is validated but does not alter
    the command.  Output files are only required to exist (they may
    be empty), and each log is passed to check_log_file.

    Returns a metadata dict with the key "commands".
    """
    import os
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils

    bam_node, ref_node, pos_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Positions file has 0-based coordinates (like BAM files).
    # But samtools requires 1-based coordinates.  Convert to
    # 1-based coordinates.
    positions_filename = "positions.txt"
    # The with-block closes the handle even if a row is malformed.
    with open(positions_filename, 'w') as outhandle:
        for x in filelib.read_cols(pos_node.identifier):
            assert len(x) == 2
            chrom, pos = x
            pos = int(pos) + 1  # convert from 0- to 1-based coords.
            outhandle.write("%s\t%d\n" % (chrom, pos))

    # list of (in_filename, err_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        x = filelib.GenericObject(
            in_filename=in_filename,
            err_filename=err_filename,
            out_filename=out_filename)
        jobs.append(x)

    # Validate the requested variant type.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel", "consensus"]

    # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
    #   $i > $j"
    samtools = filelib.which_assert(config.samtools)

    # Get an error if the BAM files are not indexed.
    # [W::bam_hdr_read] EOF marker is absent. The input is probably
    #   truncated.

    args = [
        "-R",  # Ignore read group tags.
        "-B",  # Disable BAQ (base quality) computation.
        "-q", 0,  # Skip bases with mapQ smaller than this.
        "-Q", 0,  # Skip bases with BAQ smaller than this.
        "-d10000000",  # Allow deep reads.
        ]

    sq = parallel.quote
    commands = []
    for j in jobs:
        x = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            "-l", sq(positions_filename),
            ]
        x.extend(args)
        x.append(sq(j.in_filename))
        x = " ".join(map(str, x))
        # Quote the redirect targets too; sample names (and hence
        # these paths) may contain spaces.
        x = "%s 2> %s 1> %s" % (x, sq(j.err_filename), sq(j.out_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands

    # File may be empty if there are no reads.
    x = [j.out_filename for j in jobs]
    filelib.assert_exists_many(x)

    # Make sure there's no errors in the log files.
    for j in jobs:
        check_log_file(j.err_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib #import call_variants_GATK bam_node, ref_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # Figure out whether the user wants SNPs or INDELs. #assert "vartype" in out_attributes #vartype = out_attributes["vartype"] #assert vartype in ["all", "snp", "indel"] # Platypus generates an error if there are spaces in the BAM # filename. Symlink the file to a local directory to make # sure there are no spaces. bam_path = "bam" jobs = [] # list of filelib.GenericObject for bam_filename in bam_filenames: p, f = os.path.split(bam_filename) sample, ext = os.path.splitext(f) bai_filename = "%s.bai" % bam_filename filelib.assert_exists_nz(bai_filename) x = sample.replace(" ", "_") local_bam = os.path.join(bam_path, "%s.bam" % x) local_bai = os.path.join(bam_path, "%s.bam.bai" % x) log_filename = os.path.join(out_path, "%s.log" % sample) err_filename = os.path.join(out_path, "%s.err" % sample) # Unfiltered file. #raw_filename = os.path.join(out_path, "%s.raw" % sample) # Final VCF file. 
out_filename = os.path.join(out_path, "%s.vcf" % sample) x = filelib.GenericObject(bam_filename=bam_filename, bai_filename=bai_filename, local_bam=local_bam, local_bai=local_bai, log_filename=log_filename, err_filename=err_filename, out_filename=out_filename) jobs.append(x) filelib.safe_mkdir(bam_path) for j in jobs: assert " " not in j.local_bam filelib.assert_exists_nz(j.bam_filename) filelib.assert_exists_nz(j.bai_filename) if not os.path.exists(j.local_bam): os.symlink(j.bam_filename, j.local_bam) if not os.path.exists(j.local_bai): os.symlink(j.bai_filename, j.local_bai) # TODO: Keep better track of the metadata. buffer_size = 100000 max_reads = 5E6 # Running into errors sometimes, so increase these numbers. # WARNING - Too many reads (5000000) in region # 1:500000-600000. Quitting now. Either reduce --bufferSize or # increase --maxReads. buffer_size = buffer_size * 10 max_reads = max_reads * 10 # Make a list of commands. commands = [] for j in jobs: #nc = max(1, num_cores/len(jobs)) x = alignlib.make_platypus_command(bam_file=j.local_bam, ref_file=ref.fasta_file_full, log_file=j.log_filename, out_file=j.out_filename, buffer_size=buffer_size, max_reads=max_reads) x = "%s >& %s" % (x, j.err_filename) commands.append(x) #for x in commands: # print x #import sys; sys.exit(0) parallel.pshell(commands, max_procs=num_cores) # Make sure the analysis completed successfully. If not, try # to diagnose. for j in jobs: if filelib.exists_nz(j.out_filename): continue for line in open(j.err_filename): if line.find("WARNING - Too many reads") >= 0: print line, x = [j.out_filename for j in jobs] filelib.assert_exists_nz_many(x) # Filter each of the VCF files. #for j in jobs: # call_variants_GATK.filter_by_vartype( # vartype, j.raw_filename, j.out_filename) #metadata["filter"] = vartype return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    out_path):
    """Add or replace read groups in each BAM file with Picard.

    BAM files are located under in_data.identifier.  Each is run
    through Picard AddOrReplaceReadGroups with fixed ID, LB, PU and PL
    values and SM set to hashlib.hash_var(<file stem>).  The output
    BAM keeps the input's file name inside out_path; combined
    stdout/stderr goes to <out_path>/<stem>.log.

    Asserts that at least one BAM file was found.  Every output is
    checked with filelib.assert_exists_nz_many and every log is passed
    to check_log_file.

    Returns a metadata dict with the keys "commands" and "num_cores".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils

    bam_filenames = module_utils.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        sample = hashlib.hash_var(s)
        log_filename = os.path.join(out_path, "%s.log" % s)
        out_filename = os.path.join(out_path, f)
        x = filelib.GenericObject(
            in_filename=in_filename,
            sample=sample,
            log_filename=log_filename,
            out_filename=out_filename)
        jobs.append(x)

    gid = "group1"
    library = "library"
    platform_unit = "platform"
    platform = "illumina"

    # java -Xmx5g -jar AddOrReplaceReadGroups.jar
    #   I=<input.sam or .bam> O=<output.bam> ID=<group ID>
    #   LB=<group library> PU=<platform unit> SM=<group sample name>
    #   PL=<platform> CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT
    picard_jar = alignlib.find_picard_jar("picard")

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for j in jobs:
        x = [
            "java", "-Xmx5g",
            "-jar", sq(picard_jar),
            "AddOrReplaceReadGroups",
            "I=%s" % sq(j.in_filename),
            "O=%s" % sq(j.out_filename),
            "ID=%s" % gid,
            "LB=%s" % library,
            "PU=%s" % platform_unit,
            "SM=%s" % j.sample,
            "PL=%s" % platform,
            #"CREATE_INDEX=true",
            "VALIDATION_STRINGENCY=LENIENT",
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(j.log_filename))
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # Make sure the analysis completed successfully: every job's
    # output must exist.  (Previously the comprehension reused the
    # stale loop variable and only verified the last job's file.)
    out_filenames = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(out_filenames)

    # Check the log files to make sure there are no error.
    for j in jobs:
        check_log_file(j.log_filename)

    return metadata