def make_peakseq_command(treat_filename, control_filename, outpath,
                         experiment_name, fragment_length, mappability_file):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # pypeakseq.py --experiment_name EXPERIMENT_NAME
    #   --fragment_length FRAGMENT_LENGTH
    #   <mappability_file> <treatment_bam> <control_bam> <outpath>
    pypeakseq = filelib.which_assert(config.pypeakseq)

    assert os.path.exists(treat_filename)
    assert os.path.exists(control_filename)
    assert os.path.exists(mappability_file)
    assert fragment_length > 0 and fragment_length < 100000

    sq = parallel.quote
    cmd = [
        sq(pypeakseq),
        "--experiment_name", experiment_name,
        "--fragment_length", str(fragment_length),
        sq(mappability_file),
        sq(treat_filename),
        sq(control_filename),
        sq(outpath),
        ]
    return " ".join(cmd)
def make_pyspp_command(treat_filename, control_filename, outpath,
                       num_procs=None):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert num_procs is None or (num_procs >= 1 and num_procs < 256)

    # pyspp.py [-j NUM_PROCS] [--fdr_cutoff FDR_CUTOFF]
    #   <treatment_bam> <control_bam> <outpath>
    pyspp = filelib.which_assert(config.pyspp)

    assert os.path.exists(treat_filename)
    assert os.path.exists(control_filename)

    sq = parallel.quote
    cmd = [
        sq(pyspp),
        ]
    if num_procs:
        cmd += ["-j", str(num_procs)]
    cmd += [
        sq(treat_filename),
        sq(control_filename),
        sq(outpath),
        ]
    return " ".join(cmd)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bwa = filelib.which_assert(config.bwa)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    # bwa index <out_stem.fa>
    # Makes files:
    # <out_stem>.fa.amb .ann .bwt .pac .sa
    sq = parallel.quote
    cmd = [
        sq(bwa),
        "index",
        sq(ref.fasta_file_full),
        ]
    parallel.sshell(cmd, path=out_path)

    # Make sure the indexing worked properly.
    EXTENSIONS = [".amb", ".ann", ".bwt", ".pac", ".sa"]
    for ext in EXTENSIONS:
        f = "%s%s" % (ref.fasta_file_full, ext)
        assert filelib.exists_nz(f), "Missing: %s" % f
def make_macs2_command(treat_filename, control_filename=None,
                       genome_size=None, name=None, save_bedgraph_file=False,
                       broad_peak_calling=False, normalize_read_counts=False,
                       paired=False):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert genome_size in ["hs", "mm", "ce", "dm"]

    # Regular peak calling:
    # macs2 callpeak -t sample.bam -c control.bam \
    #   -f [BAM,BAMPE] -g hs -n T_53BP1 -B -q 0.01
    #
    # Broad peak calling:
    # macs2 callpeak --broad -t sample.bam -c control.bam \
    #   -f [BAM,BAMPE] -g hs -n T_53BP1 --broad-cutoff 0.1
    #
    # -n  name.  For saving output files.
    # -w  Save extended fragment pileup at every WIGEXTEND bp in wiggle
    #     file.
    # -B  Save extended fragment pileup at every bp in a bedGraph file.
    #     Much smaller than wiggle file.
    # --broad-cutoff  q-value for merging broad regions.
    # --SPMR          Normalize coverage plot by millions of reads.
    macs2 = filelib.which_assert(config.macs2)

    sq = parallel.quote
    cmd = [
        sq(macs2),
        "callpeak",
        ]
    if broad_peak_calling:
        cmd += ["--broad"]
    if normalize_read_counts:
        cmd += ["--SPMR"]
    cmd += ["-t", sq(treat_filename)]
    if control_filename:
        cmd += ["-c", sq(control_filename)]
    format_ = "BAM"
    if paired:
        format_ = "BAMPE"
    cmd += [
        "-f", format_,
        "-g", genome_size,
        ]
    if name:
        cmd.extend(["-n", sq(name)])
    if save_bedgraph_file:
        cmd.append("-B")
    return " ".join(cmd)
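# A minimal usage sketch (assumed, not part of the pipeline): the helper
# above only builds the shell command string, so the caller runs it, e.g.
# with parallel.sshell, as the run() methods in this package do.  The file
# names and output path here are hypothetical.
def _example_run_macs2(out_path):
    from genomicode import parallel
    cmd = make_macs2_command(
        "treatment.bam", control_filename="control.bam",
        genome_size="hs", name="T_53BP1", save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)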
def make_bedtools_genomecov_command(bam_filename, reference_file,
                                    cov_filename):
    import os
    import config
    import filelib
    import parallel

    # Generates a histogram of the counts for each read depth.
    # bedtools genomecov [OPTIONS] -ibam <align.bam> -g <ref.fa>
    bedtools = filelib.which_assert(config.bedtools)

    assert os.path.exists(bam_filename)
    assert os.path.exists(reference_file)

    sq = parallel.quote
    x = [
        sq(bedtools),
        "genomecov",
        "-ibam", sq(bam_filename),
        "-g", sq(reference_file),
        ">&", sq(cov_filename),
        ]
    return " ".join(x)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    signal_node = in_data
    signal_file = signal_node.identifier
    assert os.path.exists(signal_file)
    slice_matrix = filelib.which_assert(config.slice_matrix)

    sq = parallel.quote
    cmd = [
        sq(slice_matrix),
        "--cpm",
        signal_file,
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
def make_bedtools_coverage_command(bam_filename, features_bed, cov_filename):
    import os
    import config
    import filelib
    import parallel

    # Computes the read coverage over each feature in the BED file.
    # bedtools coverage [OPTIONS] -abam <align.bam> -b <features.bed>
    bedtools = filelib.which_assert(config.bedtools)

    assert os.path.exists(bam_filename)
    assert os.path.exists(features_bed)

    sq = parallel.quote
    x = [
        sq(bedtools),
        "coverage",
        "-abam", sq(bam_filename),
        "-b", sq(features_bed),
        ">&", sq(cov_filename),
        ]
    return " ".join(x)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bowtie2_build = filelib.which_assert(config.bowtie2_build)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    # bowtie2-build <ref.fa> <output_stem>
    # Makes files:
    # <output_stem>.[1234].bt2
    # <output_stem>.rev.[12].bt2
    sq = parallel.quote
    cmd = [
        sq(bowtie2_build),
        sq(ref.fasta_file_full),
        ref.name,
        ]
    parallel.sshell(cmd, path=out_path)

    # Check to make sure index was created successfully.
    f = os.path.join(out_path, "%s.1.bt2" % ref.name)
    assert filelib.exists_nz(f)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import itertools
    from genomicode import config
    from genomicode import parallel
    from genomicode import filelib

    signal_node, annotation_node = antecedents
    signal_filename = signal_node.identifier
    annotation_filename = annotation_node.identifier
    filelib.assert_exists_nz(signal_filename)
    filelib.assert_exists_nz(annotation_filename)
    metadata = {}

    align_matrices = filelib.which_assert(config.align_matrices)

    # Make sure the signal_filename has an ID_REF header.
    header = filelib.read_cols(signal_filename).next()
    assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
        signal_filename

    signal_align_file = "signal.aligned.txt"
    annot_align_file = "annot.aligned.txt"

    # First, align the two files.
    sq = parallel.quote
    cmd = [
        sq(align_matrices),
        "--annot_file", signal_filename,
        "--header", "ID_REF",
        "--annot_file", annotation_filename,
        "--left_join",
        signal_align_file,
        annot_align_file,
        ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
    metadata["command"] = cmd

    # Now merge them.  Take the first column of the expression
    # file (should be ID_REF), the whole annotation file, then the
    # remainder of the expression file.
    signal_handle = filelib.read_cols(signal_align_file)
    annot_handle = filelib.read_cols(annot_align_file)
    outhandle = open(outfile, 'w')
    for x1, x2 in itertools.izip(signal_handle, annot_handle):
        x = [x1[0]] + x2 + x1[1:]
        print >> outhandle, "\t".join(x)
    outhandle.close()

    #cmd = "paste %s %s > %s" % (
    #    annot_align_file, signal_align_file, outfile)
    #shell.single(cmd)

    filelib.assert_exists_nz(outfile)
def make_peakseq_preproc_command(bam_file, out_path):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # samtools view bam11.bam | PeakSeq -preprocess SAM stdin bam12
    samtools = filelib.which_assert(config.samtools)
    peakseq = filelib.which_assert(config.peakseq)

    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        sq(bam_file),
        "|",
        sq(peakseq),
        "-preprocess",
        "SAM",
        "stdin",
        sq(out_path),
        ]
    return " ".join(cmd)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bam_path = in_data.identifier
    assert os.path.exists(bam_path)
    assert os.path.isdir(bam_path)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    # Find all the BAM files.
    bam_filenames = filelib.list_files_in_path(
        bam_path, endswith=".bam", case_insensitive=True)

    jobs = []   # list of in_filename, out_filename
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        assert not os.path.exists(out_filename)
        x = in_filename, out_filename
        jobs.append(x)

    # Symlink the BAM files to the output path.
    for x in jobs:
        in_filename, out_filename = x
        os.symlink(in_filename, out_filename)

    # Index each of the files.
    sq = parallel.quote
    samtools = filelib.which_assert(config.samtools)
    commands = []
    for x in jobs:
        in_filename, out_filename = x
        cmd = [
            sq(samtools),
            "index",
            sq(out_filename),
            ]
        x = " ".join(cmd)
        commands.append(x)
    metadata["commands"] = commands
    parallel.pshell(commands, max_procs=num_cores, path=out_path)

    # TODO: Check for output files.
    return metadata
def get_bedtools_version():
    import re
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    bedtools = filelib.which_assert(config.bedtools)
    x = parallel.sshell("%s --version" % bedtools, ignore_nonzero_exit=True)
    x = x.strip()
    # bedtools v2.23.0
    # Version: 1.2 (using htslib 1.2.1)
    m = re.search(r"v([\w\. ]+)", x)
    assert m, "Missing version string"
    return m.group(1)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    log_filenames = _find_output_logs(in_data.identifier)
    assert log_filenames

    results = {}   # dict of sample -> dictionary of output
    for filename in log_filenames:
        # <path>/<sample>.log
        path, file_ = os.path.split(filename)
        f, e = os.path.splitext(file_)
        assert e == ".log"
        sample = f
        results[sample] = alignlib.parse_bowtie1_output(filename)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    all_samples = sorted(results)
    table = []
    header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
    table.append(header)
    for sample in all_samples:
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        perc_aligned = float(aligned_reads) / total_reads * 100
        x1 = parselib.pretty_int(aligned_reads)
        x2 = parselib.pretty_int(total_reads)
        x3 = "%.2f%%" % perc_aligned
        x = sample, x1, x2, x3
        table.append(x)

    # Write out the table as text file.
    TXT_FILE = "summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(x)
    handle.close()

    txt2xls = filelib.which_assert(config.txt2xls)
    os.system("%s -b %s > %s" % (parallel.quote(txt2xls), TXT_FILE, outfile))
def get_config(name, which_assert_file=False, assert_exists=False,
               quote=False):
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    assert hasattr(config, name), "Not configured for genomicode: %s" % name
    x = getattr(config, name)
    if which_assert_file:
        x = filelib.which_assert(x)
    elif assert_exists:
        filelib.assert_exists(x)
    if quote:
        x = parallel.quote(x)
    return x
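# Assumed usage of get_config (hypothetical, for illustration): look up the
# configured samtools binary, verify that it is an executable file, and get
# back a shell-quoted path.
def _example_get_samtools():
    return get_config("samtools", which_assert_file=True, quote=True)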
def make_peakseq_run_command(config_file):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert os.path.exists(config_file)
    config_file = os.path.realpath(config_file)

    # PeakSeq -peak_select <config_file>
    peakseq = filelib.which_assert(config.peakseq)

    sq = parallel.quote
    cmd = [
        sq(peakseq),
        "-peak_select",
        config_file,
        ]
    return " ".join(cmd)
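# A sketch (assumed workflow, hypothetical file names) of how the PeakSeq
# helpers above could be chained: preprocess a BAM file, then run PeakSeq
# from a configuration file.  The configuration file itself is in PeakSeq's
# own format and is not generated by these helpers.
def _example_run_peakseq(bam_file, preproc_path, config_file):
    from genomicode import parallel
    parallel.sshell(make_peakseq_preproc_command(bam_file, preproc_path))
    parallel.sshell(make_peakseq_run_command(config_file))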
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    in_filename = in_data.identifier
    filelib.assert_exists_nz(in_filename)

    vcftools = filelib.which_assert(config.vcftools)

    # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
    #   --out test32
    # Writes stuff to console.  Should capture in log file.
    # Saves file test32.recode.vcf

    p, f = os.path.split(in_filename)
    s, ext = os.path.splitext(in_filename)
    sample = s

    out_stem = "%s.filtered" % sample
    log_filename = "%s.log" % sample
    # Should create file <out_stem>.recode.vcf
    outfile = "%s.recode.vcf" % out_stem

    sq = parallel.quote
    cmd = [
        sq(vcftools),
        "--vcf", sq(in_filename),
        "--remove-indels",
        "--recode",
        "--recode-INFO-all",
        "--out", out_stem,
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, log_filename)
    parallel.sshell(cmd)
    filelib.assert_exists_nz(outfile)

    shutil.copy2(outfile, out_filename)
def _make_samtools_filter_cmd(in_bamfile, out_bamfile):
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    filelib.assert_exists_nz(in_bamfile)
    samtools = filelib.which_assert(config.samtools)

    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-bF 4",
        sq(in_bamfile),
        ">",
        sq(out_bamfile),
        ]
    cmd = " ".join(cmd)
    return cmd
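# Assumed usage (hypothetical file names): the returned string redirects
# stdout into out_bamfile, so it is run through a shell and the output is
# checked afterwards.
def _example_filter_unaligned(in_bamfile, out_bamfile):
    from genomicode import filelib
    from genomicode import parallel
    cmd = _make_samtools_filter_cmd(in_bamfile, out_bamfile)
    parallel.sshell(cmd)
    filelib.assert_exists_nz(out_bamfile)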
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    samtools = filelib.which_assert(config.samtools)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    ## fa_filenames = module_utils.find_fasta_files(out_path)
    ## # Filter out the FASTA files created by RSEM indexing.
    ## # <assembly>.idx.fa
    ## # <assembly>.n2g.idx.fa
    ## # <assembly>.transcripts.fa
    ## # Could these end with ".fasta"?
    ## x = fa_filenames
    ## x = [x for x in x if not x.endswith(".idx.fa")]
    ## x = [x for x in x if not x.endswith(".n2g.idx.fa")]
    ## x = [x for x in x if not x.endswith(".transcripts.fa")]
    ## fa_filenames = x
    ## assert fa_filenames, "Could not find reference genome."
    ## assert len(fa_filenames) == 1, "Found multiple reference genomes."
    ## reference_filename = fa_filenames[0]

    # samtools faidx <ref>.fa
    # Makes files:
    # <ref>.fa.fai
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "faidx",
        sq(ref.fasta_file_full),
        ]
    parallel.sshell(cmd, path=out_path)

    # Check to make sure index was created successfully.
    f = "%s.fai" % ref.fasta_file_full
    assert filelib.exists_nz(f)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import os
    import subprocess
    from genomicode import config
    from genomicode import filelib

    #out_attributes = set_out_attributes(in_data, out_attributes)
    TCGA_BIN = filelib.which_assert(config.download_tcga)

    assert 'disease' in user_options
    command = [
        'python', TCGA_BIN,
        '--disease', user_options['disease'],
        '--data', out_attributes['preprocess'],
        '--download_only',
        ]
    if 'date' in user_options:
        command += ['--date', user_options['date']]

    # TODO: Need to return results from command.
    #shell.single(command)
    process = subprocess.Popen(
        command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    if error_message:
        raise ValueError(error_message)

    result_files = os.listdir(".")
    result_format = 'tar.gz'
    for result_file in result_files:
        if result_file.endswith(result_format):
            os.rename(result_file, outfile)

    assert filelib.exists_nz(outfile), (
        'the output file %s for download_tcga fails' % outfile)
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    outhandle = open(intervallist_file, 'w')

    # Add the @HD and @SQ headers from the bam file.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-H",
        sq(bam_filename),
        ]
    cmd = " ".join(cmd)
    x = parallel.sshell(cmd)
    lines = x.split("\n")
    lines = [x.rstrip() for x in lines]
    for line in lines:
        if line.startswith("@HD") or line.startswith("@SQ"):
            print >> outhandle, line

    # Add the information from the BED file.
    # BED       chrom chromStart (0-based) chromEnd name score strand
    # Interval  chrom chromStart (1-based) chromEnd strand name
    for cols in filelib.read_cols(features_bed):
        assert len(cols) >= 6
        chrom, chromStart0, chromEnd, name, score, strand = cols[:6]
        chromStart0, chromEnd = int(chromStart0), int(chromEnd)
        chromStart1 = chromStart0 + 1
        x = chrom, chromStart1, chromEnd, strand, name
        print >> outhandle, "\t".join(map(str, x))
    outhandle.close()
def make_macs14_command(treat_filename, control_filename=None,
                        genome_size=None, name=None, shiftsize=None,
                        save_wiggle_file=False, save_single_wiggle_file=False,
                        save_bedgraph_file=False, call_subpeaks=False):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert genome_size in ["hs", "mm", "ce", "dm"]
    if call_subpeaks:
        save_wiggle_file = True
        save_bedgraph_file = False
    if shiftsize:
        assert shiftsize > 0 and shiftsize < 10000

    # macs14 -t Sample_4_T_53BP1.sorted.bam -c Sample_8_T_input.sorted.bam \
    #   -g hs -n T_53BP1 -B -S --call-subpeaks >& T_53BP1.log
    # -n  name.  For saving output files.
    # -w  Save extended fragment pileup at every WIGEXTEND bp in wiggle
    #     file.
    # -B  Save extended fragment pileup at every bp in a bedGraph file.
    #     Much smaller than wiggle file.
    # -S  A single wiggle file will be saved for treatment and input.
    #     i.e. for whole genome, rather than for each chromosome.
    # --call_subpeaks  Use PeakSplitter algorithm to find subpeaks.
    #     -w needs to be on, and -B should be off.
    #
    # If estimated fragment size is too short (e.g. 53), then specify
    # your own fragment size.  shiftsize is 1/2 of fragment size.
    # --nomodel --shiftsize 73   (for fragment size of 146)
    # Often fragment size is 150-200 for ChIP-Seq.
    macs14 = filelib.which_assert(config.macs14)

    sq = parallel.quote
    cmd = [
        sq(macs14),
        "-t", sq(treat_filename),
        ]
    if control_filename:
        cmd += ["-c", sq(control_filename)]
    cmd += [
        "-f", "BAM",
        "-g", genome_size,
        ]
    if name:
        cmd.extend(["-n", sq(name)])
    if shiftsize:
        cmd.extend(["--nomodel", "--shiftsize", str(shiftsize)])
    if save_wiggle_file:
        cmd.append("-w")
    if save_single_wiggle_file:
        cmd.append("-S")
    if save_bedgraph_file:
        cmd.append("-B")
    if call_subpeaks:
        cmd.append("--call_subpeaks")
    return " ".join(cmd)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (treat_sample, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name, genome_size=genome_size,
        shiftsize=shiftsize, save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        cmd = [parallel.quote(Rscript), model_file]
        parallel.sshell(cmd, path=out_path)

    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # For debugging.
    RUN_VARIANT_CALLING = True
    FILTER_CALLS = True
    MERGE_CALLS = True
    FIX_VCF_FILES = True

    dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
    dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
    assert dna_bam_filenames, "No DNA .bam files."
    rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
    assert rna_bam_filenames, "No RNA .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

    ## Make sure the BAM files do not contain spaces in the
    ## filenames.  Radia doesn't work well with spaces.
    #filenames = dna_bam_filenames + rna_bam_filenames
    #has_spaces = []
    #for filename in filenames:
    #    if filename.find(" ") >= 0:
    #        has_spaces.append(filename)
    #x = has_spaces
    #if len(x) > 5:
    #    x = x[:5] + ["..."]
    #x = ", ".join(x)
    #msg = "Radia breaks if there are spaces in filenames: %s" % x
    #assert not has_spaces, msg

    # sample -> bam filename
    dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
    rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
    # Make sure files exist for all the samples.  The DNA-Seq
    # should have both normal and cancer.  RNA is not needed for
    # normal sample.
    mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
    mlib.assert_normal_cancer_samples(
        nc_match, rnasample2bamfile, ignore_normal_sample=True)

    # Make sure Radia and snpEff are configured.
    radia_genome_assembly = mlib.get_user_option(
        user_options, "radia_genome_assembly", not_empty=True)
    assert radia_genome_assembly == "hg19", "Only hg19 handled."
    snp_eff_genome = mlib.get_user_option(
        user_options, "snp_eff_genome", not_empty=True)

    radia_path = mlib.get_config("radia_path", assert_exists=True)
    snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
    radia_files = get_radia_files(radia_path, radia_genome_assembly)

    # Make a list of the chromosomes to use.  Pick an arbitrary
    # BAM file.  Look at only the chromosomes that are present in
    # all files.
    all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
    chroms = list_common_chromosomes(all_bamfiles)
    assert chroms, "No chromosomes found in all files."
    # Only use the chromosomes that can be filtered by Radia.
    chroms = filter_radia_chromosomes(chroms, radia_files)

    # Make output directories.
    radia_outpath = "radia1.tmp"
    filter_outpath = "radia2.tmp"
    merge_outpath = "radia3.tmp"
    if not os.path.exists(radia_outpath):
        os.mkdir(radia_outpath)
    if not os.path.exists(filter_outpath):
        os.mkdir(filter_outpath)
    if not os.path.exists(merge_outpath):
        os.mkdir(merge_outpath)

    # Steps:
    # 1.  Call variants (radia.py)
    #     -o <file.vcf>
    # 2.  Filter variants (filterRadia.py)
    #     <outpath>
    #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
    # 3.  Merge (mergeChroms.py)
    #     Takes as input: <filter_outpath>
    #     Produces: <merge_outpath>/<patient_id>.vcf

    # list of (normal_sample, cancer_sample, chrom,
    #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
    #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
    #   final_vcf_outfile,
    #   radia_logfile, filter_logfile, merge_logfile)
    opj = os.path.join
    jobs = []
    for i, (normal_sample, cancer_sample) in enumerate(nc_match):
        normal_bamfile = dnasample2bamfile[normal_sample]
        dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
        rna_tumor_bamfile = rnasample2bamfile[cancer_sample]
        merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
        merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
        final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
        for chrom in chroms:
            radia_vcf_outfile = opj(
                radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            filter_vcf_outfile = opj(
                filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
            radia_logfile = opj(
                radia_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            filter_logfile = opj(
                filter_outpath, "%s_chr%s.log" % (cancer_sample, chrom))
            x = normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs.append(x)

    # Since Radia doesn't work well if there are spaces in the
    # filenames, symlink these files here to guarantee that there
    # are no spaces.
    normal_path = "normal.bam"
    dna_path = "dna.bam"
    rna_path = "rna.bam"
    if not os.path.exists(normal_path):
        os.mkdir(normal_path)
    if not os.path.exists(dna_path):
        os.mkdir(dna_path)
    if not os.path.exists(rna_path):
        os.mkdir(rna_path)
    for i, x in enumerate(jobs):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
        x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
        x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
        clean_normal, clean_dna, clean_rna = x1, x2, x3
        x = normal_sample, cancer_sample, chrom, \
            clean_normal, clean_dna, clean_rna, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile
        jobs[i] = x

    # Generate the commands for doing variant calling.
    python = mlib.get_config("python", which_assert_file=True)

    # filterRadia.py calls the "blat" command, and there's no way
    # to set the path.  Make sure "blat" is executable.
    if not filelib.which("blat"):
        # Find "blat" in the configuration and add it to the path.
        x = mlib.get_config("blat", which_assert_file=True)
        path, x = os.path.split(x)
        if os.environ["PATH"]:
            path = "%s:%s" % (os.environ["PATH"], path)
        os.environ["PATH"] = path
        # Make sure it's findable now.
        filelib.which_assert("blat")

    # STEP 1.  Call variants with radia.py.
    # python radia.py test31 5 \
    #   -n bam04/PIM001_G.bam \
    #   -t bam04/196B-MG.bam \
    #   -r bam34/196B-MG.bam \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   -o test32.vcf
    #   --dnaTumorMitochon MT \
    #   --rnaTumorMitochon MT \
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.radia_py),
            cancer_sample,
            chrom,
            "-n", sq(normal_bamfile),
            "-t", sq(dna_tumor_bamfile),
            "-r", sq(rna_tumor_bamfile),
            "-f", sq(ref.fasta_file_full),
            "-o", radia_vcf_outfile,
            ]
        if "MT" in chroms:
            x += [
                "--dnaNormalMitochon MT",
                "--dnaTumorMitochon MT",
                "--rnaTumorMitochon MT",
                ]
        x = " ".join(x)
        x = "%s >& %s" % (x, radia_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Only uses ~200 Mb of ram.
    if RUN_VARIANT_CALLING:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure log files are empty.
    logfiles = [x[10] for x in jobs]
    filelib.assert_exists_z_many(logfiles)

    # STEP 2.  Filter variants with filterRadia.py.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        x = [
            sq(python),
            sq(radia_files.filterRadia_py),
            cancer_sample,
            chrom,
            sq(radia_vcf_outfile),
            sq(filter_outpath),
            sq(radia_files.scripts_dir),
            "-b", sq(radia_files.blacklist_dir),
            "-d", sq(radia_files.snp_dir),
            "-r", sq(radia_files.retro_dir),
            "-p", sq(radia_files.pseudo_dir),
            "-c", sq(radia_files.cosmic_dir),
            "-t", sq(radia_files.target_dir),
            "-s", sq(snp_eff_path),
            "-e", snp_eff_genome,
            "--rnaGeneBlckFile", sq(radia_files.rnageneblck_file),
            "--rnaGeneFamilyBlckFile", sq(radia_files.rnagenefamilyblck_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, filter_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Sometimes samtools crashes in the middle of a run.  Detect
    # this case, and re-run the analysis if needed.
    assert len(commands) == len(jobs)
    py_commands = []
    for x, cmd in zip(jobs, commands):
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = cmd, cancer_sample, chrom, filter_logfile
        x = _run_filterRadia_with_restart, args, {}
        py_commands.append(x)
    # Takes ~10 Gb each.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    if FILTER_CALLS:
        parallel.pyfun(py_commands, num_procs=nc)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[11] for x in jobs]
    filelib.assert_exists_z_many(logfiles)
    # Make sure filter_vcf_outfile exists.
    outfiles = [x[7] for x in jobs]
    filelib.assert_exists_nz_many(outfiles)

    # STEP 3.  Merge the results.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x

        # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
        #   radia2.tmp/ radia3.tmp
        # The "/" after radia2.tmp is important.  If not given,
        # will generate some files with only newlines.
        fo = filter_outpath
        if not fo.endswith("/"):
            fo = "%s/" % fo
        x = [
            sq(python),
            sq(radia_files.mergeChroms_py),
            cancer_sample,
            fo,
            merge_outpath,
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, merge_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Since the chromosomes were separated for the previous steps,
    # this will generate one merge for each chromosome.  This is
    # unnecessary, since we only need to merge once per sample.
    # Get rid of duplicates.
    commands = sorted({}.fromkeys(commands))
    if MERGE_CALLS:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [x[12] for x in jobs]
    logfiles = sorted({}.fromkeys(logfiles))
    filelib.assert_exists_z_many(logfiles)

    # Fix the VCF files.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, chrom, \
            normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
            radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
            final_vcf_outfile, \
            radia_logfile, filter_logfile, merge_logfile = x
        args = normal_sample, cancer_sample, \
            merge_vcf_outfile, final_vcf_outfile
        x = alignlib.clean_radia_vcf, args, {}
        commands.append(x)
    if FIX_VCF_FILES:
        parallel.pyfun(commands, num_procs=num_cores)

    # Make sure output VCF files exist.
    x = [x[9] for x in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    ## Importing pysam is hard!
    #import sys
    #sys_path_old = sys.path[:]
    #sys.path = [x for x in sys.path if x.find("RSeQC") < 0]
    #import pysam
    #sys.path = sys_path_old

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # list of (in_filename, log_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % s)
        out_filename = os.path.join(out_path, f)
        assert in_filename != out_filename
        x = in_filename, log_filename, out_filename
        jobs.append(x)

    # Don't do this.  Need MD, NM, NH in
    # summarize_alignment_cigar.  To be sure, just redo it.
    ## If the files already have MD tags, then just symlink the
    ## files.  Don't add again.
    #i = 0
    #while i < len(jobs):
    #    in_filename, out_filename = jobs[i]
    #
    #    handle = pysam.AlignmentFile(in_filename, "rb")
    #    align = handle.next()
    #    tag_dict = dict(align.tags)
    #    if "MD" not in tag_dict:
    #        i += 1
    #        continue
    #    # Has MD tags.  Just symlink and continue.
    #    os.symlink(in_filename, out_filename)
    #    del jobs[i]

    # Make a list of samtools commands.
    # Takes ~200 Mb per process, so should not be a big issue.
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    commands = []
    for x in jobs:
        in_filename, log_filename, out_filename = x
        # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>
        # May generate error:
        # [bam_fillmd1] different NM for read
        #   'ST-J00106:118:H75L3BBXX:3:2128:21846:47014': 0 -> 19
        # Pipe stderr to different file.
        x = [
            samtools,
            "calmd",
            "-b",
            sq(in_filename),
            sq(ref.fasta_file_full),
            ]
        x = " ".join(x)
        x = "%s 2> %s 1> %s" % (x, sq(log_filename), sq(out_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    align_node = in_data
    x = filelib.list_files_in_path(
        align_node.identifier, endswith="align_summary.txt")
    align_filenames = x
    assert align_filenames, "Missing align_summary.txt"

    results = {}   # dict of sample -> dictionary of output
    for filename in align_filenames:
        # Names must be in the format:
        # <path>/<sample>.tophat/align_summary.txt
        # full_path   <path>/<sample>.tophat
        # path        <path>
        # tophat_dir  <sample>.tophat
        # file_       align_summary.txt
        # sample      <sample>
        full_path, file_ = os.path.split(filename)
        path, tophat_dir = os.path.split(full_path)
        assert file_ == "align_summary.txt"
        assert tophat_dir.endswith(".tophat")
        sample = tophat_dir[:-7]
        x = alignlib.parse_tophat_align_summary(filename)
        results[sample] = x

    # Make table where the rows are the samples and the columns
    # are the statistics.
    all_samples = sorted(results)
    table = []
    header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
    table.append(header)
    for sample in all_samples:
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        perc_aligned = float(aligned_reads) / total_reads * 100
        x1 = parselib.pretty_int(aligned_reads)
        x2 = parselib.pretty_int(total_reads)
        x3 = "%.2f%%" % perc_aligned
        x = sample, x1, x2, x3
        table.append(x)

    # Write out the table as text file.
    TXT_FILE = "summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(x)
    handle.close()

    txt2xls = filelib.which_assert(config.txt2xls)
    os.system("%s -b %s > %s" % (parallel.quote(txt2xls), TXT_FILE, outfile))
def which(bin_name):
    from genomicode import filelib
    return filelib.which_assert(bin_name)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
    #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA" \
    #   -t <gtf_file> >& <log_filename>
    # <out_path>        Output directory.  Will be created if not exists.
    # <in_filename>     BAM file
    # <reference_file>  /data/biocore/genomes/UCSC/mm10.fa
    # <gtf_file>        /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
    #
    # <reference_file> must be indexed and have a dict file.
    rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

    GTF = module_utils.get_user_option(
        user_options, "rna_seqc_gtf_file", not_empty=True)
    assert os.path.exists(GTF), "File not found: %s" % GTF

    # list of infile, out_path, ref_file, gtf_file, sample, log_file
    jobs = []
    for in_filename in bam_filenames:
        p, file_ = os.path.split(in_filename)
        f, e = os.path.splitext(file_)
        sample = hashlib.hash_var(f)
        out_path_rna_seqc = os.path.join(out_path, sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
            sample, log_filename
        jobs.append(x)

    sq = parallel.quote
    commands = []
    for x in jobs:
        (in_filename, out_path_rna_seqc, ref_filename, gtf_filename,
         sample, log_filename) = x

        x = [sample, in_filename, "NA"]
        x = "|".join(x)
        x = [
            'java',
            '-jar', rna_seqc_jar,
            '-o', sq(out_path_rna_seqc),
            '-r', sq(ref_filename),
            '-s', "'%s'" % x,
            '-t', gtf_filename,
            ]
        x = " ".join(x)
        cmd = "%s >& %s" % (x, log_filename)
        commands.append(cmd)

    # Gets lots of errors.
    x = parallel.pshell(commands, max_procs=num_cores)
    run_log = os.path.join(out_path, "run.log")
    open(run_log, 'w').write(x)

    # Check for outfile.
    # Make sure the analysis completed successfully.
    for x in jobs:
        (in_filename, out_path_rna_seqc, ref_filename, gtf_filename,
         sample, log_filename) = x
        filelib.assert_exists_nz(out_path_rna_seqc)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    #from genomicode import hashlib
    from Betsy import module_utils

    in_filenames = module_utils.find_bam_files(in_data.identifier)
    assert in_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

    jobs = []
    #seen = {}
    for i, in_filename in enumerate(in_filenames):
        p, f = os.path.split(in_filename)
        temp_prefix = "temp_%s" % f
        #temp_prefix = "temp_%s" % hashlib.hash_var(f)
        # Make sure no duplicates.
        #assert temp_prefix not in seen
        #seen[temp_prefix] = 1
        #temp_outfilename = "%d.bam" % i
        out_filename = os.path.join(out_path, f)
        x = filelib.GenericObject(
            in_filename=in_filename,
            temp_prefix=temp_prefix,
            #temp_outfilename=temp_outfilename,
            out_filename=out_filename)
        jobs.append(x)

    samtools = filelib.which_assert(config.samtools)

    # Calculate the number of threads per process.
    nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
    num_threads = max(nc / len(jobs), 1)

    # Make a list of samtools commands.
    # Without -m, takes ~1 Gb per process.
    sq = parallel.quote
    commands = []
    for j in jobs:
        # Usage has changed.  Below no longer valid.
        # samtools sort <in_filename> <out_filestem>
        # .bam automatically added to <out_filestem>, so don't
        # need it.
        #x = out_filename
        #assert x.endswith(".bam")
        #x = x[:-4]
        #out_filestem = x

        x = [
            sq(samtools),
            "sort",
            "-O", "bam",
            "-T", sq(j.temp_prefix),
            "-m", "4G",   # Crashing, so try increasing memory.
            sq(j.in_filename),
            #"-o", sq(j.temp_outfilename),
            "-o", sq(j.out_filename),
            ]
        if num_threads > 1:
            x += ["-@", num_threads]
        x = " ".join(map(str, x))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = nc
    parallel.pshell(commands, max_procs=nc)
    #for cmd in commands:
    #    parallel.sshell(cmd)

    #for j in jobs:
    #    # Move the temporary files to the final location.
    #    shutil.move(j.temp_outfilename, j.out_filename)

    # Make sure the analysis completed successfully.
    x = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(x)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import filelib
    from Betsy import module_utils

    bam_node, ref_node, pos_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Positions file has 0-based coordinates (like BAM files).
    # But samtools requires 1-based coordinates.  Convert to
    # 1-based coordinates.
    positions_filename = "positions.txt"
    outhandle = open(positions_filename, 'w')
    for x in filelib.read_cols(pos_node.identifier):
        assert len(x) == 2
        chrom, pos = x
        pos = int(pos) + 1   # convert from 0- to 1-based coords.
        x = chrom, pos
        print >> outhandle, "\t".join(map(str, x))
    outhandle.close()

    # list of (in_filename, err_filename, out_filename)
    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)
        err_filename = os.path.join(out_path, "%s.log" % sample)
        out_filename = os.path.join(out_path, "%s.pileup" % sample)
        x = filelib.GenericObject(
            in_filename=in_filename,
            err_filename=err_filename,
            out_filename=out_filename)
        jobs.append(x)

    ## Get possible positions file.
    #positions_filename = module_utils.get_user_option(
    #    user_options, "positions_file", check_file=True)

    # Figure out whether the purpose is to get coverage.  Change
    # the parameters if it is.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel", "consensus"]
    #if cov == "yes":
    #    assert positions_filename, "Missing: positions_file"

    # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
    #   $i > $j
    samtools = filelib.which_assert(config.samtools)

    # Get an error if the BAM files are not indexed.
    # [W::bam_hdr_read] EOF marker is absent. The input is probably
    #   truncated.

    #if vartype == "consensus":
    #    args = [
    #        "-R",          # Ignore read group tags.
    #        "-B",          # Disable BAQ (base quality) computation.
    #        "-q", 0,       # Skip bases with mapQ smaller than this.
    #        "-Q", 0,       # Skip bases with BAQ smaller than this.
    #        "-d10000000",  # Allow deep reads.
    #        ]
    #else:
    #    raise NotImplementedError
    args = [
        "-R",          # Ignore read group tags.
        "-B",          # Disable BAQ (base quality) computation.
        "-q", 0,       # Skip bases with mapQ smaller than this.
        "-Q", 0,       # Skip bases with BAQ smaller than this.
        "-d10000000",  # Allow deep reads.
        ]

    sq = parallel.quote
    commands = []
    for j in jobs:
        x = [
            sq(samtools),
            "mpileup",
            "-f", sq(ref.fasta_file_full),
            ]
        if positions_filename:
            x.extend(["-l", positions_filename])
        x.extend(args)
        x.append(sq(j.in_filename))
        x = " ".join(map(str, x))
        x = "%s 2> %s 1> %s" % (x, j.err_filename, j.out_filename)
        commands.append(x)
    #for x in commands:
    #    print x
    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands

    # File may be empty if there are no reads.
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_many(x)

    # Make sure there's no errors in the log files.
    for j in jobs:
        check_log_file(j.err_filename)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import config
    from Betsy import module_utils as mlib

    mpileup_node = in_data
    mpileup_filenames = filelib.list_files_in_path(
        mpileup_node.identifier, endswith=".pileup")
    assert mpileup_filenames, "No .pileup files."
    #nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    #ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # Figure out whether the purpose is to get coverage.  Change
    # the parameters if it is.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["snp", "indel"]
    tool = "mpileup2snp"
    if vartype == "indel":
        tool = "mpileup2indel"

    # list of (sample, in_filename, tmp1_filename, tmp2_filename,
    #   out_filename)
    jobs = []
    for in_filename in mpileup_filenames:
        p, sample, ext = mlib.splitpath(in_filename)
        tmp1_filename = os.path.join(out_path, "%s.tmp1" % sample)
        tmp2_filename = os.path.join(out_path, "%s.tmp2" % sample)
        out_filename = os.path.join(out_path, "%s.vcf" % sample)
        x = sample, in_filename, tmp1_filename, tmp2_filename, out_filename
        jobs.append(x)

    # VarScan will generate a "Parsing Exception" if there are 0
    # reads in a location.  Filter those out.
    sq = parallel.quote
    commands = []
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        x = "awk -F'\t' '$4 != 0 {print}' %s > %s" % (
            in_filename, tmp1_filename)
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores)
    x = [x[2] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # java -jar /usr/local/bin/VarScan.jar <tool> $i --output_vcf 1 > $j
    varscan = filelib.which_assert(config.varscan_jar)

    # Make a list of commands.
    commands = []
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        x = [
            "java", "-jar", sq(varscan),
            tool,
            tmp1_filename,
            "--p-value", 0.05,
            "--output-vcf", 1,
            ]
        x = " ".join(map(str, x))
        x = "%s >& %s" % (x, tmp2_filename)
        commands.append(x)
    #for x in commands:
    #    print x
    #import sys; sys.exit(0)
    parallel.pshell(commands, max_procs=num_cores)
    x = [x[3] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Clean up the VCF files.  VarScan leaves extraneous lines
    # there.
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        alignlib.clean_varscan_vcf(sample, tmp2_filename, out_filename)
    x = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(x)

    # The tmp files are really big.  Don't save those.
    for x in jobs:
        sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
        filelib.safe_unlink(tmp1_filename)
        filelib.safe_unlink(tmp2_filename)