def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Filter the rows of a tab-delimited variant table.

    Rows are read from in_data.identifier.  A row is dropped when its
    Sample is listed in the comma-separated "remove_samples" option or,
    if "apply_filter" is "yes", when the matching vcflib caller's
    is_pass() rejects its Filter value.  The header and the surviving
    rows are written to a temporary file, which is then moved to
    out_filename.

    network, out_attributes and num_cores are not used here.

    Returns an (empty) metadata dictionary.  Raises AssertionError when
    a row names a Caller that is not in vcflib.CALLERS.
    """
    import shutil
    from genomicode import filelib
    from genomicode import vcflib
    from Betsy import module_utils as mlib

    TEMPFILE = "temp.txt"

    simple_file = in_data.identifier
    metadata = {}

    x = mlib.get_user_option(user_options, "remove_samples")
    # A set makes the per-row membership test constant time.
    remove_samples = set(s.strip() for s in x.split(","))
    x = mlib.get_user_option(
        user_options, "apply_filter", allowed_values=["no", "yes"])
    apply_filter = (x == "yes")
    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])

    name2caller = {}  # name -> Caller object
    for caller_class in vcflib.CALLERS:
        caller = caller_class()
        assert caller.name not in name2caller
        name2caller[caller.name] = caller

    # The context manager closes the temporary file even when one of the
    # checks below fails part way through the table.
    with open(TEMPFILE, 'w') as handle:
        it = filelib.read_row(simple_file, header=1)
        handle.write("\t".join(it._header) + "\n")
        for d in it:
            # Find the caller.
            assert d.Caller in name2caller, "Unknown caller: %s" % d.Caller
            caller = name2caller[d.Caller]

            # remove_sample
            if d.Sample in remove_samples:
                continue

            # apply_filter
            if apply_filter:
                # MuSE's is_pass() is additionally given wgs_or_wes.
                args = (d.Filter,)
                if d.Caller == "MuSE":
                    args = d.Filter, wgs_or_wes
                if not caller.is_pass(*args):
                    continue
            handle.write("\t".join(d._cols) + "\n")

    shutil.move(TEMPFILE, out_filename)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run the make_pyspp_command pipeline on a treatment/control BAM pair.

    antecedents is a (bam_node, group_node) pair.  The "treatment_sample"
    and "control_sample" options are both required; each must be listed
    in the sample group file and have a BAM file under the BAM path.  The
    command is run in out_path with stdout and stderr redirected to
    "<experiment_name>.log", after which the expected output files are
    checked to exist and be non-empty.

    network and out_attributes are not used here.  Nothing is returned.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)

    # The experiment name joins both sample names after they have been
    # passed through hashlib.hash_var.
    experiment_name = "%s_vs_%s" % (
        hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

    # Both samples have to be listed in the sample group file.
    known_samples = [group[1] for group in sample_groups]
    for sample in (treat_sample, control_sample):
        assert sample in known_samples, "Unknown sample: %s" % sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    log_file = "%s.log" % experiment_name
    spp_cmd = make_pyspp_command(
        treat_filename, control_filename, out_path, num_procs=num_cores)
    parallel.sshell("%s >& %s" % (spp_cmd, log_file), path=out_path)

    # "broadPeak" and "narrowPeak" are deliberately not checked;
    # narrowPeak might be empty if no peaks are found.
    expected_files = [
        "binding.positions.txt",
        "crosscorrelation.pdf",
        "density.wig",
        "enrichment.estimates.wig",
        "enrichment.wig",
        log_file,
    ]
    filelib.assert_exists_nz_many(
        [os.path.join(out_path, name) for name in expected_files])
def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Cluster a matrix with k-means and copy the results into out_path.

    The required integer option "kmeans_k" must satisfy 2 <= k < 100.
    Clustering is delegated to cluster_genes_by_hierarchical.run_cluster30
    with the "kmeans" algorithm.  The "cdt" result is always copied to
    signal.cdt; the "kag" and "kgg" results are copied to
    array_cluster.kag and gene_cluster.kgg when they were produced.

    network, out_attributes and num_cores are not used here.

    Returns a metadata dictionary holding the clustering command under
    "command".
    """
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils as mlib
    import cluster_genes_by_hierarchical as clust

    filelib.safe_mkdir(out_path)
    metadata = {}

    kmeans_k = mlib.get_user_option(
        user_options, "kmeans_k", not_empty=True, type=int)
    assert 2 <= kmeans_k < 100

    cmd, cluster_files = clust.run_cluster30(
        in_data.identifier, "kmeans", user_options, kmeans_k=kmeans_k)
    metadata["command"] = cmd

    # The clustered matrix is mandatory; the cluster assignment files
    # only exist for the dimensions that were clustered.
    assert "cdt" in cluster_files
    shutil.copy2(cluster_files["cdt"], os.path.join(out_path, "signal.cdt"))
    optional_outputs = [
        ("kag", "array_cluster.kag"),
        ("kgg", "gene_cluster.kgg"),
    ]
    for key, out_name in optional_outputs:
        if key in cluster_files:
            shutil.copy2(cluster_files[key], os.path.join(out_path, out_name))

    return metadata
def relabel(data_file, rename_file, outfile, user_options):
    """Relabel the column IDs of data_file with the slice_matrix script.

    The required option "sample_labels_header" names a column of the
    tab-delimited rename_file; it must appear in that file's first line.
    The configured slice_matrix script is run with
    --relabel_col_ids '<rename_file>,<header>' on data_file, and both
    stdout and stderr are redirected to outfile.

    Returns the shell command that was run.  Raises AssertionError when
    the header is missing from rename_file; outfile is checked with
    filelib.assert_exists_nz after the command has run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)

    # Make sure sample_header is in the rename file.  Only the first line
    # is needed; the context manager closes the file straight away.
    with open(rename_file) as handle:
        first_line = handle.readline()
    columns = first_line.rstrip("\r\n").split("\t")
    assert sample_header in columns, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
    ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
def run_cluster30(filename, algorithm, user_options, **more_args):
    """Cluster a matrix file through cluster30 and find the output files.

    filename is read with arrayio, converted to PCL format and written to
    "data.pcl" in the current directory, because Cluster 3.0 might
    misinterpret the columns of a plain tab-delimited file.  The required
    options "cluster_genes", "cluster_arrays" ("yes"/"no") and
    "distance_measure" (a key of cluster30.DIST2ID) configure the call to
    cluster30.cluster30_file; algorithm and any more_args are passed
    through to it.

    Returns (cmd, cluster_files): the value returned by
    cluster30.cluster30_file and the mapping returned by
    cluster30._find_cluster_files for the job.  Raises AssertionError if
    that mapping has no "cdt" entry.
    """
    import arrayio
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    MATRIX_FILE = "data.pcl"
    DISTANCE_MEASURES = cluster30.DIST2ID.keys()
    YESNO = ["yes", "no"]

    cluster_genes = mlib.get_user_option(
        user_options, "cluster_genes", not_empty=True, allowed_values=YESNO)
    cluster_arrays = mlib.get_user_option(
        user_options, "cluster_arrays", not_empty=True, allowed_values=YESNO)
    distance_metric = mlib.get_user_option(
        user_options, "distance_measure", not_empty=True,
        allowed_values=DISTANCE_MEASURES)

    # Make a PCL-formatted file for cluster 3.0.  It might
    # misinterpret the columns of a tab-delimited file.
    matrix = arrayio.read(filename)
    matrix = arrayio.convert(matrix, to_format=arrayio.pcl_format)
    # Close the file explicitly so that its contents are flushed to disk
    # before the file is handed to cluster30 by name.
    with open(MATRIX_FILE, 'w') as handle:
        arrayio.write(matrix, handle)

    jobname = "cluster"
    cmd = cluster30.cluster30_file(
        MATRIX_FILE, (cluster_genes == "yes"), (cluster_arrays == "yes"),
        algorithm, distance=distance_metric, jobname=jobname, **more_args)

    # Find the output files and name them appropriately.
    cluster_files = cluster30._find_cluster_files(jobname)
    assert "cdt" in cluster_files, \
        "Cluster 3.0 did not produce a cdt file for job: %s" % jobname
    fix_cluster30_dup_header(cluster_files["cdt"])

    return cmd, cluster_files
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run the annovar command on every .vcf file found under in_data.

    One command per input file is built with
    alignlib.make_annovar_command and the commands are run in parallel
    with at most num_cores processes.  Each job writes to the file stem
    "<out_path>/<input name without extension>"; afterwards the
    corresponding "<stem>.<buildver>_multianno.vcf" files are checked to
    exist and be non-empty.  The required "buildver" option currently
    only accepts "hg19".

    network and out_attributes are not used here.  Nothing is returned.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    vcf_filenames = filelib.list_files_in_path(
        in_data.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    filelib.safe_mkdir(out_path)

    buildver = module_utils.get_user_option(
        user_options, "buildver", allowed_values=["hg19"], not_empty=True)

    commands = []
    out_filestems = []
    for in_filename in vcf_filenames:
        # Annovar takes a filestem, without the ".vcf".
        stem = os.path.splitext(os.path.split(in_filename)[1])[0]
        log_filename = os.path.join(out_path, "%s.log" % stem)
        out_filestem = os.path.join(out_path, stem)
        commands.append(alignlib.make_annovar_command(
            in_filename, log_filename, out_filestem, buildver))
        out_filestems.append(out_filestem)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    result_files = [
        "%s.%s_multianno.vcf" % (out_filestem, buildver)
        for out_filestem in out_filestems]
    filelib.assert_exists_nz_many(result_files)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Copy every FASTQ file into out_path through copy_fastq_file.

    The FASTQ files are located with mlib.find_fastq_files.  Each one is
    passed, together with its destination (same file name inside
    out_path) and the required integer option "num_samples", to
    copy_fastq_file.  The copies run in parallel on at most two
    processes.

    network and out_attributes are not used here.

    Returns a metadata dictionary with "num_samples" and "num cores".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # This is I/O heavy, don't use so many cores.
    MAX_CORES = 2

    filenames = mlib.find_fastq_files(in_data.identifier)
    assert filenames, "I could not find any FASTQ files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_samples = mlib.get_user_option(
        user_options, "num_samples", not_empty=True, type=int)
    metadata["num_samples"] = num_samples

    # One (function, args, keywds) job per input file.
    cmds = []
    for in_filename in filenames:
        out_filename = os.path.join(out_path, os.path.split(in_filename)[1])
        job_args = (in_filename, out_filename, num_samples)
        cmds.append((copy_fastq_file, job_args, {}))

    nc = min(MAX_CORES, num_cores)
    metadata["num cores"] = nc
    parallel.pyfun(cmds, num_procs=nc)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Cluster a matrix hierarchically and copy the results into out_path.

    The required "linkage" option must be a key of cluster30.METHOD2ID
    and is passed to run_cluster30 as the method for the "hierarchical"
    algorithm.  The "cdt" result is always copied to signal.cdt; the
    "atr" and "gtr" results are copied to array_tree.atr and
    gene_tree.gtr when they were produced.

    network, out_attributes and num_cores are not used here.

    Returns a metadata dictionary holding the clustering command under
    "command".
    """
    import os
    import shutil
    from genomicode import filelib
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    filelib.safe_mkdir(out_path)
    metadata = {}

    LINKAGES = cluster30.METHOD2ID.keys()
    linkage = mlib.get_user_option(
        user_options, "linkage", not_empty=True, allowed_values=LINKAGES)

    cmd, cluster_files = run_cluster30(
        in_data.identifier, "hierarchical", user_options, method=linkage)
    metadata["command"] = cmd

    # The clustered matrix is mandatory; a tree file only exists for a
    # dimension that was clustered.
    assert "cdt" in cluster_files
    shutil.copy2(cluster_files["cdt"], os.path.join(out_path, "signal.cdt"))
    optional_outputs = [
        ("atr", "array_tree.atr"),
        ("gtr", "gene_tree.gtr"),
    ]
    for key, out_name in optional_outputs:
        if key in cluster_files:
            shutil.copy2(cluster_files[key], os.path.join(out_path, out_name))

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Run the annovar command on a single VCF file.

    The command from alignlib.make_annovar_command is run with a log file
    named after the input file and an output file stem named after
    out_filename; both are bare names without a directory part.  The
    resulting "<stem>.<buildver>_multianno.vcf" must exist and be
    non-empty, and is copied to out_filename unless it already is that
    file.  The required "buildver" option currently only accepts "hg19".

    network, out_attributes and num_cores are not used here.  Nothing is
    returned.
    """
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    in_filename = in_data.identifier
    filelib.assert_exists_nz(in_filename)

    buildver = module_utils.get_user_option(
        user_options, "buildver", allowed_values=["hg19"], not_empty=True)

    # Annovar takes a filestem, without the ".vcf".
    in_stem = os.path.splitext(os.path.basename(in_filename))[0]
    log_filename = "%s.log" % in_stem
    out_filestem = os.path.splitext(os.path.basename(out_filename))[0]

    cmd = alignlib.make_annovar_command(
        in_filename, log_filename, out_filestem, buildver)
    parallel.sshell(cmd)

    # Make sure the analysis completed successfully.
    annotated_file = "%s.%s_multianno.vcf" % (out_filestem, buildver)
    filelib.assert_exists_nz(annotated_file)
    same_file = (
        os.path.realpath(annotated_file) == os.path.realpath(out_filename))
    if not same_file:
        shutil.copy2(annotated_file, out_filename)
def plot_heatmap(filename, outfile, cluster_files, user_options):
    """Draw a heatmap of a matrix file with the configured arrayplot script.

    filename is the matrix to plot and outfile the image to write.
    cluster_files maps "gtr", "atr", "kgg" and "kag" to tree / cluster
    files; each is only passed to arrayplot when it is present and the
    matching hm_show_* option is "yes".  The hm_show_* options are only
    read when "hm_show_gene_tree" is in user_options.

    When hm_width / hm_height, hm_label_genes or hm_label_arrays are not
    given, defaults are derived from the matrix size: the box size comes
    from graphlib.find_tall_heatmap_size (more rows than columns) or
    graphlib.find_wide_heatmap_size, and labels are switched on for at
    most 50 rows / columns.

    Returns the shell command that was run.  Raises AssertionError for a
    width or height outside 1..256 or a label scale outside (0, 10).
    """
    from genomicode import parallel
    from genomicode import graphlib
    from Betsy import module_utils as mlib

    python = mlib.get_config(
        "python", which_assert_file=True, assert_exists=True)
    arrayplot = mlib.get_config(
        "arrayplot", which_assert_file=True, assert_exists=True)

    COLORS = [
        "red", "white", "red-green", "blue-yellow", "red-green-soft",
        "red-blue-soft", "matlab", "bild", "genepattern", "genespring",
        "yahoo", "brewer-prgn-div", "brewer-rdbu-div", "brewer-rdylbu-div",
        "brewer-rdylgn-div", "brewer-spectral-div", "brewer-blues-seq",
        "brewer-greens-seq", "brewer-reds-seq", "brewer-ylorbr-seq",
        "brewer-qual-set",
    ]
    YESNO = ["no", "yes"]

    hm_width = mlib.get_user_option(user_options, "hm_width", type=int)
    hm_height = mlib.get_user_option(user_options, "hm_height", type=int)
    hm_color = mlib.get_user_option(
        user_options, "hm_color", allowed_values=COLORS, not_empty=True)

    hm_colorbar = mlib.get_user_option(
        user_options, "hm_colorbar", not_empty=True, allowed_values=YESNO)
    hm_colorbar_horizontal = mlib.get_user_option(
        user_options, "hm_colorbar_horizontal", not_empty=True,
        allowed_values=YESNO)
    hm_colorbar_height = mlib.get_user_option(
        user_options, "hm_colorbar_height", not_empty=True, type=float)
    hm_colorbar_width = mlib.get_user_option(
        user_options, "hm_colorbar_width", not_empty=True, type=float)
    hm_colorbar_font = mlib.get_user_option(
        user_options, "hm_colorbar_font", not_empty=True, type=float)

    hm_label_genes = mlib.get_user_option(
        user_options, "hm_label_genes", allowed_values=YESNO)
    hm_scale_gene_labels = mlib.get_user_option(
        user_options, "hm_scale_gene_labels", not_empty=True, type=float)
    hm_label_arrays = mlib.get_user_option(
        user_options, "hm_label_arrays", allowed_values=YESNO)
    hm_scale_array_labels = mlib.get_user_option(
        user_options, "hm_scale_array_labels", not_empty=True, type=float)

    hm_show_gene_tree = None
    hm_show_array_tree = None
    hm_show_gene_cluster = None
    hm_show_array_cluster = None
    if "hm_show_gene_tree" in user_options:
        hm_show_gene_tree = mlib.get_user_option(
            user_options, "hm_show_gene_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_tree = mlib.get_user_option(
            user_options, "hm_show_array_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_gene_cluster = mlib.get_user_option(
            user_options, "hm_show_gene_cluster", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_cluster = mlib.get_user_option(
            user_options, "hm_show_array_cluster", allowed_values=YESNO,
            not_empty=True)

    # Set default values.  The matrix size is needed by every default, so
    # look it up at most once instead of once per missing option.
    missing_size = not hm_width or not hm_height
    if missing_size or not hm_label_genes or not hm_label_arrays:
        nrow, ncol = get_matrix_size(filename)
        if missing_size:
            fn = graphlib.find_wide_heatmap_size
            if nrow > ncol:
                fn = graphlib.find_tall_heatmap_size
            hm_width, hm_height = fn(
                nrow, ncol, max_total_height=4096, max_total_width=4096,
                max_box_height=200, max_box_width=200)
        if not hm_label_genes:
            hm_label_genes = "no"
            if nrow <= 50:
                hm_label_genes = "yes"
        if not hm_label_arrays:
            hm_label_arrays = "no"
            if ncol <= 50:
                hm_label_arrays = "yes"

    # Check values.
    assert hm_width >= 1 and hm_width <= 256, "Invalid width: %s" % hm_width
    assert hm_height >= 1 and hm_height <= 256, \
        "Invalid height: %s" % hm_height
    assert hm_scale_gene_labels > 0 and hm_scale_gene_labels < 10
    assert hm_scale_array_labels > 0 and hm_scale_array_labels < 10

    sq = parallel.quote
    cmd = [
        sq(python),
        sq(arrayplot),
        "--grid",
        "-x", hm_width,
        "-y", hm_height,
        "--color", hm_color,
    ]
    if hm_colorbar == "yes":
        cmd += [
            "--colorbar",
            "--cb_height", hm_colorbar_height,
            "--cb_width", hm_colorbar_width,
            "--cb_font", hm_colorbar_font,
        ]
        if hm_colorbar_horizontal == "yes":
            cmd += ["--cb_horizontal"]
    if hm_label_genes == "yes":
        cmd += [
            "--label_genes",
            "--scale_gene_labels", hm_scale_gene_labels,
        ]
    if hm_label_arrays == "yes":
        cmd += [
            "--label_arrays",
            "--scale_array_labels", hm_scale_array_labels,
        ]

    # Tree and cluster files are paths too, so quote them like the matrix
    # and output file names below.
    optional_files = [
        (hm_show_gene_tree, "gtr", "--gene_tree_file"),
        (hm_show_array_tree, "atr", "--array_tree_file"),
        (hm_show_gene_cluster, "kgg", "--gene_cluster_file"),
        (hm_show_array_cluster, "kag", "--array_cluster_file"),
    ]
    for show, key, flag in optional_files:
        if show == "yes" and key in cluster_files:
            cmd += [flag, sq(cluster_files[key])]

    cmd += [
        sq(filename),
        sq(outfile),
    ]
    # The list mixes strings and numbers; stringify before joining.
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    return cmd
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Compare filtered gene sets and plot their pairwise overlap counts.

    Every ".gmt" file in antecedents.identifier whose name does not
    contain "nocutoff" is read.  Only gene sets whose name contains
    "_ID_" are kept; each is renamed with the stem of the file it came
    from and all of them are written to gene_sets.gmt in out_path.  The
    configured calc_venn script then writes intersection.gmt, the
    pairwise count matrix (captured from its output into
    pairwise_count_matrix.txt) and, for at most five gene sets,
    venn.tiff.  Finally the counts are clustered with slice_matrix and
    drawn with arrayplot as heatmap.counts.png.

    The required option "merge_up_and_down_genes" ("yes"/"no") controls
    the renaming and adds --automatch to the calc_venn call.  network,
    out_attributes and num_cores are not used here.

    Returns a metadata dictionary with the shell commands under
    "commands".
    """
    import os
    from genomicode import genesetlib
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    in_data = antecedents
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    metadata = {}

    merge = mlib.get_user_option(
        user_options, "merge_up_and_down_genes", not_empty=True,
        allowed_values=["yes", "no"])
    merge = (merge == "yes")

    opj = os.path.join
    gs_filename = opj(out_path, "gene_sets.gmt")
    intersect_filename = opj(out_path, "intersection.gmt")
    count_filename = opj(out_path, "pairwise_count_matrix.txt")
    venn_plot_file = opj(out_path, "venn.tiff")

    # Make a list of all the data sets in the antecedents.
    # <stem>.nocutoff.txt
    # <stem>.<cutoff>.txt
    # <stem>.<cutoff>.gmt
    # <stem>.<cutoff>.heatmap.png
    x = sorted(os.listdir(in_data.identifier))
    x = [f for f in x if f.endswith(".gmt")]
    # Skip unfiltered results.  (This used to test for the misspelled
    # "nocuttof", which could never match.)
    x = [f for f in x if f.find("nocutoff") < 0]
    filtered_geneset_files = [opj(in_data.identifier, f) for f in x]
    assert filtered_geneset_files, "Missing: filtered geneset files"

    # For each of the filtered_geneset_files, figure out the
    # <stem>.  This is tricky because <cutoff> may contain
    # multiple dots.
    # <stem>.fdr_0.05.p_0.05.fc_1.5.gmt
    stems = []
    for filename in filtered_geneset_files:
        parts = os.path.split(filename)[1].split(".")
        # The stem is everything before the first cutoff component (or
        # before the "gmt" extension when there is no cutoff).
        for j, part in enumerate(parts):
            if part.startswith(("fdr_", "fc_", "p_")) or part == "gmt":
                parts = parts[:j]
                break
        stems.append(".".join(parts))

    genesets = []
    geneset_stems = []
    for i, filename in enumerate(filtered_geneset_files):
        for x in genesetlib.read_genesets(filename):
            name, description, genes = x
            x = genesetlib.GeneSet(name, description, genes)
            genesets.append(x)
            geneset_stems.append(stems[i])
    assert genesets, "I could not find any gene sets"

    # Should contain gene sets whose name fits the pattern:
    # <NAME>_ID_UP
    # <NAME>_ID_DN
    # <NAME>_NAME_UP
    # <NAME>_NAME_DN
    # Only want the _ID_ gene sets for comparison.
    I = [i for (i, gs) in enumerate(genesets) if gs.name.find("_ID_") >= 0]
    genesets = [genesets[i] for i in I]
    geneset_stems = [geneset_stems[i] for i in I]
    assert genesets, "I could not find any '_ID_' gene sets"

    # Rename each of the gene sets.
    for i, gs in enumerate(genesets):
        name = gs.name
        n = "%s_%s" % (geneset_stems[i], name)
        if merge:
            # If I'm merging, then which gene set is UP or DN
            # doesn't matter.
            suffix = name[-6:]
            assert suffix in ["_ID_UP", "_ID_DN"]
            n = "%s%s" % (geneset_stems[i], suffix)
        gs.name = n

    # Write out the gene sets.
    genesetlib.write_gmt(gs_filename, genesets)

    # Count the number of gene sets.  When merging, the UP and DN
    # versions of a gene set count as one.
    x = [gs.name for gs in genesets]
    if merge:
        x = [n.replace("_DN", "_UP") for n in x]
        x = [n.replace("_DOWN", "_UP") for n in x]
    num_genesets = len(set(x))

    calc_venn = mlib.get_config("calc_venn", which_assert_file=True)

    sq = parallel.quote
    cmd = [
        sq(calc_venn),
        "-o", sq(intersect_filename),
        "--all_genesets",
        "--num_to_compare", 2,
    ]
    if num_genesets <= 5:
        # Can only plot up to 5 circles.
        cmd += ["--plotfile", sq(venn_plot_file)]
    if merge:
        cmd += ["--automatch"]
    cmd.append(sq(gs_filename))
    cmd = " ".join(map(str, cmd))
    cmd = "%s >& %s" % (cmd, sq(count_filename))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    # Make a heatmap of the counts.
    UNCLUSTERED_FILE = "unclustered.txt"
    CLUSTERED_FILE = "clustered.txt"
    COL_TREE_FILE = "col_tree.txt"
    ROW_TREE_FILE = "row_tree.txt"
    HEATMAP_FILE = opj(out_path, "heatmap.counts.png")

    # Make a file with the counts: the lines of the count file up to the
    # first blank line.
    with open(count_filename) as inhandle:
        with open(UNCLUSTERED_FILE, 'w') as outhandle:
            for line in inhandle:
                if not line.strip():
                    break
                outhandle.write(line)

    # Cluster the counts.
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    arrayplot = mlib.get_config("arrayplot", which_assert_file=True)
    cmd = [
        sq(slice_matrix),
        "--reorder_col_cluster",
        "--col_tree_file", sq(COL_TREE_FILE),
        "--reorder_row_cluster",
        "--row_tree_file", sq(ROW_TREE_FILE),
        sq(UNCLUSTERED_FILE),
    ]
    cmd = "%s > %s" % (" ".join(cmd), sq(CLUSTERED_FILE))
    parallel.sshell(cmd)
    metadata["commands"].append(cmd)
    filelib.assert_exists_nz(CLUSTERED_FILE)

    # Draw the heatmap.
    cmd = [
        sq(arrayplot),
        "--grid",
        "--array_tree_file", sq(COL_TREE_FILE),
        "--al",
        "--gene_tree_file", sq(ROW_TREE_FILE),
        "--gl",
        "--colorbar",
        "--color", "brewer-greens-seq",
        sq(CLUSTERED_FILE),
        sq(HEATMAP_FILE),
    ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
    metadata["commands"].append(cmd)
    filelib.assert_exists_nz(HEATMAP_FILE)

    mlib.txt2xls(count_filename)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Align FASTQ files with STAR, in one pass or two.

    antecedents is (fastq_node, sample_node, strand_node, ref_node).
    Pass 1 aligns every sample against the reference and writes
    "p1.<sample>.Aligned.out.bam" in out_path.  With the "two_pass"
    option set to "yes", a new index ("genome.2pass") is built from the
    pass 1 splice junction files and every sample is aligned again,
    giving "<sample>.Aligned.out.bam".  Otherwise that name is created as
    a symbolic link to the pass 1 BAM file.  Steps whose output already
    exists are skipped (a debugging aid).

    network and out_attributes are not used here.

    Returns a metadata dictionary with "tool", "commands" and
    "num_cores".
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, ref_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    # Do a quick check to make sure the reference is correct.
    # Otherwise, error may be hard to diagnose.
    alignlib.assert_is_STAR_reference(ref.path)

    metadata = {"tool": "STAR %s" % alignlib.get_STAR_version()}

    two_pass_option = mlib.get_user_option(
        user_options, "two_pass", allowed_values=["no", "yes"])
    two_pass = (two_pass_option == "yes")

    # Figure out the strandedness.
    is_stranded = stranded.stranded != "unstranded"

    # STAR --runThreadN 40 --genomeDir test05 \
    #   --readFilesIn test.fastq/test03_R1_001.fastq \
    #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
    # If unstranded, add --outSAMstrandField intronMotif

    # Make a list of the jobs to run.
    opj = os.path.join
    jobs = []   # list of filelib.GenericObject objects
    for sample, pair1, pair2 in fastq_files:
        pass1_out_prefix = "p1.%s." % sample
        pass2_out_prefix = "%s." % sample
        job = filelib.GenericObject(
            sample=sample,
            pair1=pair1,
            pair2=pair2,
            pass1_out_prefix=pass1_out_prefix,
            pass2_out_prefix=pass2_out_prefix,
            pass1_bam_filename=opj(
                out_path, "%sAligned.out.bam" % pass1_out_prefix),
            pass2_bam_filename=opj(
                out_path, "%sAligned.out.bam" % pass2_out_prefix),
            sjdb_filename=opj(out_path, "p1.%s.SJ.out.tab" % sample),
            log1_filename=opj(out_path, "p1.%s.log" % sample),
            log2_filename=opj(out_path, "%s.log" % sample),
        )
        jobs.append(job)

    # Run pass 1.
    commands = []
    for job in jobs:
        prefix = opj(out_path, job.pass1_out_prefix)
        cmd = alignlib.make_STAR_command(
            ref.path, prefix, num_cores, is_stranded, job.pair1, job.pair2,
            job.log1_filename)
        # For debugging.  If this file already exists, skip it.
        if not filelib.exists_nz(job.pass1_bam_filename):
            parallel.sshell(cmd, path=out_path)
        filelib.assert_exists_nz(job.pass1_bam_filename)
        commands.append(cmd)

    if two_pass:
        # Make a new index with the splice junction information.
        sj_index = opj(out_path, "genome.2pass")
        sjdb_files = [job.sjdb_filename for job in jobs]
        filelib.assert_exists_nz_many(sjdb_files)
        index_cmd = alignlib.make_STAR_index_command(
            ref.fasta_file_full, sj_index, sjdb_files=sjdb_files,
            num_cores=num_cores)
        index_cmd = "%s >& genome.2pass.log" % index_cmd
        commands.append(index_cmd)

        # For debugging.  If this file already exists, skip it.
        # NOTE(review): the command runs with path=out_path, so the log
        # is presumably written there, while this check looks in the
        # current directory -- confirm whether the skip can ever trigger.
        if not filelib.exists_nz("genome.2pass.log"):
            parallel.sshell(index_cmd, path=out_path)
        alignlib.assert_is_STAR_reference(sj_index)

    # Run pass 2.
    for job in jobs:
        # For debugging.  If this file already exists, skip it.
        if os.path.exists(job.pass2_bam_filename):
            continue
        if not two_pass:
            # link pass1_bam_filename to pass2_bam_filename
            os.symlink(job.pass1_bam_filename, job.pass2_bam_filename)
            continue
        prefix = opj(out_path, job.pass2_out_prefix)
        cmd = alignlib.make_STAR_command(
            sj_index, prefix, num_cores, is_stranded, job.pair1, job.pair2,
            job.log2_filename)
        parallel.sshell(cmd, path=out_path)
        commands.append(cmd)
        filelib.assert_exists_nz(job.pass2_bam_filename)

    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # STAR takes 28 Gb per process.  The samples are therefore aligned
    # one after another, each STAR call being given num_cores.
    # Defaults:
    # --limitGenomeGenerateRAM   31000000000
    # --outFilterMismatchNmax    10             Num mismatches.

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
    """Summarize the alignments of each sample into one table.

    antecedents is (fastq_node, sample_node, align_node).  Every
    "<sample>.matches.txt" file under align_node.identifier is paired
    with the FASTQ files of the same sample and handed to
    summarize_matches_file together with the "num_mismatches" option
    (0 <= n < 25).  The per-sample results are written to out_filename
    as a tab-delimited table: sample, perfect alignments, total
    alignments, perfect alignments per million total alignments, and the
    perfect / imperfect fractions.

    network, out_attributes and num_cores are not used here; the number
    of processes is derived from the available RAM.

    Returns a metadata dictionary with "num_mismatches" and "num_cores".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    fastq_node, sample_node, align_node = antecedents
    fastq_data = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_data, "I could not find any FASTQ files."
    align_filenames = filelib.list_files_in_path(
        align_node.identifier, endswith=".matches.txt")
    assert align_filenames, "No .matches.txt files."
    align_filenames.sort()
    metadata = {}

    assert len(fastq_data) == len(align_filenames), \
        "Mismatch: num samples %d %d" % (
            len(fastq_data), len(align_filenames))

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert num_mismatches >= 0 and num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    sample2fastqdata = {}
    for x in fastq_data:
        sample, f1, f2 = x
        sample2fastqdata[sample] = x

    # list of (sample, align_filename, summary_filename,
    #          fastq_filename1, fastq_filename2)
    jobs = []
    for in_filename in align_filenames:
        p, f = os.path.split(in_filename)
        # <sample>.matches.txt
        ext = ".matches.txt"
        assert f.endswith(ext)
        sample = f[:-len(ext)]
        assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
        summary_filename = "%s.summary.txt" % sample
        x, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
        x = sample, in_filename, summary_filename, \
            fastq_filename1, fastq_filename2
        jobs.append(x)

    jobs2 = []  # list of (function, args, keywds)
    for x in jobs:
        sample, align_filename, summary_filename, \
            fastq_file1, fastq_file2 = x
        args = align_filename, fastq_file1, fastq_file2, num_mismatches
        keywds = {
            "temp_path": ".",
            "outfile": summary_filename,
        }
        x = summarize_matches_file, args, keywds
        jobs2.append(x)

    # This can take a lot of memory (depending on the number of reads,
    # can easily take 8 Gb) and is I/O intensive, so run at most
    # MAX_PROCS processes, fewer when the RAM does not allow it.
    MAX_PROCS = 4
    nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
    results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.1)
    metadata["num_cores"] = nc
    assert len(results) == len(jobs2)

    # Put together the results in a table.
    # NOTE(review): the header has two "match" columns; the second one
    # and "mismatch" hold the perfect / imperfect fractions.  Left as is
    # because readers of this file may rely on it.
    header = "sample", "match", "total", "RPM", "match", "mismatch"
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for job, d in zip(jobs, results):
            sample = job[0]
            match = d["perfect_alignments"]
            total = d["total_alignments"]
            # A sample without any alignment must not abort the whole
            # table with a ZeroDivisionError.
            rpm = int(float(match) / total * 1E6) if total else 0
            perc_match = d["perc_perfect"]
            perc_mismatch = 1 - d["perc_perfect"]
            x = sample, match, total, rpm, perc_match, perc_mismatch
            assert len(x) == len(header)
            handle.write("\t".join(map(str, x)) + "\n")

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run the make_peakseq_command pipeline on a treatment/control pair.

    antecedents is a (bam_node, group_node) pair.  Options read:
    "treatment_sample" (required), "control_sample",
    "peakseq_fragment_length" (required int, 0 < n < 1000) and
    "mappability_file" (required, checked as a file).  The command is
    run in out_path with stdout and stderr redirected to
    "<experiment_name>.log"; config.dat, the log and
    "<experiment_name>.txt" must then exist and be non-empty.

    network, out_attributes and num_cores are not used here.  Nothing is
    returned.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    get_option = module_utils.get_user_option
    treat_sample = get_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = get_option(user_options, "control_sample")
    fragment_length = get_option(
        user_options, "peakseq_fragment_length", not_empty=True, type=int)
    mappability_file = get_option(
        user_options, "mappability_file", not_empty=True, check_file=True)
    assert 0 < fragment_length < 1000

    # Set the experiment name.
    experiment_name = "%s_vs_%s" % (
        hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

    # Make sure the samples exist.
    known_samples = [group[1] for group in sample_groups]
    assert treat_sample in known_samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in known_samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    # NOTE(review): control_sample may be empty according to the option
    # lookup above, yet a control BAM file is required below -- confirm
    # whether the control is meant to be optional.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    log_file = "%s.log" % experiment_name
    peakseq_cmd = make_peakseq_command(
        treat_filename, control_filename, out_path, experiment_name,
        fragment_length, mappability_file)
    parallel.sshell("%s >& %s" % (peakseq_cmd, log_file), path=out_path)

    # "<experiment_name>_narrowPeak.txt" is not checked: it can be
    # length 0, if no peaks are found.
    expected_files = [
        "config.dat",
        log_file,
        "%s.txt" % experiment_name,
    ]
    filelib.assert_exists_nz_many(
        [os.path.join(out_path, name) for name in expected_files])
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run the htseq-count command on every BAM file of a folder.

    antecedents is (bam_folder, sample_node, gene_node, strand_node).
    The "sorted" attribute of the BAM folder ("name" or "coordinate") and
    the strandedness read from strand_node are translated into the
    arguments of alignlib.make_htseq_count_command; the
    "htseq_count_mode" option selects the mode.  Each command's output is
    redirected to "<bam name>.count", and the commands are run in
    parallel inside out_path with at most num_cores processes.

    network, out_attributes and sample_node are not used here.

    Returns a metadata dictionary with "commands" and "num_cores".
    """
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_folder, sample_node, gene_node, strand_node = antecedents
    bam_path = bam_folder.identifier
    assert filelib.dir_exists(bam_path)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}

    # Translate the "sorted" attribute into the sort order argument.
    attr2order = {
        "name": "name",
        "coordinate": "pos",
    }
    sorted_attr = bam_folder.data.attributes["sorted"]
    sort_order = attr2order.get(sorted_attr)
    assert sort_order, "Cannot handle sorted: %s" % sorted_attr

    # Translate the strandedness into the stranded argument.
    stranded2ht = {
        "unstranded": "no",
        "firststrand": "reverse",
        "secondstrand": "yes",
    }
    ht_stranded = stranded2ht.get(stranded.stranded)
    assert ht_stranded is not None

    mode = mlib.get_user_option(
        user_options, "htseq_count_mode",
        allowed_values=["union", "intersection-strict",
                        "intersection-nonempty"])

    # One command and one "<bam name>.count" output file per BAM file.
    sq = parallel.quote
    bam_filenames = filelib.list_files_in_path(
        bam_path, endswith=".bam", case_insensitive=True)
    commands = []
    out_files = []
    for bam_filename in bam_filenames:
        bam_stem = os.path.splitext(os.path.split(bam_filename)[1])[0]
        out_file = "%s.count" % bam_stem
        count_cmd = alignlib.make_htseq_count_command(
            bam_filename, gtf_file, sort_order, ht_stranded, mode=mode)
        commands.append("%s >& %s" % (count_cmd, sq(out_file)))
        out_files.append(out_file)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores, path=out_path)

    # Make sure the analysis completed successfully.
    output_filenames = [os.path.join(out_path, name) for name in out_files]
    filelib.assert_exists_nz_many(output_filenames)

    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Call somatic variants with MuTect for each normal/cancer pair.

    antecedents is the tuple (bam_node, nc_node, ref_node,
    interval_node).  For every (normal, cancer) pair in the
    normal/cancer file, MuTect is run and the raw VCF it produces is
    cleaned up into "<sample>.vcf" in out_path, where <sample> is
    taken from the name of the cancer BAM file.

    Returns a metadata dict with the commands that were run and the
    number of processes used.  Raises AssertionError if an input is
    missing or a MuTect log contains "##### ERROR" lines.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out MuTect version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    ext = mlib.splitpath(interval_node.identifier)[2]
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile,
    #          cancer_bamfile, call_outfile, cov_outfile,
    #          raw_vcf_outfile, vcf_outfile, log_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        # Output files are named after the cancer BAM file.
        sample = mlib.splitpath(cancer_bamfile)[1]
        call_outfile = opj(out_path, "%s.call_stats.out" % sample)
        cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile
        jobs.append(x)

    # java -Xmx2g -jar muTect.jar
    #   --analysis_type MuTect
    #   --reference_sequence <reference>
    #   --cosmic <cosmic.vcf>
    #   --dbsnp <dbsnp.vcf>
    #   --intervals <intervals_to_process>
    #   --input_file:normal <normal.bam>
    #   --input_file:tumor <tumor.bam>
    #   --out <call_stats.out>
    #   --coverage_file <coverage.wig.txt>

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x

        # These option names contain a ":" and cannot be given as
        # keyword arguments.
        UNHASHABLE = [
            ("input_file:normal", sq(normal_bamfile)),
            ("input_file:tumor", sq(cancer_bamfile)),
            ]
        x = alignlib.make_MuTect_command(
            analysis_type="MuTect",
            reference_sequence=sq(ref.fasta_file_full),
            cosmic=sq(cosmic_file),
            dbsnp=sq(dbsnp_file),
            intervals=sq(interval_node.identifier),
            out=sq(call_outfile),
            coverage_file=sq(cov_outfile),
            vcf=sq(raw_vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        # Quote the log file like every other path in the command, so
        # that unusual characters in out_path do not break the shell.
        x = "%s >& %s" % (x, sq(log_outfile))
        commands.append(x)
    assert len(commands) == len(jobs)
    # NOTE(review): 15 is presumably the RAM needed per process --
    # confirm against calc_max_procs_from_ram.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, x in enumerate(jobs):
        cancer_sample, log_outfile = x[1], x[8]
        # Pull out the error lines.  Close the log file promptly
        # rather than leaking one handle per job.
        with open(log_outfile) as handle:
            error_lines = [
                line for line in handle if line.startswith("##### ERROR")]
        errors = "".join(error_lines)
        msg = "MuTect error [%s]:\n%s\n%s" % (
            cancer_sample, commands[i], errors)
        assert not errors, msg

    # Make sure the raw VCF files written by MuTect exist.
    x = [x[6] for x in jobs]
    filelib.assert_exists_many(x)

    # Fix the files.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
            log_outfile = x
        alignlib.clean_mutect_vcf(
            normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
            raw_vcf_outfile, vcf_outfile)

    return metadata
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    """Run RNA-SeQC on every BAM file.

    antecedents is the tuple (bam_node, ref_node).  For each BAM
    file, RNA-SeQC writes its results into a sub-directory of
    out_path named after the (hashed) BAM file stem, and its console
    output goes to "<sample>.log" in out_path.  Whatever
    parallel.pshell returns is saved to "run.log" in out_path.

    Raises AssertionError if the GTF file is missing or if the output
    of a job was not created.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
    #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA"
    #   -t <gtf_file> >& <log_filename>"
    # <out_path>        Output directory.  Will be created if not exists.
    # <in_filename>     BAM file
    # <reference_file>  /data/biocore/genomes/UCSC/mm10.fa
    # <gtf_file>   /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
    #
    # <reference_file> must be indexed and have a dict file.

    rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

    GTF = module_utils.get_user_option(
        user_options, "rna_seqc_gtf_file", not_empty=True)
    assert os.path.exists(GTF), "File not found: %s" % GTF

    # list of infile, out_path, ref_file, gtf_file, sample, log_file
    jobs = []
    for in_filename in bam_filenames:
        stem = os.path.splitext(os.path.basename(in_filename))[0]
        sample = hashlib.hash_var(stem)
        out_path_rna_seqc = os.path.join(out_path, sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
            sample, log_filename
        jobs.append(x)

    sq = parallel.quote
    commands = []
    for x in jobs:
        (in_filename, out_path_rna_seqc, ref_filename, gtf_filename,
         sample, log_filename) = x

        sample_spec = "|".join([sample, in_filename, "NA"])
        # Quote every path, so that spaces or other special characters
        # do not break the shell command.  The -s argument contains
        # "|" characters and is always wrapped in single quotes.
        x = [
            'java',
            '-jar', sq(rna_seqc_jar),
            '-o', sq(out_path_rna_seqc),
            '-r', sq(ref_filename),
            '-s', "'%s'" % sample_spec,
            '-t', sq(gtf_filename),
            ]
        x = " ".join(x)
        cmd = "%s >& %s" % (x, sq(log_filename))
        commands.append(cmd)

    # Gets lots of errors.
    output = parallel.pshell(commands, max_procs=num_cores)
    run_log = os.path.join(out_path, "run.log")
    # Use a context manager so the log is flushed and closed.
    with open(run_log, 'w') as handle:
        handle.write(output)

    # Make sure the analysis completed successfully by checking the
    # output path of each job.
    for x in jobs:
        out_path_rna_seqc = x[1]
        filelib.assert_exists_nz(out_path_rna_seqc)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Backfill each target VCF file from the matching source VCF file.

    antecedents is the tuple (in_vcf_node, bf_vcf_node).  Files are
    matched on sample name, which is taken from the file name.  One
    "<sample>.vcf" file is written to out_path for every sample found
    in both folders.  Unless the "backfill_common_only" option is
    "yes", samples that appear in only one folder are an error.

    Returns a metadata dict with the number of cores used.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_vcf_node, bf_vcf_node = antecedents
    in_vcf_filenames = filelib.list_files_in_path(
        in_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    bf_vcf_filenames = filelib.list_files_in_path(
        bf_vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    filelib.safe_mkdir(out_path)
    metadata = {}

    common_only = mlib.get_user_option(
        user_options, "backfill_common_only",
        allowed_values=["no", "yes"], not_empty=True)

    in_vcf_samples = [mlib.splitpath(f)[1] for f in in_vcf_filenames]
    bf_vcf_samples = [mlib.splitpath(f)[1] for f in bf_vcf_filenames]

    # Sample names must be unique within each set.
    assert len(in_vcf_samples) == len(set(in_vcf_samples)), \
        "Duplicate samples"
    assert len(bf_vcf_samples) == len(set(bf_vcf_samples)), \
        "Duplicate samples"

    # Split the samples into those seen in both sets, and those seen
    # in only one of them.
    common = [s for s in in_vcf_samples if s in bf_vcf_samples]
    in_only = [s for s in in_vcf_samples if s not in common]
    bf_only = [s for s in bf_vcf_samples if s not in common]
    assert common, "No common samples."

    pretty_in = parselib.pretty_list(in_only, max_items=5)
    pretty_bf = parselib.pretty_list(bf_only, max_items=5)
    if common_only == "no":
        # Every sample must be present in both sets.
        both_have_extra = bool(in_only) and bool(bf_only)
        assert not both_have_extra, \
            "Extra samples in both sets:\n%s\n%s" % (pretty_in, pretty_bf)
        assert not in_only, \
            "Target VCF file has extra samples: %s" % pretty_in
        assert not bf_only, \
            "Source VCF file has extra samples: %s." % pretty_bf

    # Sample names are unique, so each name maps to exactly one file.
    sample2in = dict(zip(in_vcf_samples, in_vcf_filenames))
    sample2bf = dict(zip(bf_vcf_samples, bf_vcf_filenames))

    # list of (function, args, keywds) for parallel.pyfun.
    pyfun_jobs = []
    for sample in common:
        assert sample in sample2in
        assert sample in sample2bf
        out_filename = os.path.join(out_path, "%s.vcf" % sample)
        args = sample2in[sample], sample2bf[sample], out_filename
        pyfun_jobs.append((backfill_vcf, args, {}))

    parallel.pyfun(pyfun_jobs, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with MuSE for each normal/cancer pair.

    antecedents is the tuple (bam_node, nc_node, ref_node).  For
    every (normal, cancer) pair, "MuSE call" and then "MuSE sump" are
    run, and the raw VCF is cleaned up into "<cancer_sample>.vcf" in
    out_path.  The log files of both steps are expected to be empty,
    and are deleted at the end.

    Returns a metadata dict with the tool version, the commands that
    were run and the number of processes used.  Raises AssertionError
    if an input is missing, a log file is not empty, or an expected
    output file was not created.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    dbsnp_file = mlib.get_user_option(
        user_options, "muse_dbsnp_vcf", not_empty=True, check_file=True)

    # Make sure dbsnp_file is compressed and indexed.
    assert dbsnp_file.endswith(".vcf.gz"), \
        "muse_dbsnp_vcf must be bgzip compressed."
    x = "%s.tbi" % dbsnp_file
    assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #          muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
    #          logfile1, logfile2)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
        # "MuSE call -O <stem>" writes its results to <stem>.MuSE.txt.
        muse_call_file = "%s.MuSE.txt" % muse_call_stem
        raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
        vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
        log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
        log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
            log_outfile1, log_outfile2
        jobs.append(x)

    # Generate the commands.
    # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
    #   bam04/196B-MG.bam bam04/PIM001_G.bam
    # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
    #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

    MuSE = mlib.findbin("muse")

    # Run the "call" step.  Every path is quoted, as in the "sump"
    # step below, so that unusual characters do not break the shell.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_bamfile, cancer_bamfile = x[2], x[3]
        muse_call_stem, log_outfile1 = x[4], x[8]
        x = [
            sq(MuSE),
            "call",
            "-O", sq(muse_call_stem),
            "-f", sq(ref.fasta_file_full),
            sq(cancer_bamfile),
            sq(normal_bamfile),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(log_outfile1))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[8] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the call files are created and not empty.
    call_files = [x[5] for x in jobs]
    filelib.assert_exists_nz_many(call_files)

    # Run the "sump" step.  -G is used for whole genome data and -E
    # for whole exome data.
    assert wgs_or_wes in ["wgs", "wes"]
    if wgs_or_wes == "wgs":
        data_type_flag = "-G"
    else:
        data_type_flag = "-E"
    commands = []
    for x in jobs:
        muse_call_file, raw_vcf_outfile, log_outfile2 = x[5], x[6], x[9]
        x = [
            sq(MuSE),
            "sump",
            "-I", sq(muse_call_file),
            data_type_flag,
            "-O", sq(raw_vcf_outfile),
            "-D", sq(dbsnp_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(log_outfile2))
        commands.append(x)
    assert len(commands) == len(jobs)
    # Not sure about RAM.
    nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = metadata["commands"] + commands

    # Make sure the log files have no errors.  The files should be
    # empty.
    log_files = [x[9] for x in jobs]
    filelib.assert_exists_z_many(log_files)

    # Make sure the raw files are created and not empty.
    vcf_files = [x[6] for x in jobs]
    filelib.assert_exists_nz_many(vcf_files)

    # Fix the files.  These are python function calls rather than
    # shell commands.
    commands = []
    for x in jobs:
        normal_sample, cancer_sample = x[0], x[1]
        raw_vcf_outfile, vcf_outfile = x[6], x[7]
        args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
        x = alignlib.clean_muse_vcf, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)

    # Delete the log files.  They were checked to be empty above.
    for x in jobs:
        for log_outfile in (x[8], x[9]):
            if os.path.exists(log_outfile):
                os.unlink(log_outfile)

    # Make sure output VCF files exist.
    x = [x[7] for x in jobs]
    filelib.assert_exists_many(x)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call somatic variants with GATK MuTect2 for each normal/cancer pair.

    antecedents is the tuple (bam_node, nc_node, ref_node,
    interval_node).  For every (normal, cancer) pair, MuTect2 writes
    "<sample>.vcf" and "<sample>.log" into out_path, where <sample>
    is taken from the name of the cancer BAM file.  Afterwards the
    normal and cancer sample names in each VCF file are fixed up.

    Returns a metadata dict with the commands that were run and the
    number of processes used.  Raises AssertionError if an input is
    missing, a log contains "##### ERROR" lines, or a VCF file was
    not created.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    ext = mlib.splitpath(interval_node.identifier)[2]
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # One job object per normal/cancer pair.
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        # Output files are named after the cancer BAM file.
        sample = mlib.splitpath(cancer_bamfile)[1]
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            vcf_outfile=vcf_outfile,
            log_outfile=log_outfile)
        jobs.append(x)

    # java -jar GenomeAnalysisTK.jar \
    #   -T MuTect2 \
    #   -R reference.fasta \
    #   -I:tumor tumor.bam \
    #   -I:normal normal.bam \
    #   [--dbsnp dbSNP.vcf] \
    #   [--cosmic COSMIC.vcf] \
    #   [-L targets.interval_list] \
    #   -o output.vcf

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        # Take the BAM files from this job.  Do not use the loop
        # variables left over from building the jobs above: they
        # always refer to the last normal/cancer pair.
        UNHASHABLE = [
            ("I:normal", sq(j.normal_bamfile)),
            ("I:tumor", sq(j.cancer_bamfile)),
            # --dbsnp and --cosmic use two dashes, for some
            # reason.  Since make_GATK_command only uses one dash,
            # add one manually.
            ("-dbsnp", sq(dbsnp_file)),
            ("-cosmic", sq(cosmic_file)),
            ]
        x = alignlib.make_GATK_command(
            T="MuTect2",
            R=sq(ref.fasta_file_full),
            L=sq(interval_node.identifier),
            o=sq(j.vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        # Quote the log file like every other path in the command.
        x = "%s >& %s" % (x, sq(j.log_outfile))
        commands.append(x)
    assert len(commands) == len(jobs)
    # NOTE(review): 25 is presumably the RAM needed per process --
    # confirm against calc_max_procs_from_ram.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files
    # before the VCF files.  If there's an error, the VCF files
    # may not be created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, j in enumerate(jobs):
        # Pull out the error lines.  Close the log file promptly
        # rather than leaking one handle per job.
        with open(j.log_outfile) as handle:
            error_lines = [
                line for line in handle if line.startswith("##### ERROR")]
        errors = "".join(error_lines)
        # Report the sample of the job that failed.
        msg = "MuTect2 error [%s]:\n%s\n%s" % (
            j.cancer_sample, commands[i], errors)
        assert not errors, msg

    # Make sure output VCF files exist.
    x = [j.vcf_outfile for j in jobs]
    filelib.assert_exists_many(x)

    # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
    # them with the actual names.
    for j in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            j.vcf_outfile, j.normal_sample, j.cancer_sample)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Keep the variants that were called by enough callers.

    Reads the simple variant matrix in in_data.identifier and counts,
    for each coordinate and sample, the number of callers with a real
    call.  Writes a tab-delimited matrix to out_filename with the
    annotation columns followed by one column per sample.  Only
    coordinates where at least one sample has "num_callers" or more
    calls are written.  A sample column holds the number of callers
    if it reaches the cutoff, and is blank otherwise.

    Returns an (empty) metadata dict.
    """
    #from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simple_file = in_data.identifier
    metadata = {}

    num_callers = mlib.get_user_option(
        user_options, "num_callers", not_empty=True, type=int)
    assert num_callers >= 0 and num_callers < 100

    var_matrix = SimpleVariantMatrix.read(simple_file)
    annot_matrix = var_matrix.annot_matrix
    call_matrix = var_matrix.call_matrix

    # For each coord and sample, count the number of callers.
    coord2sample2nc = {}  # (chrom, pos, ref, alt) -> sample -> num callers
    for x in call_matrix.coord2samplecaller2call.iteritems():
        coord, samplecaller2call = x
        sample2nc = coord2sample2nc.setdefault(coord, {})
        for (sample, caller), call in samplecaller2call.iteritems():
            # Make sure this is a real call.
            if not (call.num_ref or call.num_alt or call.total or call.vaf):
                continue
            sample2nc[sample] = sample2nc.get(sample, 0) + 1

    # Make a list of the coordinates that have the right number of calls.
    calls = {}  # coord -> sample -> nc
    for coord, sample2nc in coord2sample2nc.iteritems():
        for sample, nc in sample2nc.iteritems():
            if nc < num_callers:
                continue
            calls.setdefault(coord, {})[sample] = nc

    header = annot_matrix.headers + var_matrix.samples

    # Cache the annotation columns for convenience.
    annot_columns = [
        annot_matrix.header2annots[h] for h in annot_matrix.headers_h]

    chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
    ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
    pos = [int(x) for x in pos]

    # Write the matrix inside a context manager, so the file is
    # flushed and closed even if a row fails.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(header) + "\n")
        for i, coord in enumerate(zip(chrom, pos, ref, alt)):
            if coord not in calls:
                continue
            sample2nc = calls[coord]
            row0 = [column[i] for column in annot_columns]
            # Blank if this sample does not have enough callers.
            row1 = [sample2nc.get(s, "") for s in var_matrix.samples]
            row = row0 + row1
            assert len(row) == len(header)
            handle.write("\t".join(map(str, row)) + "\n")

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Filter the calls in a variant matrix by read counts and VAF.

    A call is removed if it fails any of the enabled filters:
      filter_by_min_alt_reads    number of alt reads (enabled if > 0)
      filter_by_min_total_reads  total number of reads (enabled if > 0)
      filter_by_min_vaf          variant allele frequency
                                 (enabled if >= 1E-6)
    A call whose value is missing (None) fails an enabled filter.

    Rows that are left with no calls are written to "discarded.txt"
    in the current directory.  The remaining rows, with the failing
    calls blanked out, are written to out_filename.

    Returns an (empty) metadata dict.  Raises AssertionError if an
    option is out of range or no filter is enabled.
    """
    import itertools
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    # A min_vaf below this is treated as "no VAF filter".
    MIN_VAF_CUTOFF = 1E-6

    summary_file = in_data.identifier
    metadata = {}

    min_alt_reads = mlib.get_user_option(
        user_options, "filter_by_min_alt_reads", not_empty=True, type=int)
    assert min_alt_reads >= 0 and min_alt_reads < 10000

    min_total_reads = mlib.get_user_option(
        user_options, "filter_by_min_total_reads", not_empty=True, type=int)
    assert min_total_reads >= 0 and min_total_reads < 10000

    min_vaf = mlib.get_user_option(
        user_options, "filter_by_min_vaf", not_empty=True, type=float)
    assert min_vaf >= 0.0 and min_vaf < 1.0

    # At least one of the filters must be enabled.  Filtering on VAF
    # alone is a valid configuration, so it counts as well.
    filter_alt = min_alt_reads > 0
    filter_total = min_total_reads > 0
    filter_vaf = min_vaf >= MIN_VAF_CUTOFF
    assert filter_alt or filter_total or filter_vaf, "No filter"

    matrix = SimpleVariantMatrix.read_as_am(summary_file)

    # copy.deepcopy is very slow.  Try to avoid it.
    # Strategy:
    # 1.  Make a list of the changes to be made.
    # 2.  Save the filtered rows.
    # 3.  Make the changes.
    # 4.  Save the non-filtered rows.
    I_remove = {}     # i -> 1
    call_remove = {}  # i -> (sample, caller) -> 1

    # Optimization: normalize the headers for the samples and callers
    # once, rather than for every row.
    sample_callers = list(
        itertools.product(matrix.samples, matrix.callers))
    sc2header = {}    # (sample, caller) -> header_h
    for sc in sample_callers:
        sample, caller = sc
        header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
        header_h = matrix.normalize_header(header)
        assert header_h
        sc2header[sc] = header_h

    for i in range(matrix.num_annots()):
        has_calls = False  # whether this row has any calls left.
        for sc in sample_callers:
            call_str = matrix.header2annots[sc2header[sc]][i]
            if not call_str:
                continue
            call = SimpleVariantMatrix._parse_call(call_str)

            filt = False
            # filter_by_min_alt_reads
            if filter_alt and (
                    call.num_alt is None or call.num_alt < min_alt_reads):
                filt = True
            # filter_by_min_total_reads
            if filter_total and (
                    call.total is None or call.total < min_total_reads):
                filt = True
            # filter_by_min_vaf
            if filter_vaf and (call.vaf is None or call.vaf < min_vaf):
                filt = True

            if filt:
                call_remove.setdefault(i, {})[sc] = 1
            else:
                has_calls = True
        # If this coordinate has no more calls, then remove the
        # whole row.
        if not has_calls:
            I_remove[i] = 1
    I_remove = sorted(I_remove)

    # Write out a matrix of the discarded rows.  Do this before the
    # calls are blanked out, so the discarded rows keep their calls.
    filtered_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
    SimpleVariantMatrix.write_from_am("discarded.txt", filtered_matrix)

    # Remove the calls.
    for i in call_remove:
        for sc in call_remove[i]:
            header_h = sc2header[sc]
            call_str = matrix.header2annots[header_h][i]
            assert call_str
            matrix.header2annots[header_h][i] = ""

    # Which rows to keep.
    I_remove_dict = {}.fromkeys(I_remove)
    I_keep = [
        i for i in range(matrix.num_annots()) if i not in I_remove_dict]
    filtered_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
    SimpleVariantMatrix.write_from_am(out_filename, filtered_matrix)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call variants with Radia on matched DNA-Seq and RNA-Seq BAM files.

    For every (normal, cancer) pair and every usable chromosome this
    runs three external steps (radia.py, filterRadia.py and
    mergeChroms.py) and then cleans each merged VCF file with
    alignlib.clean_radia_vcf.

    Args:
        network: Not used here.
        antecedents: Four nodes, in order: DNA BAM files, RNA BAM
            files, normal/cancer sample file, reference genome.
        out_attributes: Not used here.
        user_options: Must provide "radia_genome_assembly" (only
            "hg19" is accepted) and "snp_eff_genome".
        num_cores: Maximum number of processes to run in parallel.
        out_path: Directory that receives one <cancer_sample>.vcf per
            normal/cancer pair.

    Returns:
        A metadata dict with the keys "tool", "num_cores" and
        "commands".

    Raises:
        AssertionError: If inputs are missing, or a log or output file
            is not in the expected state after a step.
    """
    import collections
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    # For debugging.
    RUN_VARIANT_CALLING = True
    FILTER_CALLS = True
    MERGE_CALLS = True
    FIX_VCF_FILES = True

    dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
    dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
    assert dna_bam_filenames, "No DNA .bam files."
    rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
    assert rna_bam_filenames, "No RNA .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

    # sample -> bam filename
    dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
    rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
    # Make sure files exist for all the samples.  The DNA-Seq should
    # have both normal and cancer.  RNA is not needed for the normal
    # sample.
    mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
    mlib.assert_normal_cancer_samples(
        nc_match, rnasample2bamfile, ignore_normal_sample=True)

    # Make sure Radia and snpEff are configured.
    radia_genome_assembly = mlib.get_user_option(
        user_options, "radia_genome_assembly", not_empty=True)
    assert radia_genome_assembly == "hg19", "Only hg19 handled."
    snp_eff_genome = mlib.get_user_option(
        user_options, "snp_eff_genome", not_empty=True)

    radia_path = mlib.get_config("radia_path", assert_exists=True)
    snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
    radia_files = get_radia_files(radia_path, radia_genome_assembly)

    # Make a list of the chromosomes to use.  Look at only the
    # chromosomes that are present in all files.  list() keeps the
    # concatenation valid whether values() returns a list or a view.
    all_bamfiles = (
        list(dnasample2bamfile.values()) + list(rnasample2bamfile.values()))
    chroms = list_common_chromosomes(all_bamfiles)
    assert chroms, "No chromosomes found in all files."
    # Only use the chromosomes that can be filtered by Radia.
    chroms = filter_radia_chromosomes(chroms, radia_files)

    # Make output directories.
    radia_outpath = "radia1.tmp"
    filter_outpath = "radia2.tmp"
    merge_outpath = "radia3.tmp"
    for path in (radia_outpath, filter_outpath, merge_outpath):
        if not os.path.exists(path):
            os.mkdir(path)

    # Steps:
    # 1.  Call variants (radia.py)
    #     -o <file.vcf>
    # 2.  Filter variants (filterRadia.py)
    #     <outpath>
    #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
    # 3.  Merge (mergeChroms.py)
    #     Takes as input: <filter_outpath>
    #     Produces: <merge_outpath>/<patient_id>.vcf

    # One job per (normal/cancer pair, chromosome).  A namedtuple keeps
    # the fields addressable by name instead of by position.
    Job = collections.namedtuple("Job", [
        "normal_sample", "cancer_sample", "chrom",
        "normal_bamfile", "dna_tumor_bamfile", "rna_tumor_bamfile",
        "radia_vcf_outfile", "filter_vcf_outfile", "merge_vcf_outfile",
        "final_vcf_outfile",
        "radia_logfile", "filter_logfile", "merge_logfile",
        ])
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = dnasample2bamfile[normal_sample]
        dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
        rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

        merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
        merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
        final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

        for chrom in chroms:
            stem = "%s_chr%s" % (cancer_sample, chrom)
            job = Job(
                normal_sample, cancer_sample, chrom,
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
                opj(radia_outpath, "%s.vcf" % stem),
                opj(filter_outpath, "%s.vcf" % stem),
                merge_vcf_outfile,
                final_vcf_outfile,
                opj(radia_outpath, "%s.log" % stem),
                opj(filter_outpath, "%s.log" % stem),
                merge_logfile,
                )
            jobs.append(job)

    # Since Radia doesn't work well if there are spaces in the
    # filenames, symlink these files here to guarantee that there
    # are no spaces.
    normal_path = "normal.bam"
    dna_path = "dna.bam"
    rna_path = "rna.bam"
    for path in (normal_path, dna_path, rna_path):
        if not os.path.exists(path):
            os.mkdir(path)
    for i, job in enumerate(jobs):
        clean_normal = hash_and_symlink_bamfile(
            job.normal_bamfile, normal_path)
        clean_dna = hash_and_symlink_bamfile(job.dna_tumor_bamfile, dna_path)
        clean_rna = hash_and_symlink_bamfile(job.rna_tumor_bamfile, rna_path)
        jobs[i] = job._replace(
            normal_bamfile=clean_normal,
            dna_tumor_bamfile=clean_dna,
            rna_tumor_bamfile=clean_rna)

    # Generate the commands for doing variant calling.
    python = mlib.get_config("python", which_assert_file=True)

    # filterRadia.py calls the "blat" command, and there's no way
    # to set the path.  Make sure "blat" is executable.
    if not filelib.which("blat"):
        # Find "blat" in the configuration and add it to the path.
        x = mlib.get_config("blat", which_assert_file=True)
        path = os.path.dirname(x)
        # PATH may be unset entirely; do not fail with a KeyError.
        old_path = os.environ.get("PATH")
        if old_path:
            path = "%s:%s" % (old_path, path)
        os.environ["PATH"] = path
        # Make sure it's findable now.
        filelib.which_assert("blat")

    # STEP 1.  Call variants with radia.py.
    # python radia.py test31 5 \
    #   -n bam04/PIM001_G.bam \
    #   -t bam04/196B-MG.bam \
    #   -r bam34/196B-MG.bam \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   -o test32.vcf
    #   --dnaTumorMitochon MT \
    #   --rnaTumorMitochon MT \
    sq = mlib.sq
    commands = []
    for job in jobs:
        x = [
            sq(python),
            sq(radia_files.radia_py),
            job.cancer_sample,
            job.chrom,
            "-n", sq(job.normal_bamfile),
            "-t", sq(job.dna_tumor_bamfile),
            "-r", sq(job.rna_tumor_bamfile),
            "-f", sq(ref.fasta_file_full),
            "-o", job.radia_vcf_outfile,
            ]
        if "MT" in chroms:
            x += [
                "--dnaNormalMitochon MT",
                "--dnaTumorMitochon MT",
                "--rnaTumorMitochon MT",
                ]
        x = " ".join(x)
        x = "%s >& %s" % (x, job.radia_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Only uses ~200 Mb of ram.
    if RUN_VARIANT_CALLING:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["num_cores"] = num_cores
    metadata["commands"] = commands

    # Make sure log files are empty.
    logfiles = [job.radia_logfile for job in jobs]
    filelib.assert_exists_z_many(logfiles)

    # STEP 2.  Filter variants with filterRadia.py.
    commands = []
    for job in jobs:
        x = [
            sq(python),
            sq(radia_files.filterRadia_py),
            job.cancer_sample,
            job.chrom,
            sq(job.radia_vcf_outfile),
            sq(filter_outpath),
            sq(radia_files.scripts_dir),
            "-b", sq(radia_files.blacklist_dir),
            "-d", sq(radia_files.snp_dir),
            "-r", sq(radia_files.retro_dir),
            "-p", sq(radia_files.pseudo_dir),
            "-c", sq(radia_files.cosmic_dir),
            "-t", sq(radia_files.target_dir),
            "-s", sq(snp_eff_path),
            "-e", snp_eff_genome,
            "--rnaGeneBlckFile", sq(radia_files.rnageneblck_file),
            "--rnaGeneFamilyBlckFile", sq(
                radia_files.rnagenefamilyblck_file),
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, job.filter_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    # Sometimes samtools crashes in the middle of a run.  Detect
    # this case, and re-run the analysis if needed.
    py_commands = []
    for job, cmd in zip(jobs, commands):
        args = cmd, job.cancer_sample, job.chrom, job.filter_logfile
        x = _run_filterRadia_with_restart, args, {}
        py_commands.append(x)
    # Takes ~10 Gb each.
    # NOTE(review): the value passed below is 25, not 10 -- confirm
    # which figure is intended.
    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    if FILTER_CALLS:
        parallel.pyfun(py_commands, num_procs=nc)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = [job.filter_logfile for job in jobs]
    filelib.assert_exists_z_many(logfiles)

    # Make sure filter_vcf_outfile exists.
    outfiles = [job.filter_vcf_outfile for job in jobs]
    filelib.assert_exists_nz_many(outfiles)

    # STEP 3.  Merge the results.
    # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
    #   radia2.tmp/ radia3.tmp
    # The "/" after radia2.tmp is important.  If not given,
    # will generate some files with only newlines.
    fo = filter_outpath
    if not fo.endswith("/"):
        fo = "%s/" % fo
    commands = []
    for job in jobs:
        x = [
            sq(python),
            sq(radia_files.mergeChroms_py),
            job.cancer_sample,
            fo,
            merge_outpath,
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, job.merge_logfile)
        commands.append(x)
    assert len(commands) == len(jobs)
    # Since the chromosomes were separated for the previous steps,
    # this will generate one merge for each chromosome.  This is
    # unnecessary, since we only need to merge once per sample.
    # Get rid of duplicates.
    commands = sorted(set(commands))
    if MERGE_CALLS:
        parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] += commands

    # Make sure log files are empty.
    logfiles = sorted(set(job.merge_logfile for job in jobs))
    filelib.assert_exists_z_many(logfiles)

    # Fix the VCF files.  There is one job per chromosome, but only
    # one merged VCF per sample.  Clean each merged file exactly once
    # so parallel workers never write the same final VCF concurrently.
    commands = []
    seen = set()
    for job in jobs:
        args = job.normal_sample, job.cancer_sample, \
            job.merge_vcf_outfile, job.final_vcf_outfile
        if args in seen:
            continue
        seen.add(args)
        x = alignlib.clean_radia_vcf, args, {}
        commands.append(x)
    if FIX_VCF_FILES:
        parallel.pyfun(commands, num_procs=num_cores)

    # Make sure output VCF files exist.
    x = sorted(set(job.final_vcf_outfile for job in jobs))
    filelib.assert_exists_nz_many(x)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run GATK IndelRealigner on each BAM file with its .intervals file.

    antecedents holds three nodes, in order: BAM files, reference
    genome, and target interval files.  BAM and interval files are
    paired by file name without the extension.  Each job writes
    <sample>.bam and <sample>.log into out_path.  At least one of the
    user options "realign_known_sites1" to "realign_known_sites3" must
    be set.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node, target_node = antecedents
    bam_filenames = module_utils.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    target_filenames = filelib.list_files_in_path(
        target_node.identifier, endswith=".intervals")
    assert target_filenames, "No .intervals files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    assert len(bam_filenames) == len(target_filenames), \
        "Should have an .intervals file for each bam file."

    def index_by_sample(filenames):
        # Map <file name without extension> -> full filename.  Each
        # sample name may occur only once.
        sample2filename = {}
        for filename in filenames:
            root = os.path.splitext(os.path.basename(filename))[0]
            assert root not in sample2filename
            sample2filename[root] = filename
        return sample2filename

    sample2bamfilename = index_by_sample(bam_filenames)
    sample2targetfilename = index_by_sample(target_filenames)
    assert len(sample2bamfilename) == len(sample2targetfilename)

    missing = [
        name for name in sample2bamfilename
        if name not in sample2targetfilename]
    assert not missing, "Missing interval files for %d bam files." % \
        len(missing)

    # list of (bam_filename, target_filename, log_filename, out_filename)
    jobs = []
    for sample in sample2bamfilename:
        # The key is already the BAM file name without its extension.
        jobs.append((
            sample2bamfilename[sample],
            sample2targetfilename[sample],
            os.path.join(out_path, "%s.log" % sample),
            os.path.join(out_path, "%s.bam" % sample),
            ))

    # Collect the known-sites files the user provided.
    known_sites = []
    for option in [
            "realign_known_sites1", "realign_known_sites2",
            "realign_known_sites3"]:
        value = module_utils.get_user_option(
            user_options, option, check_file=True)
        if value:
            known_sites.append(value)
    assert known_sites

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar \
    #   -T IndelRealigner -R <ref.fa> \
    #   -I <bam_file> -targetIntervals <target_file> -o <bam_file>

    # Make a list of commands.
    commands = []
    for bam_filename, target_filename, log_filename, out_filename in jobs:
        # Build a fresh list for every command.
        known_args = [("known", site) for site in known_sites]
        cmd = alignlib.make_GATK_command(
            T="IndelRealigner", R=ref.fasta_file_full, I=bam_filename,
            targetIntervals=target_filename, o=out_filename,
            _UNHASHABLE=known_args)
        commands.append("%s >& %s" % (cmd, log_filename))

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    out_filenames = [job[-1] for job in jobs]
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run GATK RealignerTargetCreator on every BAM file.

    Args:
        network: Not used here.
        antecedents: Two nodes, in order: BAM files, reference genome.
        out_attributes: Not used here.
        user_options: Reads "filter_reads_with_N_cigar" ("no"/"yes")
            and "realign_known_sites1" to "realign_known_sites3"; at
            least one known-sites file must be given.
        num_cores: Upper bound on the number of processes.
        out_path: Directory that receives <name>.intervals and
            <name>.log for each input BAM file.

    Returns:
        A metadata dict with the keys "num_procs" and "commands".

    Raises:
        AssertionError: If there are no BAM files, no known-sites
            files, or an expected output file is missing or empty.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, ref_node = antecedents
    in_filenames = mlib.find_bam_files(bam_node.identifier)
    assert in_filenames, "No .bam files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []  # list of (in_filename, log_filename, out_filename)
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        f, ext = os.path.splitext(f)
        log_filename = os.path.join(out_path, "%s.log" % f)
        out_filename = os.path.join(out_path, "%s.intervals" % f)
        x = in_filename, log_filename, out_filename
        jobs.append(x)

    filter_reads_with_N_cigar = mlib.get_user_option(
        user_options, "filter_reads_with_N_cigar",
        allowed_values=["no", "yes"])

    x1 = mlib.get_user_option(
        user_options, "realign_known_sites1", check_file=True)
    x2 = mlib.get_user_option(
        user_options, "realign_known_sites2", check_file=True)
    x3 = mlib.get_user_option(
        user_options, "realign_known_sites3", check_file=True)
    known_sites = [x for x in [x1, x2, x3] if x]
    assert known_sites

    # I/O bound, so not likely to get a big speedup with nt.

    # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
    #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
    #   --known <known_vcf_file>

    # RealignerTargetCreator takes ~10Gb per process.  Each thread
    # takes the full amount of memory.
    nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

    # Make a list of commands.
    commands = []
    for x in jobs:
        in_filename, log_filename, out_filename = x

        # Split the allowed processes evenly over the jobs.  Floor
        # division keeps the thread count an integer regardless of
        # whether "/" performs true division.
        n = max(1, nc // len(jobs))
        x = [("-known", site) for site in known_sites]
        if filter_reads_with_N_cigar == "yes":
            x.append(("-filter_reads_with_N_cigar", None))
        x = alignlib.make_GATK_command(
            nt=n, T="RealignerTargetCreator", R=ref.fasta_file_full,
            I=in_filename, o=out_filename, _UNHASHABLE=x)
        x = "%s >& %s" % (x, log_filename)
        commands.append(x)

    parallel.pshell(commands, max_procs=nc)
    metadata["num_procs"] = nc
    metadata["commands"] = commands

    # Make sure the analysis completed successfully.
    out_filenames = [x[-1] for x in jobs]
    filelib.assert_exists_nz_many(out_filenames)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Cluster a data set with the SOM algorithm of cluster30.

    Currently disabled: after creating out_path the function raises
    NotImplementedError unconditionally, so the clustering code below
    the raise never executes.
    """
    import os
    import shutil
    from genomicode import filelib
    from genomicode import cluster30
    from Betsy import module_utils as mlib
    import cluster_genes_by_hierarchical

    filelib.safe_mkdir(out_path)
    metadata = {}

    # NOTE(review): everything after this line is unreachable.
    raise NotImplementedError

    yes_no = ["yes", "no"]
    distance_measures = cluster30.DIST2ID.keys()

    cluster_genes = mlib.get_user_option(
        user_options, "cluster_genes", not_empty=True,
        allowed_values=yes_no)
    cluster_arrays = mlib.get_user_option(
        user_options, "cluster_arrays", not_empty=True,
        allowed_values=yes_no)
    distance_metric = mlib.get_user_option(
        user_options, "distance_measure", not_empty=True,
        allowed_values=distance_measures)
    som_rows = mlib.get_user_option(
        user_options, "som_rows", not_empty=True, type=int)
    som_cols = mlib.get_user_option(
        user_options, "som_cols", not_empty=True, type=int)
    assert 1 <= som_rows < 100
    assert 1 <= som_cols < 100

    jobname = "cluster"
    do_genes = (cluster_genes == "yes")
    do_arrays = (cluster_arrays == "yes")
    cmd = cluster30.cluster30_file(
        in_data.identifier, do_genes, do_arrays, "som",
        distance=distance_metric, som_rows=som_rows, som_cols=som_cols,
        jobname=jobname)
    metadata["command"] = cmd

    # Find the output files and name them appropriately.
    cluster_files = cluster30._find_cluster_files(jobname)
    cluster_genes_by_hierarchical.fix_cluster30_dup_header(
        cluster_files["cdt"])

    # The "txt" result is copied to signal.cdt.  The array (kag) and
    # gene (kgg) cluster files are not copied.
    out_cdt_file = os.path.join(out_path, "signal.cdt")
    assert "txt" in cluster_files
    shutil.copy2(cluster_files["txt"], out_cdt_file)

    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run GATK VariantRecalibrator in SNP mode on every .vcf file.

    antecedents holds two nodes, in order: VCF files and reference
    genome.  For each input <name>.vcf the recalibration, tranches, R
    script and log files are written to out_path.  The four training
    resource files are read from the user options "vcf_recal_dbsnp",
    "vcf_recal_mills_indels", "vcf_recal_1kg_indels" and
    "vcf_recal_omni".
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    vcf_node, ref_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # list of (in_filename, log_filename, recal_filename,
    #          tranches_filename, rscript_filename)
    opj = os.path.join
    jobs = []
    for in_filename in vcf_filenames:
        root = os.path.splitext(os.path.basename(in_filename))[0]
        out_filename = opj(out_path, "%s.grp" % root)
        assert in_filename != out_filename
        jobs.append((
            in_filename,
            opj(out_path, "%s.log" % root),
            opj(out_path, "%s.recalibrate_SNP.recal" % root),
            opj(out_path, "%s.recalibrate_SNP.tranches" % root),
            opj(out_path, "%s.recalibrate_SNP_plots.R" % root),
            ))

    # -resource:dbsnp,known=true,training=false,truth=false,prior=6.0
    #    dbsnp_135.b37.vcf
    # -resource:hapmap,known=false,training=true,truth=true,prior=15.0
    #    hapmap_3.3.b37.sites.vcf
    # -resource:1000G,known=false,training=true,truth=false,prior=10.0
    #    1000G_phase1.snps.high_confidence.vcf
    # -resource:omni,known=false,training=true,truth=false,prior=12.0
    #    1000G_omni2.5.b37.sites.vcf
    # NOTE(review): the hapmap and 1000G resources are fed from the
    # "mills_indels" and "1kg_indels" options -- confirm the pairing.
    resources = [
        ("resource:dbsnp,known=true,training=false,truth=false,prior=6.0",
         "vcf_recal_dbsnp"),
        ("resource:hapmap,known=false,training=true,truth=true,prior=15.0",
         "vcf_recal_mills_indels"),
        ("resource:1000G,known=false,training=true,truth=false,prior=10.0",
         "vcf_recal_1kg_indels"),
        ("resource:omni,known=false,training=true,truth=false,prior=12.0",
         "vcf_recal_omni"),
        ]
    known_sites = []
    for resource, option in resources:
        filename = module_utils.get_user_option(
            user_options, option, not_empty=True, check_file=True)
        known_sites.append((resource, filename))

    # Names of the annotations to use for recalibration.
    AN = [
        "DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum",
        "InbreedingCoeff"]
    TRANCHE = ["100.0", "99.9", "99.0", "90.0"]

    # Make a list of commands.
    commands = []
    for job in jobs:
        (in_filename, log_filename, recal_filename, tranches_filename,
         rscript_filename) = job
        # Build a fresh argument list for every command.
        unhash = list(known_sites)
        unhash.extend(("an", name) for name in AN)
        unhash.extend(("tranche", value) for value in TRANCHE)
        cmd = alignlib.make_GATK_command(
            T="VariantRecalibrator", R=ref.fasta_file_full,
            input=in_filename, mode="SNP", recalFile=recal_filename,
            tranchesFile=tranches_filename, rscriptFile=rscript_filename,
            _UNHASHABLE=unhash)
        commands.append("%s >& %s" % (cmd, log_filename))

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.  The last element
    # of each job is the R script file.
    out_filenames = [job[-1] for job in jobs]
    filelib.assert_exists_nz_many(out_filenames)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Filter the variants of a SimpleVariantMatrix file.

    The filters are selected through user options and applied in a
    fixed order.  At least one filter must be requested, and
    "min_callers_in_every_sample" cannot be combined with
    "min_callers_in_any_sample".  The filtered matrix is written to
    out_filename.

    Returns a metadata dict whose "commands" entry describes every
    filter that was applied.
    """
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    simplematrix_file = in_data.identifier
    filelib.assert_exists_nz(simplematrix_file)
    metadata = {}

    def read_yes_no(name):
        # True only if the option is set to "yes".
        value = mlib.get_user_option(
            user_options, name, allowed_values=["no", "yes"])
        return value == "yes"

    def read_threshold(name, type_):
        # None if the option was left empty, else the converted value.
        value = mlib.get_user_option(user_options, name, type=type_)
        if value != "":
            return value
        return None

    nonsynonymous_and_stopgain_only = read_yes_no(
        "nonsynonymous_and_stopgain_only")
    sift_polyphen_damaging = read_yes_no("sift_polyphen_damaging")

    min_coverage_in_every_sample = read_threshold(
        "min_coverage_in_every_sample", int)
    min_callers_in_every_sample = read_threshold(
        "min_callers_in_every_sample", int)
    min_callers_in_any_sample = read_threshold(
        "min_callers_in_any_sample", int)
    min_gene_expression_in_every_sample = read_threshold(
        "min_gene_expression_in_every_sample", float)

    assert not (min_callers_in_every_sample and min_callers_in_any_sample)
    # Truthiness test, so a threshold of 0 does not count as a filter.
    has_filter = any([
        nonsynonymous_and_stopgain_only,
        sift_polyphen_damaging,
        min_callers_in_every_sample,
        min_callers_in_any_sample,
        min_gene_expression_in_every_sample,
        min_coverage_in_every_sample,
        ])
    assert has_filter, "No filters"

    MATRIX = SimpleVariantMatrix.read_as_am(simplematrix_file)

    commands = []
    # Whether the file carries the needed annotations is left to the
    # filter functions; the input attributes are not checked here.
    if nonsynonymous_and_stopgain_only:
        MATRIX = filter_nonsynonymous(MATRIX)
        commands.append("Keep only nonsynonymous and stopgain variants.")
    if sift_polyphen_damaging:
        MATRIX = filter_sift_polyphen_damaging(MATRIX)
        commands.append(
            "Keep only if predicted to be damaging by SIFT or Polyphen2.")
    if min_coverage_in_every_sample is not None:
        MATRIX = filter_min_coverage_in_every_sample(
            MATRIX, min_coverage_in_every_sample)
        commands.append(
            "Keep only variants with coverage >= %d in every sample." %
            min_coverage_in_every_sample)
    if min_callers_in_every_sample is not None:
        MATRIX = filter_min_callers_in_every_sample(
            MATRIX, min_callers_in_every_sample)
        commands.append(
            "Keep only variants called with >= %d callers in every sample."
            % min_callers_in_every_sample)
    if min_callers_in_any_sample is not None:
        MATRIX = filter_min_callers_in_any_sample(
            MATRIX, min_callers_in_any_sample)
        commands.append(
            "Keep only variants called with >= %d callers "
            "in at least one sample." % min_callers_in_any_sample)
    if min_gene_expression_in_every_sample is not None:
        MATRIX = filter_min_gene_expression_in_every_sample(
            MATRIX, min_gene_expression_in_every_sample)
        commands.append(
            "Keep only variants with gene expression >= %g "
            "in every sample." % min_gene_expression_in_every_sample)
    metadata["commands"] = commands

    SimpleVariantMatrix.write_from_am(out_filename, MATRIX)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call peaks with MACS 1.4 for one treatment sample.

    Args:
        network: Not used here.
        antecedents: Two nodes, in order: BAM folder, sample group
            file.
        out_attributes: Not used here.
        user_options: Requires "treatment_sample" and "macs_genome";
            "control_sample" and "macs_shiftsize" are optional.
        num_cores: Not used here.
        out_path: Directory in which MACS is run and where its output
            files are expected.

    Raises:
        AssertionError: If a sample is unknown, a BAM file cannot be
            found, or <name>_peaks.xls / <name>_summits.bed is missing
            or empty after the run.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(
        user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.  It becomes the prefix of the output files, so
    # build it from the hashed sample names only.  The raw treatment
    # sample name must not leak into it.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name,
        genome_size=genome_size, shiftsize=shiftsize,
        save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        cmd = [parallel.quote(Rscript), model_file]
        parallel.sshell(cmd, path=out_path)

    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Add linkage annotations to a SimpleVariantMatrix file.

    Reads the file named by the "linked_variants_file" user option
    (columns used: Chrom, Pos, Perc Linked, p), matches its rows to the
    variants by (chromosome, position), and inserts two columns,
    "Linkage______Perc Linked" and "Linkage______Score", at column
    index 4.  The score is -10*log10(p), capped at 1000.  Variants
    without linkage information get empty values.  The result is
    written to outfile.
    """
    import math
    from genomicode import filelib
    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix
    from Betsy import module_utils as mlib

    svm_node = in_data
    filelib.assert_exists_nz(svm_node.identifier)

    linked_file = mlib.get_user_option(
        user_options, "linked_variants_file", not_empty=True,
        check_file=True)

    # Read the variant file.
    SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
    CHROM = SVM["______Chrom"]
    POS = [int(p) for p in SVM["______Pos"]]
    variant_coords = list(zip(CHROM, POS))
    all_coords = set(variant_coords)

    # Read the linked variant file.
    # Chrom  Pos  Perc Linked  p
    coord2info = {}  # (chrom, pos) -> row
    for row in filelib.read_row(linked_file, header=1):
        coord = (row.Chrom, int(row.Pos))
        if coord in all_coords:
            coord2info[coord] = row

    # Align the linked annotations to the matrix.
    MAX_SCORE = 1000
    min_p = 10**-(MAX_SCORE / 10)
    linked_headers = ["Perc Linked", "Score"]
    annotations = []
    for coord in variant_coords:
        row = coord2info.get(coord)
        if row is None:
            annotations.append([""] * len(linked_headers))
            continue
        p_value = float(row.p)
        if p_value >= min_p:
            score = -10 * math.log(p_value, 10)
        else:
            # p is too small to take the log of; use the cap.
            score = MAX_SCORE
        values = row.Perc_Linked, score
        assert len(values) == len(linked_headers)
        annotations.append(values)

    # Convert the headers and annotations to SVM format.
    linked_headers = ["Linkage______%s" % h for h in linked_headers]
    linked_annotations = jmath.transpose(annotations)

    # Make the new SimpleVariantMatrix.  The new columns always go at
    # a fixed position.
    INDEX = 4
    headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
    old_annots = [SVM.header2annots[h] for h in SVM.headers_h]
    all_annots = (
        old_annots[:INDEX] + linked_annotations + old_annots[INDEX:])
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=SVM.headerlines)

    SimpleVariantMatrix.write_from_am(outfile, merged)