def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Index a reference genome with ``bwa index``.

    The reference named by ``in_data.identifier`` is prepared under
    ``out_path`` by ``alignlib.standardize_reference_genome`` (called with
    ``use_symlinks=True``) and ``bwa index`` is then run on its FASTA file
    with ``out_path`` passed as the ``path`` argument of ``sshell``.

    ``network``, ``out_attributes``, ``user_options`` and ``num_cores`` are
    not used by this module.

    Raises:
        AssertionError: if one of the expected index files
            (.amb, .ann, .bwt, .pac, .sa) fails ``filelib.exists_nz``.
    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bwa = filelib.which_assert(config.bwa)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    # bwa index <out_stem.fa>
    # Makes files:
    # <out_stem>.fa.amb .ann .bwt .pac .sa
    sq = parallel.quote
    cmd = [
        sq(bwa),
        "index",
        sq(ref.fasta_file_full),
    ]
    # Hand sshell a single shell string, as most other modules in this
    # file do, rather than relying on how it treats a list.
    cmd = " ".join(cmd)
    parallel.sshell(cmd, path=out_path)

    # Make sure the indexing worked properly.
    EXTENSIONS = [".amb", ".ann", ".bwt", ".pac", ".sa"]
    for ext in EXTENSIONS:
        f = "%s%s" % (ref.fasta_file_full, ext)
        assert filelib.exists_nz(f), "Missing: %s" % f
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Run the SPP peak caller on one treatment/control pair of BAM files.

    ``antecedents`` is a ``(bam_node, group_node)`` pair.  The samples to
    compare come from the required ``treatment_sample`` and
    ``control_sample`` user options.  The command is executed with
    ``out_path`` passed as the ``path`` argument of ``sshell`` and its
    output is redirected to ``<treat>_vs_<control>.log`` (names passed
    through ``hashlib.hash_var``).

    Raises:
        AssertionError: if a sample is unknown or has no BAM file.  The
            expected result files are checked with
            ``filelib.assert_exists_nz_many``.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)

    # Set the experiment name.
    experiment_name = "%s_vs_%s" % (
        hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

    # Make sure the samples exist.
    known_samples = [group[1] for group in sample_groups]
    assert treat_sample in known_samples, \
        "Unknown sample: %s" % treat_sample
    assert control_sample in known_samples, \
        "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    log_file = "%s.log" % experiment_name
    spp_cmd = make_pyspp_command(
        treat_filename, control_filename, out_path, num_procs=num_cores)
    parallel.sshell("%s >& %s" % (spp_cmd, log_file), path=out_path)

    expected = [
        "binding.positions.txt",
        #"broadPeak",
        "crosscorrelation.pdf",
        "density.wig",
        "enrichment.estimates.wig",
        "enrichment.wig",
        #"narrowPeak",   # might be empty if no peaks found
        log_file,
    ]
    filelib.assert_exists_nz_many(
        [os.path.join(out_path, name) for name in expected])
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Index a reference genome with ``bowtie2-build``.

    The reference named by ``in_data.identifier`` is prepared under
    ``out_path`` by ``alignlib.standardize_reference_genome`` and the index
    is built with ``ref.name`` as the output stem.

    ``network``, ``out_attributes``, ``user_options`` and ``num_cores`` are
    not used by this module.

    Raises:
        AssertionError: if ``<out_path>/<ref.name>.1.bt2`` fails
            ``filelib.exists_nz`` after the build.
    """
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    bowtie2_build = filelib.which_assert(config.bowtie2_build)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    # bowtie2-build <ref.fa> <output_stem>
    # Makes files:
    # <output_stem>.[1234].bt2
    # <output_stem>.rev.[12].bt2
    sq = parallel.quote
    cmd = [
        sq(bowtie2_build),
        sq(ref.fasta_file_full),
        # Quote the stem too, so an unusual genome name cannot split the
        # command line.
        sq(ref.name),
    ]
    # Hand sshell a single shell string, as most other modules in this
    # file do.
    cmd = " ".join(cmd)
    parallel.sshell(cmd, path=out_path)

    # Check to make sure index was created successfully.
    f = os.path.join(out_path, "%s.1.bt2" % ref.name)
    assert filelib.exists_nz(f), "Missing: %s" % f
def relabel(data_file, rename_file, outfile, user_options):
    """Relabel the columns of ``data_file`` using ``rename_file``.

    Runs the configured ``slice_matrix`` script with ``--relabel_col_ids
    '<rename_file>,<sample_header>'`` and redirects its output (stdout and
    stderr, via ``>&``) to ``outfile``.  ``sample_header`` comes from the
    required ``sample_labels_header`` user option and must be one of the
    tab-separated fields on the first line of ``rename_file``.

    Returns:
        The shell command that was executed, as a string.

    Raises:
        AssertionError: if the header is missing from ``rename_file``.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)

    # Make sure sample_header in rename file.  Only the first line is
    # needed; close the handle as soon as it has been read.
    with open(rename_file) as handle:
        first_line = handle.readline()
    header = first_line.rstrip("\r\n").split("\t")
    assert sample_header in header, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    relabel_arg = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', relabel_arg,
        sq(data_file),
    ]
    cmd = " ".join(cmd)
    # Quote the redirect target so a path with spaces stays one word.
    cmd = "%s >& %s" % (cmd, sq(outfile))
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the configured ``lineplot`` script with ``--gene_names biotin``.

    ``antecedents`` is the single input node; its ``identifier`` is the
    file that gets plotted into ``outfile``.

    Returns:
        dict with key ``"commands"`` holding a one-element list with the
        shell command that was run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    #module_utils.plot_line_keywd(in_data.identifier, 'biotin', outfile)
    lineplot = mlib.get_config("lineplot", which_assert_file=True)

    sq = parallel.quote
    args = [
        sq(lineplot),
        "--gene_names", "biotin",
        "--mar_bottom", 1.50,
        "--yaxis_starts_at_0",
        sq(antecedents.identifier),
        sq(outfile),
    ]
    command = " ".join(str(arg) for arg in args)
    parallel.sshell(command)

    metadata = {"commands": [command]}
    filelib.assert_exists_nz(outfile)
    return metadata
def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Run ``slice_matrix --cpm`` on a signal file.

    The output of the script (stdout and stderr, via ``>&``) is redirected
    to ``outfile``.

    Raises:
        AssertionError: if the input file does not exist.  The output is
            checked with ``filelib.assert_exists_nz``.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    signal_file = in_data.identifier
    assert os.path.exists(signal_file), "Missing: %s" % signal_file

    slice_matrix = filelib.which_assert(config.slice_matrix)

    sq = parallel.quote
    cmd = [
        sq(slice_matrix),
        "--cpm",
        # Quote both paths; they were interpolated raw before, which
        # breaks on file names containing spaces or shell characters.
        sq(signal_file),
    ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, sq(outfile))
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Merge an annotation file into a signal file, row by row.

    ``antecedents`` is a ``(signal_node, annotation_node)`` pair.  The two
    files are first aligned with the ``align_matrices`` script (left join
    on the signal file's ``ID_REF`` column), which writes
    ``signal.aligned.txt`` and ``annot.aligned.txt`` in the current
    directory.  Each output row is then: first signal column, the whole
    annotation row, the remaining signal columns.

    Returns:
        dict with key ``"command"`` holding the alignment command string.

    Raises:
        AssertionError: if the first header of the signal file is not
            ``ID_REF``.  Inputs and the output are also checked with
            ``filelib.assert_exists_nz``.
    """
    import itertools
    from genomicode import config
    from genomicode import parallel
    from genomicode import filelib

    signal_node, annotation_node = antecedents
    signal_filename = signal_node.identifier
    annotation_filename = annotation_node.identifier
    filelib.assert_exists_nz(signal_filename)
    filelib.assert_exists_nz(annotation_filename)
    metadata = {}

    align_matrices = filelib.which_assert(config.align_matrices)

    # Make sure the signal_filename has an ID_REF header.
    header = next(filelib.read_cols(signal_filename))
    assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
           signal_filename

    signal_align_file = "signal.aligned.txt"
    annot_align_file = "annot.aligned.txt"

    # First, align the two files.
    sq = parallel.quote
    cmd = [
        sq(align_matrices),
        "--annot_file", sq(signal_filename),
        "--header", "ID_REF",
        "--annot_file", sq(annotation_filename),
        "--left_join",
        signal_align_file,
        annot_align_file,
    ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
    metadata["command"] = cmd

    # Now merge them.  Take the first column of the expression
    # file (should be ID_REF), the whole annotation file, then the
    # remainder of the expression file.
    signal_handle = filelib.read_cols(signal_align_file)
    annot_handle = filelib.read_cols(annot_align_file)
    # Lazy pairwise iteration on both Python 2 (izip) and 3 (zip).
    izip = getattr(itertools, "izip", zip)
    outhandle = open(outfile, 'w')
    try:
        for x1, x2 in izip(signal_handle, annot_handle):
            x = [x1[0]] + x2 + x1[1:]
            outhandle.write("\t".join(x) + "\n")
    finally:
        outhandle.close()

    filelib.assert_exists_nz(outfile)
    # The metadata was collected but never handed back before.
    return metadata
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the configured ``pcaplot`` script on the input matrix.

    ``antecedents`` is the single input node.  Besides ``outfile`` the
    command is asked to write ``prism.txt``, ``row_components.txt`` and
    ``col_components.txt`` (relative paths).

    Returns:
        dict with key ``"commands"`` holding a one-element list with the
        shell command that was run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # An earlier matplotlib-based implementation (plot_pca with
    # per-class colours) and a "pca_num_genes" option ("-g") are
    # disabled in this version.
    pcaplot = mlib.get_config("pcaplot", which_assert_file=True)

    quote = parallel.quote
    args = [
        quote(pcaplot),
        "--label",
        #"-g", num_genes,
        "--prism_file", "prism.txt",
        "--row_pc_file", "row_components.txt",
        "--col_pc_file", "col_components.txt",
        quote(antecedents.identifier),
        quote(outfile),
    ]
    command = " ".join(str(arg) for arg in args)
    parallel.sshell(command)

    metadata = {"commands": [command]}
    filelib.assert_exists_nz(outfile)
    return metadata
def get_paired_stranded_rseqc(reference_bed, bam_filename):
    """Run RSeQC ``infer_experiment.py`` on a BAM file and parse the result.

    Returns:
        Whatever ``parse_rseqc_infer_experiment`` returns for the script
        output (per the original note: single_or_paired, stranded,
        frac_failed, frac_first, frac_second -- TODO confirm).
    """
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    script = alignlib.find_rseqc_script("infer_experiment.py")
    for required in (reference_bed, bam_filename):
        filelib.assert_exists_nz(required)

    # RSeQC scripts use #!/usr/bin/python, which may not be the right
    # one.  Use the python on the path.
    quote = mlib.sq
    command = " ".join([
        "python",
        quote(script),
        "-r", quote(reference_bed),
        "-i", quote(bam_filename),
    ])
    output = parallel.sshell(command)
    return parse_rseqc_infer_experiment(output)
def count_reads(fastq_filename):
    """Return the number of reads in an uncompressed FASTQ file.

    The first four lines are sanity-checked as one FASTQ record (sequence
    and quality strings of equal length, a bare ``+`` separator), then the
    lines are counted with ``wc -l`` and divided by four (integer
    division).

    Raises:
        AssertionError: if the first record does not look like FASTQ or
            the ``wc -l`` output cannot be parsed.
    """
    # Requires an uncompressed fastq file.
    from genomicode import filelib
    from genomicode import parallel

    sq = parallel.quote

    # Make sure it's a fastq file.
    # @M03807:17:000000000-AHGYH:1:1101:20554:1508 1:N:0:16
    # CTTTACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGA
    # +
    # <BCC@FAFEC8,C<8968<@EEEFFCCFEC@EDEFGGGGA,@,@EFGGF9,,88,@FFA<
    handle = filelib.openfh(fastq_filename)
    record = [handle.readline().strip() for _ in range(4)]
    assert len(record[1]) == len(record[3]), \
        "Sequence and quality lengths differ: %s" % fastq_filename
    assert record[2] == "+", "Not a FASTQ file: %s" % fastq_filename

    wc_out = parallel.sshell("wc -l %s" % sq(fastq_filename))
    # velocitron:biocore$ wc -l test01.txt
    # 22278 test01.txt
    # 0 test 1.txt
    x = wc_out.strip().split()
    # The message needs a %s placeholder; without one the "%" itself
    # raised TypeError and hid the real problem.
    assert len(x) >= 2, "Unknown format from wc -l\n%s" % wc_out
    num_lines = int(x[0])
    # Floor division keeps the Python 2 integer result on Python 3 too.
    num_reads = num_lines // 4
    return num_reads
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Collapse duplicate genes in a matrix with ``slice_matrix``.

    The column used to detect duplicates is the gene-symbol header found by
    ``arrayplatformlib.categorize_headers``, falling back to the gene-ID
    header.  Only ``out_attributes['unique_genes'] == "high_var"`` is
    implemented (``--dedup_row_by_var``).

    Returns:
        dict with key ``"dedup_header"``.

    Raises:
        AssertionError: if no suitable header exists or the algorithm is
            unknown.
        NotImplementedError: for ``average_genes`` and ``first_gene``.
    """
    import arrayio
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import arrayplatformlib as apl
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    M = arrayio.read(in_data.identifier)
    cat2header = apl.categorize_headers(M)
    header = cat2header.get(apl.GENE_SYMBOL)
    if header is None:
        header = cat2header.get(apl.GENE_ID)
    assert header is not None, "I could not find gene IDs or symbols: %s" \
           % in_data.identifier
    metadata["dedup_header"] = header

    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

    sq = parallel.quote
    algorithm = out_attributes['unique_genes']
    if algorithm == "average_genes":
        raise NotImplementedError
    elif algorithm == "high_var":
        dedup_cmd = ["--dedup_row_by_var", sq(header)]
    elif algorithm == "first_gene":
        raise NotImplementedError
    else:
        raise AssertionError("Unknown algorithm: %s" % algorithm)

    cmd = [sq(slice_matrix)]
    cmd += dedup_cmd
    cmd += [sq(in_data.identifier)]
    cmd = " ".join(cmd)
    # Quote the redirect target so a path with spaces stays one word.
    cmd = "%s >& %s" % (cmd, sq(outfile))
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """De-duplicate matrix rows on every annotation column with duplicates.

    For each row-annotation header whose values are not unique,
    ``slice_matrix --dedup_row_by_var <header>`` is applied.  The steps are
    chained: each one reads the previous step's output, so the final file
    is de-duplicated on all of those headers.  Intermediate files are
    written as ``outfile.<i>.txt`` in the current directory.  If nothing is
    duplicated the input is copied to ``outfile`` unchanged.
    """
    import shutil
    import arrayio
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filename = in_data.identifier
    filelib.assert_exists_nz(filename)

    # De-duplicate by every single header.  Not sure if this is
    # right.
    MATRIX = arrayio.read(filename)

    # Figure out which columns have duplicates.
    has_dup = []
    for name in MATRIX.row_names():
        assert name not in has_dup
        annots = MATRIX.row_names(name)
        if len(set(annots)) != len(annots):
            has_dup.append(name)

    if not has_dup:
        shutil.copy2(filename, outfile)
        return

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

    # Previously every iteration re-read the original input, so only
    # the last header's de-duplication survived.  Feed each step's
    # output into the next one instead.
    current = filename
    for i, name in enumerate(has_dup):
        step_file = "outfile.%d.txt" % i
        x = [
            sq(slice_matrix),
            "--dedup_row_by_var", sq(name),
            sq(current),
            ">&", sq(step_file),
        ]
        x = " ".join(map(str, x))
        parallel.sshell(x)
        filelib.assert_exists_nz(step_file)
        current = step_file
    shutil.copy2(current, outfile)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Normalize genes by variance or by sum of squares.

    ``out_attributes["gene_normalize"]`` selects the method:

    * ``"variance"``: the matrix is read as PCL, its values are replaced by
      ``jmath.safe_norm_mv`` of them, and the result is written as PCL.
    * ``"sum_of_squares"``: the configured ``cluster`` program is run with
      ``-ng``; its ``<outfile>.nrm`` result is renamed to ``outfile``.

    Returns:
        dict; for ``sum_of_squares`` it has key ``"command"`` holding the
        shell command string.

    Raises:
        AssertionError: if the option is missing or not one of the two
            supported values.
    """
    import os
    import arrayio
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    metadata = {}
    norm_para = ["variance", "sum_of_squares"]
    assert "gene_normalize" in out_attributes
    normalize = out_attributes["gene_normalize"]
    assert normalize in norm_para, \
           "Invalid normalize option: %s" % normalize

    if normalize == "variance":
        M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
        M_n = jmath.safe_norm_mv(M.slice())
        M._X = M_n
        M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
        # open() instead of the Python-2-only file(); always close the
        # handle, even if writing fails.
        f = open(outfile, 'w')
        try:
            arrayio.pcl_format.write(M_c, f)
        finally:
            f.close()
    elif normalize == "sum_of_squares":
        cluster = mlib.get_config("cluster", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(cluster),
            "-f", sq(in_data.identifier),
            "-ng",
            "-u", sq(outfile),
        ]
        # Join into one shell string, as the gene-centering module in
        # this file does for the same program.
        cmd = " ".join(map(str, cmd))
        parallel.sshell(cmd)
        metadata["command"] = cmd
        outputfile = outfile + '.nrm'
        filelib.assert_exists_nz(outputfile)
        os.rename(outputfile, outfile)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Remove indels from a VCF file with ``vcftools --remove-indels``.

    vcftools is run with ``--recode --recode-INFO-all --out
    <sample>.filtered`` where ``<sample>`` is the input file name without
    directory or extension; console output goes to ``<sample>.log``.  Both
    are relative paths.  The resulting ``<sample>.filtered.recode.vcf`` is
    copied to ``out_filename``.
    """
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    in_filename = in_data.identifier
    filelib.assert_exists_nz(in_filename)

    vcftools = filelib.which_assert(config.vcftools)

    # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
    #   --out test32
    # Writes stuff to console.  Should capture in log file.
    # Saves file test32.recode.vcf

    # Derive the sample name from the base name.  The directory part
    # was split off but then ignored, so the stem kept the input's
    # directory and the log/output landed next to the input file.
    p, f = os.path.split(in_filename)
    sample, ext = os.path.splitext(f)

    out_stem = "%s.filtered" % sample
    log_filename = "%s.log" % sample
    # Should create file <out_stem>.recode.vcf
    outfile = "%s.recode.vcf" % out_stem

    sq = parallel.quote
    cmd = [
        sq(vcftools),
        "--vcf", sq(in_filename),
        "--remove-indels",
        "--recode",
        "--recode-INFO-all",
        "--out", sq(out_stem),
    ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, sq(log_filename))
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    shutil.copy2(outfile, out_filename)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Index a reference genome with ``samtools faidx``.

    The reference named by ``in_data.identifier`` is prepared under
    ``out_path`` by ``alignlib.standardize_reference_genome`` and
    ``samtools faidx`` is run on its FASTA file.

    Raises:
        AssertionError: if ``<fasta>.fai`` fails ``filelib.exists_nz``
            afterwards.
    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    samtools = filelib.which_assert(config.samtools)
    ref = alignlib.standardize_reference_genome(
        in_data.identifier, out_path, use_symlinks=True)

    # samtools faidx <ref>.fa
    # Makes files:
    # <ref>.fa.fai
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "faidx",
        sq(ref.fasta_file_full),
    ]
    # Hand sshell a single shell string, as most other modules in this
    # file do.
    cmd = " ".join(cmd)
    parallel.sshell(cmd, path=out_path)

    # Check to make sure index was created successfully.
    f = "%s.fai" % ref.fasta_file_full
    assert filelib.exists_nz(f), "Missing: %s" % f
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Build a STAR index for a reference genome and a GTF annotation.

    ``antecedents`` is a ``(ref_node, gene_node)`` pair.  The command from
    ``alignlib.make_STAR_index_command`` is run with ``out_path`` passed as
    the ``path`` argument of ``sshell``, its output redirected to
    ``out.txt``; the result is validated with
    ``alignlib.assert_is_STAR_reference``.
    """
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib

    ref_node, gene_node = antecedents
    ref = alignlib.standardize_reference_genome(
        ref_node.identifier, out_path, use_symlinks=True)
    filelib.safe_mkdir(out_path)

    index_cmd = alignlib.make_STAR_index_command(
        ref.fasta_file_full, out_path,
        gtf_file=gene_node.identifier, num_cores=num_cores)
    parallel.sshell("%s >& out.txt" % index_cmd, path=out_path)

    # Check to make sure index was created successfully.
    alignlib.assert_is_STAR_reference(out_path)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the configured ``lineplot`` script for actin/tubulin genes.

    The gene symbols and numeric IDs below are passed as one
    comma-separated ``--gene_names`` argument.

    Returns:
        dict with key ``"commands"`` holding a one-element list with the
        shell command that was run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    lineplot = mlib.get_config("lineplot", which_assert_file=True)

    housekeeping = [
        "ACTB", 60,         # Human beta actin.
        "TUBB", 203068,     # Human beta tubulin.
        "Actb", 22461,      # Mouse beta actin.
        "Tubb4a", 22153,    # Mouse beta tubulin.
    ]
    gene_arg = ",".join(str(gene) for gene in housekeeping)

    quote = parallel.quote
    args = [
        quote(lineplot),
        "--gene_names", gene_arg,
        "--mar_bottom", 1.50,
        quote(antecedents.identifier),
        quote(outfile),
    ]
    command = " ".join(str(arg) for arg in args)
    parallel.sshell(command)

    metadata = {"commands": [command]}
    filelib.assert_exists_nz(outfile)
    return metadata
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the configured ``boxplot`` script on the input file.

    ``antecedents`` is the single input node; the plot goes to
    ``outfile``.

    Returns:
        dict with key ``"commands"`` holding a one-element list with the
        shell command that was run.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # An earlier in-process implementation based on mplgraph.boxplot
    # is disabled in favour of the external script.
    boxplot = mlib.get_config("boxplot", which_assert_file=True)

    quote = parallel.quote
    args = [
        quote(boxplot),
        quote(antecedents.identifier),
        quote(outfile),
    ]
    command = " ".join(str(arg) for arg in args)
    parallel.sshell(command)

    metadata = {"commands": [command]}
    filelib.assert_exists_nz(outfile)
    return metadata
def get_bedtools_version():
    """Return the version string reported by ``bedtools --version``.

    The text following the first ``v`` that matches the pattern below is
    returned (e.g. ``2.23.0`` for ``bedtools v2.23.0``).

    Raises:
        AssertionError: if the output contains no such version string.
    """
    import re
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    bedtools = filelib.which_assert(config.bedtools)
    output = parallel.sshell(
        "%s --version" % bedtools, ignore_nonzero_exit=True)
    output = output.strip()

    # bedtools v2.23.0
    # Version: 1.2 (using htslib 1.2.1)
    match = re.search(r"v([\w\. ]+)", output)
    assert match, "Missing version string"
    return match.group(1)
def _make_analysis_directory(analysis_path, config_file, reference_fa,
                             normal_bam, tumor_bam):
    """Run Strelka's ``configureStrelkaWorkflow.pl`` for a tumor/normal pair.

    All four input files are checked with ``filelib.assert_exists_nz``
    before the script (looked up under the configured ``strelka`` path) is
    invoked with ``--output-dir <analysis_path>``.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    for required in (config_file, reference_fa, normal_bam, tumor_bam):
        filelib.assert_exists_nz(required)

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    config_pl = os.path.join(
        strelka_path, "bin", "configureStrelkaWorkflow.pl")
    filelib.assert_exists_nz(config_pl)

    # $STRELKA/bin/configureStrelkaWorkflow.pl \
    #   --normal=../test31.bam --tumor=../test32.bam \
    #   --ref=../genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   --config=./config.ini --output-dir=./myAnalysis
    quote = mlib.sq
    command = " ".join([
        quote(config_pl),
        "--normal", quote(normal_bam),
        "--tumor", quote(tumor_bam),
        "--ref", quote(reference_fa),
        "--config", quote(config_file),
        "--output-dir", quote(analysis_path),
    ])
    parallel.sshell(command)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    """Annotate a VCF file with Annovar.

    The command comes from ``alignlib.make_annovar_command``.  Its log is
    named after the input file's base name, its output stem after
    ``out_filename``'s base name (both without directory or extension).
    The ``<stem>.<buildver>_multianno.vcf`` result is copied to
    ``out_filename`` unless it already is that file.  Only the ``hg19``
    build is accepted for the required ``buildver`` user option.
    """
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    in_filename = in_data.identifier
    filelib.assert_exists_nz(in_filename)

    buildver = module_utils.get_user_option(
        user_options, "buildver", allowed_values=["hg19"], not_empty=True)

    # Annovar takes a filestem, without the ".vcf".
    in_stem = os.path.splitext(os.path.basename(in_filename))[0]
    log_filename = "%s.log" % in_stem
    out_filestem = os.path.splitext(os.path.basename(out_filename))[0]

    annovar_cmd = alignlib.make_annovar_command(
        in_filename, log_filename, out_filestem, buildver)
    parallel.sshell(annovar_cmd)

    # Make sure the analysis completed successfully.
    result = "%s.%s_multianno.vcf" % (out_filestem, buildver)
    filelib.assert_exists_nz(result)
    already_in_place = (
        os.path.realpath(result) == os.path.realpath(out_filename))
    if not already_in_place:
        shutil.copy2(result, out_filename)
def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Center genes by mean or median with the ``cluster`` program.

    ``out_attributes['gene_center']`` must be ``"mean"`` or ``"median"``;
    it is translated to the ``-cg a`` / ``-cg m`` argument.  The
    ``<outfile>.nrm`` file is renamed to ``outfile``.

    Returns:
        dict with key ``"commands"`` holding a one-element list with the
        shell command that was run.

    Raises:
        AssertionError: if the option is missing or invalid.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    center_alg = {'mean': 'a', 'median': 'm'}
    assert "gene_center" in out_attributes
    center = out_attributes['gene_center']
    assert center in center_alg, "Invalid center option: %s" % center

    cluster = mlib.get_config("cluster", which_assert_file=True)

    quote = parallel.quote
    args = [
        quote(cluster),
        "-f", quote(in_data.identifier),
        "-cg", center_alg[center],
        "-u", outfile,
    ]
    command = " ".join(str(arg) for arg in args)
    parallel.sshell(command)
    metadata = {"commands": [command]}

    normalized = outfile + '.nrm'
    filelib.assert_exists_nz(normalized)
    os.rename(normalized, outfile)
    return metadata
def _run_filterRadia_with_restart(cmd, cancer_sample, chrom, logfile):
    """Run a filterRadia command, retrying when samtools crashes.

    ``cmd`` is run with non-zero exit codes ignored, and ``logfile`` is
    inspected afterwards:

    * empty log: success, return;
    * log mentions both ``samtoolsCall.kill`` and ``No such process``:
      treated as the known samtools crash, the command is run again (at
      most 4 attempts in total);
    * anything else: failure.

    Raises:
        AssertionError: if the command fails for another reason, or is
            still hitting the samtools crash after the last attempt.
    """
    # Sometimes samtools crashes in the middle of a run.  Detect this
    # case, and re-run the analysis if needed.
    from genomicode import parallel
    from genomicode import filelib

    MAX_TRIES = 4   # same number of attempts as before

    log = ""
    for _ in range(MAX_TRIES):
        parallel.sshell(cmd, ignore_nonzero_exit=True)
        filelib.assert_exists(logfile)
        with open(logfile) as handle:
            log = handle.read()
        # Empty logfile means cmd completed successfully.
        if not log.strip():
            return
        # Look for evidence that samtools died.  If this occurs, try again.
        # 06/29/2016 09:57:16 AM  ERROR   The return code of '1' from the
        #   following filter command indicates an error.
        # 06/29/2016 09:57:16 AM  ERROR   Error from /usr/bin/python
        #   /usr/local/radia/scripts/createBlatFile.pyc 196C-lung2
        #   radia2.tmp/196C-lung2_dnaFiltered_chr1.vcf
        #   radia2.tmp/196C-lung2_mpileup_rna_origin_chr1.vcf
        #   -o radia2.tmp/196C-lung2_blatInput_chr1.fa
        #   --allVCFCalls --blatRnaNormalReads --blatRnaTumorReads:
        # <Traceback>
        # [...]
        #   samtoolsCall.kill()
        # [...]
        # OSError: [Errno 3] No such process
        if "samtoolsCall.kill" in log and "No such process" in log:
            continue
        # Otherwise, the process failed for some other reason.  Raise
        # an exception.
        raise AssertionError(
            "Problem filtering: %s %s\n%s" % (cancer_sample, chrom, log))

    # Every attempt hit the samtools crash.  This used to fall out of
    # the loop silently and look like success to the caller.
    raise AssertionError(
        "Problem filtering (samtools crashed on all %d tries): %s %s\n%s" % (
            MAX_TRIES, cancer_sample, chrom, log))
def main():
    """Command-line entry point: run the SPP R script on a BAM pair.

    Parses the treatment BAM, control BAM and output directory (plus
    ``-j`` and ``--fdr_cutoff``), creates the output directory if needed,
    pipes the script located by ``find_sppscript`` into ``R --vanilla``
    and prints the command output.
    """
    import os
    import argparse
    from genomicode import filelib
    from genomicode import parallel

    log = filelib.tswrite

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("treatment_bam", help="BAM file of treated sample.")
    parser.add_argument("control_bam", help="BAM file of background sample.")
    parser.add_argument("outpath", help="Directory to store the results.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--fdr_cutoff", default=0.05, type=float, help="")
    args = parser.parse_args()

    filelib.assert_exists_nz(args.treatment_bam)
    filelib.assert_exists_nz(args.control_bam)
    args.treatment_bam = os.path.realpath(args.treatment_bam)
    args.control_bam = os.path.realpath(args.control_bam)
    assert 1 <= args.num_procs < 100, \
        "Please specify between 1 and 100 processes."
    assert 0.0 < args.fdr_cutoff < 1.0

    # Set up directories to run it on.
    log("Setting up directories.\n")
    if not os.path.exists(args.outpath):
        os.mkdir(args.outpath)

    # Run SPP.
    log("Running spp in %s.\n" % args.outpath)
    quote = parallel.quote
    sppscript = find_sppscript()
    script_args = (
        quote(args.treatment_bam), quote(args.control_bam),
        args.fdr_cutoff, args.num_procs)
    arg_string = " ".join(str(arg) for arg in script_args)
    command = "cat %s | R --vanilla %s" % (sppscript, arg_string)
    output = parallel.sshell(command, path=args.outpath)
    print(output)

    log("Done.\n")
def get_paired_orientation_rseqc(reference_bed, bam_filename):
    """Run RSeQC ``infer_experiment.py`` on a BAM file.

    Returns:
        The raw text output of the script, as returned by
        ``parallel.sshell``.  It is not parsed here.

    NOTE(review): this function previously printed the output and then
    called ``sys.exit(0)`` (apparently leftover debugging code), so it
    never returned.  Confirm whether callers want the raw text or a parsed
    orientation.
    """
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    # Was "module_lib"; the sibling get_paired_stranded_rseqc takes
    # "sq" from module_utils.
    from Betsy import module_utils as mlib

    script = alignlib.find_rseqc_script("infer_experiment.py")
    filelib.assert_exists_nz(reference_bed)
    filelib.assert_exists_nz(bam_filename)

    # RSeQC scripts use #!/usr/bin/python, which may not be the right
    # one.  Use the python on the path.
    cmd = [
        "python",
        mlib.sq(script),
        # The commas after the quoted paths were missing, which made
        # this list a syntax error.
        "-r", mlib.sq(reference_bed),
        "-i", mlib.sq(bam_filename),
    ]
    cmd = " ".join(cmd)
    return parallel.sshell(cmd)
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    """Write an interval list built from a BAM header and a BED file.

    The ``@HD`` and ``@SQ`` lines of ``samtools view -H <bam_filename>``
    are copied first.  Then one tab-separated line is written per BED
    record: chrom, 1-based start (BED start + 1), end, strand, name.

    Raises:
        AssertionError: if a BED record has fewer than 6 columns.
    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # Add the @HD and @SQ headers from the bam file.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-H",
        sq(bam_filename),
    ]
    cmd = " ".join(cmd)
    header_text = parallel.sshell(cmd)
    header_lines = [x.rstrip() for x in header_text.split("\n")]

    # Open the output only once the header is available, and always
    # close it, so a failure does not leave an open handle behind.
    outhandle = open(intervallist_file, 'w')
    try:
        for line in header_lines:
            if line.startswith("@HD") or line.startswith("@SQ"):
                outhandle.write(line + "\n")

        # Add the information from the BED file.
        # BED       chrom chromStart (0-based) chromEnd name score strand
        # Interval  chrom chromStart (1-based) chromEnd strand name
        for cols in filelib.read_cols(features_bed):
            assert len(cols) >= 6, \
                "Expected at least 6 columns: %s" % features_bed
            chrom, chromStart0, chromEnd, name, score, strand = cols[:6]
            chromStart0, chromEnd = int(chromStart0), int(chromEnd)
            chromStart1 = chromStart0 + 1
            x = chrom, chromStart1, chromEnd, strand, name
            outhandle.write("\t".join(map(str, x)) + "\n")
    finally:
        outhandle.close()
def list_snpeff_databases():
    """Return the genome names listed by ``snpEff.jar databases``.

    The output of the command is read as a table with
    ``filelib.read_cols``; the first row is the header and must contain a
    ``Genome`` column.  Rows whose first field starts with ``---`` are
    skipped.

    Returns:
        list of the values in the ``Genome`` column.

    Raises:
        AssertionError: if the header lacks ``Genome`` or a row has a
            different number of fields than the header.
    """
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    # Genome    Organism    Status    Bundle    Database download link
    # ------    --------    ------    ------    ----------------------
    sq = parallel.quote
    cmd = [
        "java", "-Xmx16g", "-jar", sq(snpeff),
        "databases",
    ]
    # Hand sshell a single shell string, as most other functions in
    # this file do.
    cmd = " ".join(cmd)
    output = parallel.sshell(cmd)

    header = i_db = None
    databases = []
    for cols in filelib.read_cols(StringIO.StringIO(output)):
        cols = [x.strip() for x in cols]
        if header is None:
            header = cols
            assert "Genome" in header
            i_db = header.index("Genome")
            continue
        assert len(cols) == len(header)
        if cols[0].startswith("---"):
            continue
        databases.append(cols[i_db])
    return databases
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call peaks with MACS 1.4 for a treatment sample.

    ``antecedents`` is a ``(bam_node, group_node)`` pair.  User options:
    ``treatment_sample`` (required), ``control_sample`` (optional),
    ``macs_genome`` (required) and ``macs_shiftsize`` (optional, converted
    to int).  The experiment name is the hashed treatment sample, or
    ``<treat>_vs_<control>`` (both hashed) when a control is given.  If
    MACS wrote ``<name>_model.r`` it is run through Rscript.

    Raises:
        AssertionError: if a sample is unknown or has no BAM file.  The
            ``<name>_peaks.xls`` and ``<name>_summits.bed`` files are
            checked with ``filelib.assert_exists_nz_many``.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(
        user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        # Use the hashed treatment name here as well.  The raw sample
        # name was used before, so the name (and the output file names
        # built from it) could contain arbitrary characters.
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
               "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, \
            "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name,
        genome_size=genome_size, shiftsize=shiftsize,
        save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        sq = parallel.quote
        # One quoted shell string, like the other commands in this file.
        cmd = " ".join([sq(Rscript), sq(model_file)])
        parallel.sshell(cmd, path=out_path)

    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
    ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def plot_heatmap(filename, outfile, cluster_files, user_options):
    """Draw a heatmap of ``filename`` into ``outfile`` with ``arrayplot``.

    Appearance is controlled by the ``hm_*`` user options.  When the size
    or the gene/array label switches are not given, defaults are derived
    from the matrix dimensions: labels are enabled for at most 50
    rows/columns, and the box size comes from graphlib's
    ``find_tall_heatmap_size`` (more rows than columns) or
    ``find_wide_heatmap_size``.

    ``cluster_files`` maps ``"gtr"``, ``"atr"``, ``"kgg"``, ``"kag"`` to
    tree/cluster files; each is passed on only if present and the matching
    ``hm_show_*`` option is ``"yes"``.  Those four options are read only
    when ``hm_show_gene_tree`` is in ``user_options``.

    Returns:
        The shell command that was executed, as a string.

    Raises:
        AssertionError: if the width/height or label scales are out of
            range.
    """
    from genomicode import parallel
    from genomicode import graphlib
    from Betsy import module_utils as mlib

    python = mlib.get_config(
        "python", which_assert_file=True, assert_exists=True)
    arrayplot = mlib.get_config(
        "arrayplot", which_assert_file=True, assert_exists=True)

    COLORS = [
        "red", "white", "red-green", "blue-yellow", "red-green-soft",
        "red-blue-soft", "matlab", "bild", "genepattern", "genespring",
        "yahoo", "brewer-prgn-div", "brewer-rdbu-div", "brewer-rdylbu-div",
        "brewer-rdylgn-div", "brewer-spectral-div", "brewer-blues-seq",
        "brewer-greens-seq", "brewer-reds-seq", "brewer-ylorbr-seq",
        "brewer-qual-set",
    ]
    YESNO = ["no", "yes"]

    hm_width = mlib.get_user_option(user_options, "hm_width", type=int)
    hm_height = mlib.get_user_option(user_options, "hm_height", type=int)
    hm_color = mlib.get_user_option(
        user_options, "hm_color", allowed_values=COLORS, not_empty=True)
    hm_colorbar = mlib.get_user_option(
        user_options, "hm_colorbar", not_empty=True, allowed_values=YESNO)
    hm_colorbar_horizontal = mlib.get_user_option(
        user_options, "hm_colorbar_horizontal", not_empty=True,
        allowed_values=YESNO)
    hm_colorbar_height = mlib.get_user_option(
        user_options, "hm_colorbar_height", not_empty=True, type=float)
    hm_colorbar_width = mlib.get_user_option(
        user_options, "hm_colorbar_width", not_empty=True, type=float)
    hm_colorbar_font = mlib.get_user_option(
        user_options, "hm_colorbar_font", not_empty=True, type=float)
    hm_label_genes = mlib.get_user_option(
        user_options, "hm_label_genes", allowed_values=YESNO)
    hm_scale_gene_labels = mlib.get_user_option(
        user_options, "hm_scale_gene_labels", not_empty=True, type=float)
    hm_label_arrays = mlib.get_user_option(
        user_options, "hm_label_arrays", allowed_values=YESNO)
    hm_scale_array_labels = mlib.get_user_option(
        user_options, "hm_scale_array_labels", not_empty=True, type=float)

    hm_show_gene_tree = None
    hm_show_array_tree = None
    hm_show_gene_cluster = None
    hm_show_array_cluster = None
    if "hm_show_gene_tree" in user_options:
        hm_show_gene_tree = mlib.get_user_option(
            user_options, "hm_show_gene_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_tree = mlib.get_user_option(
            user_options, "hm_show_array_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_gene_cluster = mlib.get_user_option(
            user_options, "hm_show_gene_cluster", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_cluster = mlib.get_user_option(
            user_options, "hm_show_array_cluster", allowed_values=YESNO,
            not_empty=True)

    # Set default values.  The matrix size is looked up at most once,
    # and only if some default actually depends on it.
    need_size = (
        not hm_width or not hm_height or
        not hm_label_genes or not hm_label_arrays)
    if need_size:
        nrow, ncol = get_matrix_size(filename)
    if not hm_width or not hm_height:
        fn = graphlib.find_wide_heatmap_size
        if nrow > ncol:
            fn = graphlib.find_tall_heatmap_size
        x = fn(
            nrow, ncol, max_total_height=4096, max_total_width=4096,
            max_box_height=200, max_box_width=200)
        hm_width, hm_height = x
    if not hm_label_genes:
        hm_label_genes = "no"
        if nrow <= 50:
            hm_label_genes = "yes"
    if not hm_label_arrays:
        hm_label_arrays = "no"
        if ncol <= 50:
            hm_label_arrays = "yes"

    # Check values.
    assert hm_width >= 1 and hm_width <= 256, "Invalid width: %s" % hm_width
    assert hm_height >= 1 and hm_height <= 256, \
           "Invalid height: %s" % hm_height
    assert hm_scale_gene_labels > 0 and hm_scale_gene_labels < 10
    assert hm_scale_array_labels > 0 and hm_scale_array_labels < 10

    sq = parallel.quote
    cmd = [
        sq(python),
        sq(arrayplot),
        "--grid",
        "-x", hm_width,
        "-y", hm_height,
        "--color", hm_color,
    ]
    if hm_colorbar == "yes":
        cmd += [
            "--colorbar",
            "--cb_height", hm_colorbar_height,
            "--cb_width", hm_colorbar_width,
            "--cb_font", hm_colorbar_font,
        ]
        if hm_colorbar_horizontal == "yes":
            cmd += ["--cb_horizontal"]
    if hm_label_genes == "yes":
        cmd += [
            "--label_genes",
            "--scale_gene_labels", hm_scale_gene_labels,
        ]
    if hm_label_arrays == "yes":
        cmd += [
            "--label_arrays",
            "--scale_array_labels", hm_scale_array_labels,
        ]
    # Cluster/tree files are paths too, so quote them like the input and
    # output files.
    if hm_show_gene_tree == "yes" and "gtr" in cluster_files:
        cmd += ["--gene_tree_file", sq(cluster_files["gtr"])]
    if hm_show_array_tree == "yes" and "atr" in cluster_files:
        cmd += ["--array_tree_file", sq(cluster_files["atr"])]
    if hm_show_gene_cluster == "yes" and "kgg" in cluster_files:
        cmd += ["--gene_cluster_file", sq(cluster_files["kgg"])]
    if hm_show_array_cluster == "yes" and "kag" in cluster_files:
        cmd += ["--array_cluster_file", sq(cluster_files["kag"])]
    cmd += [
        sq(filename),
        sq(outfile),
    ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    return cmd
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Convert a matrix to HG_U133A IDs and move that column to the front.

    If the best platform score for the input equals ``HG_U133A`` the file
    is copied as is; otherwise the ``annotate_matrix`` script is run with
    ``--platform HG_U133A --min_match_score 0.8`` and its output (stdout
    and stderr, via ``>&``) redirected to ``outfile``.  ``outfile`` is
    then rewritten so that the ``HG_U133A`` column comes first.

    Raises:
        AssertionError: if no platform could be scored or ``HG_U133A`` is
            not a recognized platform.
        ValueError: if the output has no ``HG_U133A`` column.
    """
    #import subprocess
    import shutil
    import arrayio
    #from genomicode import config
    from genomicode import arrayplatformlib
    from genomicode import parallel
    #from genomicode import filelib
    from Betsy import module_utils as mlib

    DATA = arrayio.read(in_data.identifier)
    #chipname = arrayplatformlib.identify_platform_of_matrix(DATA)
    scores = arrayplatformlib.score_matrix(DATA)
    assert scores, "Unable to identify platform: %s" % in_data.identifier
    chipname = scores[0]

    platform = "HG_U133A"
    assert arrayplatformlib.get_bm_attribute(platform), \
           "Unrecognized platform: %s" % platform

    if chipname == platform:
        shutil.copyfile(in_data.identifier, outfile)
    else:
        Annot_BIN = mlib.get_config("annotate_matrix", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            "python",
            sq(Annot_BIN),
            sq(in_data.identifier),
            "--platform", sq(platform),
            '--min_match_score', 0.80,
        ]
        cmd = " ".join(map(str, cmd))
        cmd = "%s >& %s" % (cmd, sq(outfile))
        parallel.sshell(cmd)

    # change the HG_U133A to the first column
    with open(outfile, 'r') as f:
        txt = f.readlines()
    # Strip the line ending before splitting.  Otherwise the last field
    # keeps its "\n": a platform column in last position is not found
    # in the header, or is moved to the front together with its newline.
    rows = [line.rstrip("\r\n").split('\t') for line in txt]
    header = rows[0]
    index = header.index(platform)
    with open(outfile, 'w') as f:
        for row in rows:
            newrow = [row[index]] + row[0:index] + row[index + 1:]
            f.write('\t'.join(newrow) + "\n")