def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    #module_utils.plot_line_keywd(in_data.identifier, 'biotin', outfile)
    lineplot = mlib.get_config("lineplot", which_assert_file=True)
    sq = parallel.quote
    cmd = [
        sq(lineplot),
        "--gene_names", "biotin",
        "--mar_bottom", 1.50,
        "--yaxis_starts_at_0",
        sq(in_data.identifier),
        sq(outfile),
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(outfile)
    return metadata
def check_log_file(filename):
    from genomicode import filelib

    # Log file format:
    # [Sat Dec 31 19:29:27 CST 2016] picard.sam.AddOrReplaceReadGroups INPUT=
    # [Sat Dec 31 19:29:27 CST 2016] Executing as [email protected]
    # INFO  2016-12-31 19:29:27  AddOrReplaceReadGroups  Created read gr
    # INFO  2016-12-31 19:29:42  AddOrReplaceReadGroups  Processed     1
    # INFO  2016-12-31 19:29:58  AddOrReplaceReadGroups  Processed     2
    # [...]
    # [Sat Dec 31 19:48:14 CST 2016] picard.sam.AddOrReplaceReadGroups done.
    # Runtime.totalMemory()=1609564160
    #
    # Sometimes these lines are interspersed.  Probably OK not to flag.
    # Ignoring SAM validation error: ERROR: Read name HWI-ST1120:331:C6VW5ACX
    #
    # ERROR: Sometimes see exceptions.
    # Exception in thread "main" java.lang.RuntimeException: BGZF file has in
    #         at htsjdk.samtools.util.BlockGunzipper.unzipBlock(BlockGunzippe
    # [...]

    # The log file should not be empty.
    filelib.assert_exists_nz(filename)

    lines = open(filename).readlines()

    # Make sure there's no exception.
    i_exception = None
    for i in range(len(lines)):
        if lines[i].startswith("Exception in thread "):
            i_exception = i
            break
    if i_exception is None:
        return
    x = "".join(lines[i_exception:]).strip()
    raise AssertionError, "Exception in Picard output:\n%s" % x
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_node, gene_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)
    assert bam_filenames, "No bam files found."
    metadata = {}

    # Make output filenames.
    p, r, e = mlib.splitpath(gtf_file)
    bed_file = "%s.bed" % r

    # Make bed file.
    alignlib.gtf_to_bed(gtf_file, bed_file)
    #bed_file = "/data/jchang/biocore/gtf02.txt"

    # Figure out the orientation.
    x = get_paired_stranded_rseqc(bed_file, bam_filenames[0])
    single_or_paired, stranded, frac_failed, frac_first, frac_second = x
    x = mlib.Stranded(
        single_or_paired, stranded, frac_failed, frac_first, frac_second)
    mlib.write_stranded(x, outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    from genomicode import filelib

    data_nodes = [
        ("SignalFile", "gene_expression.nonorm.txt"),
        ("SignalFile", "gene_expression.normalized.txt"),
        ("SignalDistributionBoxplot", "signal_distribution.png"),
        ("ActbPlot", "ACTB.nonorm.png"),      # beta-actin expression
        ("ActbPlot", "ACTB.normalized.png"),  # beta-actin expression
        ("PCAPlot", "pca.nonorm.png"),        # No normalization
        ("PCAPlot", "pca.normalized.png"),    # Normalized
        ("Heatmap", "heatmap.nonorm.png"),
        ("Heatmap", "heatmap.normalized.png"),
        ]
    assert len(antecedents) == len(data_nodes)
    for i, (dtype, outfile) in enumerate(data_nodes):
        inode = antecedents[i]
        filelib.assert_exists_nz(inode.identifier)
        assert inode.data.datatype.name == dtype, "Mismatch: %s %s" % (
            inode.data.datatype.name, dtype)
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    metadata = {}

    # Copy the files over.
    for i, (dtype, outfile) in enumerate(data_nodes):
        inode = antecedents[i]
        outfilename = os.path.join(out_path, outfile)
        shutil.copy2(inode.identifier, outfilename)
    return metadata
def relabel(data_file, rename_file, outfile, user_options):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)
    # Make sure sample_header is in the rename file.
    x = open(rename_file).readline()
    x = x.rstrip("\r\n").split("\t")
    assert sample_header in x, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python", sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)
    filelib.assert_exists_nz(outfile)
    return cmd
def get_paired_stranded_rseqc(reference_bed, bam_filename):
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    script = alignlib.find_rseqc_script("infer_experiment.py")
    filelib.assert_exists_nz(reference_bed)
    filelib.assert_exists_nz(bam_filename)

    # RSeQC scripts use #!/usr/bin/python, which may not be the right
    # one.  Use the python on the path.
    cmd = [
        "python",
        mlib.sq(script),
        "-r", mlib.sq(reference_bed),
        "-i", mlib.sq(bam_filename),
        ]
    cmd = " ".join(cmd)
    x = parallel.sshell(cmd)
    x = parse_rseqc_infer_experiment(x)
    #single_or_paired, stranded, frac_failed, frac_first, frac_second = x
    return x
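
# parse_rseqc_infer_experiment is called above but not defined in this
# section.  A minimal sketch of what it might look like, assuming the
# standard output format of RSeQC's infer_experiment.py (the exact
# "Fraction of reads explained by" labels differ between single- and
# paired-end data, but all start with the same prefix):
def parse_rseqc_infer_experiment(output):
    # Example infer_experiment.py output:
    #   This is PairEnd Data
    #   Fraction of reads failed to determine: 0.0172
    #   Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
    #   Fraction of reads explained by "1+-,1-+,2++,2--": 0.4925
    single_or_paired = None
    frac_failed = None
    fracs = []
    for line in output.splitlines():
        line = line.strip()
        if line.startswith("This is PairEnd"):
            single_or_paired = "paired"
        elif line.startswith("This is SingleEnd"):
            single_or_paired = "single"
        elif line.startswith("Fraction of reads failed"):
            frac_failed = float(line.split(":")[1])
        elif line.startswith("Fraction of reads explained"):
            fracs.append(float(line.split(":")[1]))
    assert single_or_paired, "Could not parse infer_experiment output."
    assert frac_failed is not None and len(fracs) == 2
    frac_first, frac_second = fracs
    # Mapping the fractions to a strandedness label depends on the
    # pipeline's vocabulary; the threshold and label names here are
    # placeholders (assumption).
    stranded = "unstranded"
    if frac_first >= 0.7:
        stranded = "firststrand"
    elif frac_second >= 0.7:
        stranded = "secondstrand"
    return single_or_paired, stranded, frac_failed, frac_first, frac_second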
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """log the input file"""
    import math
    import arrayio
    from genomicode import filelib
    from genomicode import binreg

    signal_file = in_data.identifier
    filelib.assert_exists_nz(signal_file)

    M = arrayio.read(signal_file)
    assert not binreg.is_logged_array_data(M), 'the file is logged'

    # Change the matrix in place.
    X = M._X
    for i in range(len(X)):
        for j in range(len(X[i])):
            x = X[i][j]
            if x is None:
                continue
            x = float(x)
            if x < 1:
                x = 1
            x = math.log(x, 2)
            X[i][j] = x

    M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)
    handle = open(outfile, 'w')
    arrayio.tab_delimited_format.write(M_c, handle)
    handle.close()
def read_stranded(filename):
    import json
    from genomicode import filelib

    filelib.assert_exists_nz(filename)
    text = open(filename).read()
    x = json.loads(text)
    return Stranded(**x)
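
# Stranded and its writer are used above (mlib.Stranded, mlib.write_stranded)
# but defined elsewhere.  A minimal sketch of a compatible pair, assuming
# the five fields used by the orientation module in this section and a
# plain-JSON round trip so read_stranded can rebuild it with Stranded(**x):
import json

class Stranded:
    def __init__(self, single_or_paired, stranded, frac_failed,
                 frac_first, frac_second):
        self.single_or_paired = single_or_paired
        self.stranded = stranded
        self.frac_failed = frac_failed
        self.frac_first = frac_first
        self.frac_second = frac_second

def write_stranded(stranded, filename):
    # Serialize the attributes by name; the keys must match the
    # __init__ parameters for read_stranded's Stranded(**x) to work.
    x = {
        "single_or_paired": stranded.single_or_paired,
        "stranded": stranded.stranded,
        "frac_failed": stranded.frac_failed,
        "frac_first": stranded.frac_first,
        "frac_second": stranded.frac_second,
        }
    open(filename, 'w').write(json.dumps(x))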
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    # Given a GEOID and GPLID, get the series matrix file.
    from genomicode import geolib
    from genomicode import filelib

    metadata = {}

    GSEID = user_options['GSEID']
    GPLID = user_options.get("GPLID")
    assert GSEID.startswith('GSE'), 'GSEID %s is not correct' % GSEID
    assert not GPLID or GPLID.startswith('GPL'), \
        'GPLID %s is not correct' % GPLID
    # Don't need to save user_options.
    #metadata["GSEID"] = GSEID
    #if GPLID:
    #    metadata["GPLID"] = GPLID

    outhandle = open(outfile, 'w')
    geolib.download_seriesmatrix_file(outhandle, GSEID, GPLID)
    outhandle.close()
    filelib.assert_exists_nz(outfile)
    #metadata["filesize"] = filelib.filesize(outfile)

    #if not os.path.exists(outfile):
    #    os.mkdir(outfile)
    #matrix_files = get_seriesmatrix_file(GSEID, GPLID)
    #for matrix_file in matrix_files:
    #    newmatrix_filename = os.path.split(matrix_file)[-1]
    #    shutil.copyfile(
    #        matrix_file, os.path.join(outfile, newmatrix_filename))
    #assert filelib.exists_nz(outfile), (
    #    'the output file %s for download_geo_dseriesmatrix fails' % outfile
    #)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib

    in_data = antecedents
    plot_hyb_bar(in_data.identifier, outfile)
    filelib.assert_exists_nz(outfile)
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    signal_node = in_data
    signal_file = signal_node.identifier
    assert os.path.exists(signal_file)

    slice_matrix = filelib.which_assert(config.slice_matrix)

    sq = parallel.quote
    cmd = [
        sq(slice_matrix),
        "--cpm",
        signal_file,
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
def read_orientation(filename):
    import json
    from genomicode import filelib

    filelib.assert_exists_nz(filename)
    text = open(filename).read()
    x = json.loads(text)
    return Orientation(**x)
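
# Orientation is used above but defined elsewhere.  A minimal sketch of
# a compatible class, mirroring the JSON round trip used for Stranded;
# the actual field names are not shown in this section, so this version
# simply accepts whatever keys its writer serialized:
class Orientation:
    def __init__(self, **keywds):
        # e.g. single_or_paired, orientation (hypothetical field names).
        self.__dict__.update(keywds)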
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib

    outhandle = open(outfile, 'w')
    extract_signal(in_data.identifier, outhandle)
    outhandle.close()
    filelib.assert_exists_nz(outfile)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import sortlib
    from Betsy import module_utils as mlib

    # Should be a folder of fastqc results.
    fastqc_path = in_data.identifier

    # Find all the FASTQC results.
    x = filelib.list_files_in_path(fastqc_path, endswith="summary.txt")
    x = [os.path.split(x)[0] for x in x]
    paths = x
    assert paths, "No FASTQC files found."

    # Read the results.
    all_results = [read_fastqc_results(x) for x in paths]
    assert all_results

    # Make table where the rows are the samples and the columns
    # are the statistics.
    sample2results = {}
    for x in all_results:
        assert x.sample not in sample2results
        sample2results[x.sample] = x
    all_statistics = all_results[0].statistics_order
    all_samples = sortlib.sort_natural(sample2results)

    table = []
    header = [
        "Sample", "Total Sequences", "Filtered Sequences",
        "Sequence length", "GC"] + all_statistics
    table.append(header)
    for sample in all_samples:
        results = sample2results[sample]
        x1 = [sample]
        x2 = [
            results.total_sequences, results.filtered_sequences,
            results.sequence_length, results.percent_gc]
        x3 = [results.statistics[x] for x in all_statistics]
        x = x1 + x2 + x3
        assert len(x) == len(header)
        table.append(x)

    # Write out the table as text file.
    TXT_FILE = "fastqc_summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(map(str, x))
    handle.close()

    x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
    os.system("%s -b %s > %s" % (x, TXT_FILE, outfile))
    filelib.assert_exists_nz(outfile)
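
# read_fastqc_results is called above but not defined in this section.
# A minimal sketch, assuming the layout of a FastQC output directory
# (summary.txt plus fastqc_data.txt); the attribute names follow their
# usage in the run method above, and FastQCResults is a hypothetical
# container:
def read_fastqc_results(path):
    import os

    class FastQCResults:
        pass

    results = FastQCResults()
    # summary.txt lines look like:  PASS<TAB>Basic Statistics<TAB><file>
    results.statistics = {}
    results.statistics_order = []
    for line in open(os.path.join(path, "summary.txt")):
        status, statistic, sample_file = line.rstrip("\r\n").split("\t")
        results.statistics[statistic] = status
        results.statistics_order.append(statistic)
    # fastqc_data.txt holds the Basic Statistics as <name><TAB><value>.
    # "Filtered Sequences" is only reported by older FastQC versions
    # (assumption that it is present here).
    for line in open(os.path.join(path, "fastqc_data.txt")):
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) != 2:
            continue
        name, value = cols
        if name == "Filename":
            # Assumption: sample name is the file name minus extension.
            results.sample = os.path.splitext(value)[0]
        elif name == "Total Sequences":
            results.total_sequences = value
        elif name == "Filtered Sequences":
            results.filtered_sequences = value
        elif name == "Sequence length":
            results.sequence_length = value
        elif name == "%GC":
            results.percent_gc = value
    return results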
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    ## data_node, cls_node = antecedents
    ## a, b, c = read_label_file.read(cls_node.identifier)
    ## if len(a) > 1:
    ##     colors = []
    ##     for i in range(5):
    ##         colors.append(cm.hot(i / 5.0, 1))
    ##         colors.append(cm.autumn(i / 5.0, i))
    ##         colors.append(cm.cool(i / 5.0, i))
    ##         colors.append(cm.jet(i / 5.0, i))
    ##         colors.append(cm.spring(i / 5.0, i))
    ##         colors.append(cm.prism(i / 5.0, i))
    ##         colors.append(cm.summer(i / 5.0, i))
    ##         colors.append(cm.winter(i / 5.0, i))
    ##     opts = [colors[int(i)] for i in b]
    ##     legend = [c[int(i)] for i in b]
    ##     plot_pca(data_node.identifier, outfile, opts, legend)

    #num_genes = mlib.get_user_option(
    #    user_options, "pca_num_genes", type=int)
    #assert num_genes >= 5 and num_genes < 1E5
    #metadata["num_genes"] = num_genes

    pcaplot = mlib.get_config("pcaplot", which_assert_file=True)

    prism_file = "prism.txt"
    row_pc_file = "row_components.txt"
    col_pc_file = "col_components.txt"

    sq = parallel.quote
    cmd = [
        sq(pcaplot),
        "--label",
        #"-g", num_genes,
        "--prism_file", prism_file,
        "--row_pc_file", row_pc_file,
        "--col_pc_file", col_pc_file,
        sq(in_data.identifier),
        sq(outfile),
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils

    bam_node, ref_node = antecedents
    #in_filenames = filelib.list_files_in_path(
    #    bam_node.identifier, endswith=".bam", case_insensitive=True)
    in_filenames = module_utils.find_bam_files(bam_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)

    # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
    #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
    picard_jar = alignlib.find_picard_jar("picard")

    jobs = []  # list of (in_filename, out_filename)
    for in_filename in in_filenames:
        p, f = os.path.split(in_filename)
        out_filename = os.path.join(out_path, f)
        x = in_filename, out_filename
        jobs.append(x)

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for x in jobs:
        in_filename, out_filename = x
        x = [
            "java", "-Xmx5g", "-jar", sq(picard_jar), "ReorderSam",
            "I=%s" % sq(in_filename),
            "O=%s" % sq(out_filename),
            "REFERENCE=%s" % ref.fasta_file_full,
            ]
        x = " ".join(x)
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    for x in jobs:
        in_filename, out_filename = x
        filelib.assert_exists_nz(out_filename)
def run(self, network, antecedents, out_attributes, user_options, num_cores, outfile): """given a GEOID get the family soft file""" from genomicode import filelib metadata = {} GSEID = user_options['GSEID'] assert GSEID.startswith('GSE'), 'GSEID %s is not correct' % GSEID metadata["GSEID"] = GSEID download_series_family(GSEID, 300, open(outfile, 'w')) filelib.assert_exists_nz(outfile) #metadata["filesize"] = filelib.filesize(outfile) return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    from genomicode import filelib

    metadata = {}

    cmd = plot_heatmap(in_data.identifier, outfile, {}, user_options)
    metadata["command"] = cmd

    #M = arrayio.read(in_data.identifier)
    #nrow = M.nrow()
    #ncol = M.ncol()
    #ratio = float(nrow) / ncol
    #max_box_height = 20
    #max_box_width = 60
    #if 'hm_width' in user_options:
    #    max_box_width = user_options['hm_width']
    #if 'hm_height' in user_options:
    #    max_box_height = user_options['hm_height']
    #if ratio >= 4:
    #    x, y = graphlib.find_tall_heatmap_size(
    #        nrow, ncol,
    #        max_box_height=max_box_height,
    #        max_box_width=max_box_width,
    #        min_box_height=20, min_box_width=20,
    #        max_megapixels=128)
    #else:
    #    x, y = graphlib.find_wide_heatmap_size(
    #        nrow, ncol,
    #        max_box_height=max_box_height,
    #        max_box_width=max_box_width,
    #        min_box_height=20, min_box_width=20,
    #        max_megapixels=128)
    #command.extend(['-x', str(x), '-y', str(y)])
    #
    #process = subprocess.Popen(
    #    command, shell=False,
    #    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    #error_message = process.communicate()[1]
    #if error_message:
    #    raise ValueError(error_message)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(
    self, network, in_data, out_attributes, user_options, num_cores,
    outfile):
    from genomicode import filelib
    from Betsy import module_utils as mlib
    from plot_signal_heatmap import plot_heatmap

    metadata = {}

    cluster_files = mlib.find_cluster_files(in_data.identifier)
    assert "cdt" in cluster_files
    cmd = plot_heatmap(
        cluster_files["cdt"], outfile, cluster_files, user_options)
    metadata["command"] = cmd

    filelib.assert_exists_nz(outfile)
    return metadata
def main():
    import os
    import argparse
    from genomicode import filelib
    from genomicode import parallel

    p = filelib.tswrite

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("treatment_bam", help="BAM file of treated sample.")
    parser.add_argument("control_bam", help="BAM file of background sample.")
    parser.add_argument("outpath", help="Directory to store the results.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument("--fdr_cutoff", default=0.05, type=float, help="")

    args = parser.parse_args()
    filelib.assert_exists_nz(args.treatment_bam)
    filelib.assert_exists_nz(args.control_bam)
    args.treatment_bam = os.path.realpath(args.treatment_bam)
    args.control_bam = os.path.realpath(args.control_bam)
    assert args.num_procs >= 1 and args.num_procs < 100, \
        "Please specify between 1 and 100 processes."
    assert args.fdr_cutoff > 0.0 and args.fdr_cutoff < 1.0

    # Set up directories to run it on.
    p("Setting up directories.\n")
    if not os.path.exists(args.outpath):
        os.mkdir(args.outpath)

    # Run SPP.
    p("Running spp in %s.\n" % args.outpath)
    sq = parallel.quote
    sppscript = find_sppscript()
    x = sq(args.treatment_bam), sq(args.control_bam), args.fdr_cutoff, \
        args.num_procs
    x = " ".join(map(str, x))
    cmd = "cat %s | R --vanilla %s" % (sppscript, x)
    x = parallel.sshell(cmd, path=args.outpath)
    print x

    p("Done.\n")
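
# find_sppscript is called above but not defined in this section.  A
# minimal sketch, assuming the R driver script ships alongside this
# module; the file name run_spp.R matches the script distributed with
# SPP/phantompeakqualtools, but its location here is an assumption:
def find_sppscript():
    import os

    path = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(path, "run_spp.R")
    assert os.path.exists(filename), "I could not find: %s" % filename
    return filename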
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import arrayio
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import arrayplatformlib as apl
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    M = arrayio.read(in_data.identifier)
    cat2header = apl.categorize_headers(M)
    header = cat2header.get(apl.GENE_SYMBOL)
    if header is None:
        header = cat2header.get(apl.GENE_ID)
    assert header is not None, \
        "I could not find gene IDs or symbols: %s" % in_data.identifier
    metadata["dedup_header"] = header

    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    sq = parallel.quote

    algorithm = out_attributes['unique_genes']
    if algorithm == "average_genes":
        raise NotImplementedError
    elif algorithm == "high_var":
        dedup_cmd = ["--dedup_row_by_var", sq(header)]
    elif algorithm == "first_gene":
        raise NotImplementedError
    else:
        raise AssertionError, "Unknown algorithm: %s" % algorithm

    cmd = [
        sq(slice_matrix),
        ]
    cmd += dedup_cmd
    cmd += [sq(in_data.identifier)]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib

    (bam_node,
     fastqc_summary1_node, fastqc_folder1_node,
     fastqc_summary2_node, fastqc_folder2_node,
     rseqc_node,
     signal1_node,        # TPM
     signal2_node,        # TPM, isoform
     aligned_reads_node,
     signal3_node,        # count
     htseq_reads_node) = antecedents
    filelib.safe_mkdir(out_path)

    FILES = [
        (bam_node.identifier, False, "alignment.bam"),
        (fastqc_summary1_node.identifier, True, "fastqc.no_trim.xls"),
        (fastqc_folder1_node.identifier, False, "fastqc.no_trim"),
        (fastqc_summary2_node.identifier, True, "fastqc.trim.xls"),
        (fastqc_folder2_node.identifier, False, "fastqc.trim"),
        (rseqc_node.identifier, False, "RSeQC"),
        (signal1_node.identifier, True, "expression.gene.tpm"),
        (signal2_node.identifier, True, "expression.isoform.tpm"),
        (aligned_reads_node.identifier, True, "aligned.xls"),
        (signal3_node.identifier, True, "expression.counts"),
        (htseq_reads_node.identifier, True, "mapped.htseq.txt"),
        ]

    for x in FILES:
        orig_filename, is_file, new_file = x
        new_filename = os.path.join(out_path, new_file)

        # Copy or link the data into the right place.
        if is_file:
            filelib.assert_exists_nz(orig_filename)
        else:
            assert filelib.dir_exists(orig_filename), \
                "Directory not found or not directory: %s" % orig_filename
        os.symlink(orig_filename, new_filename)
def make_gsea_command(
    expression_file, class_label_file, gsea_path,
    name1, name2, indexes1, indexes2, permutation_type, database):
    # indexes should be 1-based, not including headers.
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import parselib
    from Betsy import module_utils as mlib
    from Betsy.rules import GSEAAnalysis

    filelib.assert_exists_nz(expression_file)
    filelib.assert_exists_nz(class_label_file)
    assert permutation_type in GSEAAnalysis.GSEA_PERMUTATION
    assert database in GSEAAnalysis.GSEA_DATABASE

    ranges1 = [(i, i + 1) for i in indexes1]
    ranges2 = [(i, i + 1) for i in indexes2]
    indexes1_str = parselib.unparse_ranges(ranges1)
    indexes2_str = parselib.unparse_ranges(ranges2)

    gsea = mlib.get_config("gsea", which_assert_file=True)

    sq = parallel.quote
    cmd = [
        sq(gsea),
        "--name1", name1,
        "--name2", name2,
        "--indexes1", indexes1_str,
        "--indexes2", indexes2_str,
        "--permutation_type", sq(permutation_type),
        "--database", sq(database),
        "--min_match_score", 0.80,
        "--clobber",
        sq(expression_file),
        sq(gsea_path),
        ]
    cmd = " ".join(map(str, cmd))
    return cmd
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    import arrayio
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    metadata = {}

    norm_para = ["variance", "sum_of_squares"]
    assert "gene_normalize" in out_attributes
    normalize = out_attributes["gene_normalize"]
    assert normalize in norm_para, \
        "Invalid normalize option: %s" % normalize

    if normalize == "variance":
        f = file(outfile, 'w')
        M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
        M_n = jmath.safe_norm_mv(M.slice())
        M._X = M_n
        M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
        arrayio.pcl_format.write(M_c, f)
        f.close()
    elif normalize == "sum_of_squares":
        cluster = mlib.get_config("cluster", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(cluster),
            "-f", sq(in_data.identifier),
            "-ng",
            "-u", outfile,
            ]
        cmd = " ".join(map(str, cmd))
        parallel.sshell(cmd)
        metadata["command"] = cmd

        # cluster writes to <outfile>.nrm; move it into place.
        outputfile = outfile + '.nrm'
        filelib.assert_exists_nz(outputfile)
        os.rename(outputfile, outfile)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    import os
    import shutil
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    in_filename = in_data.identifier
    filelib.assert_exists_nz(in_filename)

    vcftools = filelib.which_assert(config.vcftools)

    # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
    #   --out test32
    # Writes stuff to console.  Should capture in log file.
    # Saves file test32.recode.vcf.

    p, f = os.path.split(in_filename)
    s, ext = os.path.splitext(f)
    sample = s

    out_stem = "%s.filtered" % sample
    log_filename = "%s.log" % sample
    # Should create file <out_stem>.recode.vcf.
    outfile = "%s.recode.vcf" % out_stem

    sq = parallel.quote
    cmd = [
        sq(vcftools),
        "--vcf", sq(in_filename),
        "--remove-indels",
        "--recode", "--recode-INFO-all",
        "--out", out_stem,
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, log_filename)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    shutil.copy2(outfile, out_filename)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import shutil
    import arrayio
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filename = in_data.identifier
    filelib.assert_exists_nz(filename)

    # De-duplicate by every single header.  Not sure if this is right.
    MATRIX = arrayio.read(filename)

    # Figure out which columns have duplicates.
    has_dup = []
    for name in MATRIX.row_names():
        annots = MATRIX.row_names(name)
        assert name not in has_dup
        seen = {}
        for annot in annots:
            if annot in seen:
                has_dup.append(name)
                break
            seen[annot] = 1

    if not has_dup:
        shutil.copy2(filename, outfile)
        return

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

    # De-duplicate by each header in turn, feeding the output of one
    # pass into the next so that all the de-duplications accumulate.
    in_file = filename
    for i, name in enumerate(has_dup):
        f = "outfile.%d.txt" % i
        x = [
            sq(slice_matrix),
            "--dedup_row_by_var", sq(name),
            sq(in_file),
            ">&", sq(f),
            ]
        x = " ".join(map(str, x))
        parallel.sshell(x)
        in_file = f
    shutil.copy2(in_file, outfile)
def _make_samtools_filter_cmd(in_bamfile, out_bamfile):
    # samtools view -bF 4 <in.bam> > <out.bam>
    # -F 4 filters out the unmapped reads; -b writes BAM output.
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    filelib.assert_exists_nz(in_bamfile)
    samtools = filelib.which_assert(config.samtools)

    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-bF 4",
        sq(in_bamfile),
        ">",
        sq(out_bamfile),
        ]
    cmd = " ".join(cmd)
    return cmd
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    import arrayio
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    assert module_utils.is_missing(in_data.identifier), 'no missing values'

    M = arrayio.read(in_data.identifier)
    for i in range(M.dim()[0]):
        for j in range(M.dim()[1]):
            if M._X[i][j] is None:
                M._X[i][j] = '0'

    f_out = file(outfile, 'w')
    arrayio.tab_delimited_format.write(M, f_out)
    f_out.close()
    filelib.assert_exists_nz(outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
    from genomicode import filelib
    import add_coverage_to_simplevariantmatrix

    simple_node, coverage_node = antecedents
    filelib.assert_exists_nz(simple_node.identifier)
    filelib.assert_exists_nz(coverage_node.identifier)

    # Figure out if I'm adding coverage data from DNA or RNA.
    #in_attrs = simple_node.data.attributes
    #out_attrs = out_attributes
    #name = "with_rna_coverage"
    #assert name in in_attrs and name in out_attrs
    #is_rna_cov = False
    #if in_attrs[name] == "no" and out_attrs[name] == "yes":
    #    is_rna_cov = True

    add_coverage_to_simplevariantmatrix.add_coverage_to_svm(
        simple_node.identifier, coverage_node.identifier, out_filename,
        True)
def _convert_gene_ids_local(in_platform, out_platform):
    # Return a dictionary of gene_id -> list of converted_ids, or None
    # if these platforms cannot be converted.
    import os
    from genomicode import config
    from genomicode import filelib

    filelib.assert_exists_nz(config.convert_platform)
    x = "%s___%s.txt" % (in_platform, out_platform)
    filename = os.path.join(config.convert_platform, x)
    if not os.path.exists(filename):
        return None

    in2out = {}
    for cols in filelib.read_cols(filename):
        # <in_id> <out_id1> ... <out_idn>
        assert len(cols) >= 2
        in_id = cols[0]
        out_ids = cols[1:]
        in2out[in_id] = out_ids
    return in2out
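
# A usage sketch for _convert_gene_ids_local.  The platform names and
# probe ID below are hypothetical; what actually works depends on which
# <in>___<out>.txt conversion files are installed under
# config.convert_platform:
#   in2out = _convert_gene_ids_local("affy_hg_u133_plus_2", "gene_symbol")
#   if in2out is not None:
#       symbols = in2out.get("1007_s_at", [])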