def relabel(data_file, rename_file, outfile, user_options):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)

    # Make sure sample_header in rename file.
    x = open(rename_file).readline()
    x = x.rstrip("\r\n").split("\t")
    assert sample_header in x, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)
    filelib.assert_exists_nz(outfile)
    return cmd
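
# Illustrative sketch of calling relabel() (not part of the pipeline).  The
# file names and the "Name" header are hypothetical, and passing a plain dict
# as user_options is an assumption; the rename file is expected to be
# tab-delimited with a header row containing the sample_labels_header column.
def _example_relabel():
    user_options = {"sample_labels_header": "Name"}
    cmd = relabel(
        "signal.txt", "rename.txt", "signal.relabeled.txt", user_options)
    print "Ran:", cmd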
def count_duplicates(bam_filename):
    # Return a tuple of (total_reads, duplicated_reads).
    import subprocess
    from genomicode import samtools
    from Betsy import module_utils as mlib

    samtools_bin = mlib.get_config("samtools", which_assert_file=True)
    cmd = [
        samtools_bin,
        "view",
        bam_filename,
        ]
    total = num_dup = 0
    p = subprocess.Popen(
        cmd, bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        stderr=subprocess.PIPE, close_fds=True)
    w, r = p.stdin, p.stdout
    w.close()
    for align in samtools.parse_sam(r):
        if align.flag & 0x400:
            num_dup += 1
        total += 1
    return total, num_dup
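
# Minimal usage sketch for count_duplicates(); the BAM path is hypothetical.
# Flag 0x400 is the SAM "PCR or optical duplicate" bit, so the duplication
# rate is simply num_dup / total.
def _example_count_duplicates():
    total, num_dup = count_duplicates("sample01.bam")
    pct = 100.0 * num_dup / max(total, 1)
    print "%d of %d reads flagged as duplicates (%.1f%%)" % (
        num_dup, total, pct)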
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Preprocess the input file with RMA using preprocess.py.
    Will generate an output file."""
    import os
    import subprocess
    from Betsy import module_utils as mlib

    in_data = antecedents
    # Preprocess the cel file to a text signal file.
    PREPROCESS_BIN = mlib.get_config("preprocess", which_assert_file=True)
    #PREPROCESS_path = config.preprocess
    #PREPROCESS_BIN = filelib.which(PREPROCESS_path)
    #assert PREPROCESS_BIN, 'cannot find the %s' % PREPROCESS_path
    command = ['python', PREPROCESS_BIN, 'RMA', in_data.identifier]
    process = subprocess.Popen(
        command, shell=False, stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    error_message = process.communicate()[1]
    if error_message:
        if "Loading required package: Biobase" not in error_message:
            raise ValueError(error_message)

    outputfiles = os.listdir(".")
    outputfile = None
    for i in outputfiles:
        if i.endswith('.rma'):
            outputfile = i
    assert outputfile, "No output file created."
    os.rename(outputfile, outfile)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import sortlib
    from Betsy import module_utils as mlib

    # Should be a folder of fastqc results.
    fastqc_path = in_data.identifier

    # Find all the FASTQC results.
    x = filelib.list_files_in_path(fastqc_path, endswith="summary.txt")
    x = [os.path.split(x)[0] for x in x]
    paths = x
    assert paths, "No FASTQC files found."

    # Read the results.
    all_results = [read_fastqc_results(x) for x in paths]
    assert all_results

    # Make table where the rows are the samples and the columns
    # are the statistics.
    sample2results = {}
    for x in all_results:
        assert x.sample not in sample2results
        sample2results[x.sample] = x
    all_statistics = all_results[0].statistics_order
    all_samples = sortlib.sort_natural(sample2results)

    table = []
    header = [
        "Sample", "Total Sequences", "Filtered Sequences",
        "Sequence length", "GC"] + all_statistics
    table.append(header)
    for sample in all_samples:
        results = sample2results[sample]
        x1 = [sample]
        x2 = [
            results.total_sequences, results.filtered_sequences,
            results.sequence_length, results.percent_gc]
        x3 = [results.statistics[x] for x in all_statistics]
        x = x1 + x2 + x3
        assert len(x) == len(header)
        table.append(x)

    # Write out the table as text file.
    TXT_FILE = "fastqc_summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(map(str, x))
    handle.close()

    x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
    os.system("%s -b %s > %s" % (x, TXT_FILE, outfile))
    filelib.assert_exists_nz(outfile)
def make_gsea_command(
        expression_file, class_label_file, gsea_path, name1, name2,
        indexes1, indexes2, permutation_type, database):
    # indexes should be 1-based, not including headers.
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import parselib
    from Betsy import module_utils as mlib
    from Betsy.rules import GSEAAnalysis

    filelib.assert_exists_nz(expression_file)
    filelib.assert_exists_nz(class_label_file)
    assert permutation_type in GSEAAnalysis.GSEA_PERMUTATION
    assert database in GSEAAnalysis.GSEA_DATABASE

    ranges1 = [(i, i + 1) for i in indexes1]
    ranges2 = [(i, i + 1) for i in indexes2]
    indexes1_str = parselib.unparse_ranges(ranges1)
    indexes2_str = parselib.unparse_ranges(ranges2)

    gsea = mlib.get_config("gsea", which_assert_file=True)
    sq = parallel.quote
    cmd = [
        sq(gsea),
        "--name1", name1,
        "--name2", name2,
        "--indexes1", indexes1_str,
        "--indexes2", indexes2_str,
        "--permutation_type", sq(permutation_type),
        "--database", sq(database),
        "--min_match_score", 0.80,
        "--clobber",
        sq(expression_file),
        sq(gsea_path),
        ]
    cmd = " ".join(map(str, cmd))
    return cmd
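
# Hedged example of building (not running) a GSEA command.  The file names,
# group names, and indexes are hypothetical; indexes are 1-based column
# numbers, not counting headers.  This assumes GSEA_PERMUTATION and
# GSEA_DATABASE are sequences of allowed values, so the first entry of each
# is used rather than inventing a value.
def _example_make_gsea_command():
    from Betsy.rules import GSEAAnalysis
    cmd = make_gsea_command(
        "signal.txt", "class_labels.cls", "gsea_output",
        "responder", "nonresponder", [1, 2, 3], [4, 5, 6],
        GSEAAnalysis.GSEA_PERMUTATION[0], GSEAAnalysis.GSEA_DATABASE[0])
    print cmd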
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    import arrayio
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    metadata = {}

    norm_para = ["variance", "sum_of_squares"]
    assert "gene_normalize" in out_attributes
    normalize = out_attributes["gene_normalize"]
    assert normalize in norm_para, \
        "Invalid normalize option: %s" % normalize

    if normalize == "variance":
        f = file(outfile, 'w')
        M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
        M_n = jmath.safe_norm_mv(M.slice())
        M._X = M_n
        M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
        arrayio.pcl_format.write(M_c, f)
        f.close()
    elif normalize == "sum_of_squares":
        cluster = mlib.get_config("cluster", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(cluster),
            "-f", sq(in_data.identifier),
            "-ng",
            "-u", outfile,
            ]
        parallel.sshell(cmd)
        metadata["command"] = cmd
        outputfile = outfile + '.nrm'
        filelib.assert_exists_nz(outputfile)
        os.rename(outputfile, outfile)

    filelib.assert_exists_nz(outfile)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    lineplot = mlib.get_config("lineplot", which_assert_file=True)

    gene_names = [
        "ACTB", 60,         # Human beta actin.
        "TUBB", 203068,     # Human beta tubulin.
        "Actb", 22461,      # Mouse beta actin.
        "Tubb4a", 22153,    # Mouse beta tubulin.
        ]
    infile = in_data.identifier

    sq = parallel.quote
    cmd = [
        sq(lineplot),
        "--gene_names", ",".join(map(str, gene_names)),
        "--mar_bottom", 1.50,
        sq(infile),
        sq(outfile),
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(outfile)
    return metadata
def _make_config_file(config_filename, skip_depth_filter=False):
    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    src_config = os.path.join(
        strelka_path, "etc", "strelka_config_bwa_default.ini")
    filelib.assert_exists_nz(src_config)

    lines = open(src_config).readlines()
    assert lines

    # Edit configure options.
    for i in range(len(lines)):
        x = lines[i]
        x = x.strip()
        line = x

        # Make sure skip_depth_filter is correct.
        # isSkipDepthFilters should be set to 1 to skip depth
        # filtration for whole exome or other targeted sequencing data
        #
        # isSkipDepthFilters = 0
        if line.startswith("isSkipDepthFilters"):
            # isSkipDepthFilters = 0
            x = line.split()
            assert len(x) == 3
            assert x[1] == "="
            assert x[2] in ["0", "1"]
            if skip_depth_filter:
                x[2] = "1"
            else:
                x[2] = "0"
            line = " ".join(x)
        lines[i] = line

    lines = [x + "\n" for x in lines]  # replace newline that was stripped.
    open(config_filename, 'w').writelines(lines)
def make_snpeff_command(in_file, genome, out_file, log_file, is_cancer=False,
                        cancer_samples_file=None):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    if is_cancer:
        filelib.assert_exists_nz(cancer_samples_file)

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    sq = parallel.quote
    cmd = [
        "java", "-Xmx16g", "-jar", sq(snpeff),
        ]
    if is_cancer:
        cmd += [
            "-cancer",
            "-cancerSamples", sq(cancer_samples_file),
            ]
    cmd += [
        sq(genome),
        sq(in_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s 1> %s 2> %s" % (cmd, sq(out_file), sq(log_file))
    return cmd
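
# Minimal sketch of building a snpEff command with the helper above.  The
# VCF paths, log file, and genome name ("GRCh37.75") are hypothetical; the
# genome should typically be one of the databases known to snpEff (see
# list_snpeff_databases()).
def _example_make_snpeff_command():
    cmd = make_snpeff_command(
        "variants.vcf", "GRCh37.75", "variants.annotated.vcf", "snpeff.log")
    print cmd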
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    in_data = antecedents
    metadata = {}

    #M = arrayio.read(in_data.identifier)
    #data = jmath.transpose(M._X)
    #tickname = M._col_names['_SAMPLE_NAME']
    #fig = mplgraph.boxplot(
    #    data,
    #    xlabel='Sample Name',
    #    ylabel='Signal',
    #    title='Signal Intensity',
    #    box_label=tickname)
    #fig.savefig(outfile)

    boxplot = mlib.get_config("boxplot", which_assert_file=True)
    sq = parallel.quote
    cmd = [
        sq(boxplot),
        sq(in_data.identifier),
        sq(outfile),
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    filelib.assert_exists_nz(outfile)
    return metadata
def list_snpeff_databases():
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    # Genome   Organism   Status   Bundle   Database download link
    # ------   --------   ------   ------   ----------------------
    sq = parallel.quote
    cmd = [
        "java", "-Xmx16g", "-jar", sq(snpeff), "databases",
        ]
    output = parallel.sshell(cmd)

    header = i_db = None
    databases = []
    for cols in filelib.read_cols(StringIO.StringIO(output)):
        cols = [x.strip() for x in cols]
        if header is None:
            header = cols
            assert "Genome" in header
            i_db = header.index("Genome")
            continue
        assert len(cols) == len(header)
        if cols[0].startswith("---"):
            continue
        db_name = cols[i_db]
        databases.append(db_name)
    return databases
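
# Usage sketch: check whether a particular snpEff genome database is known
# before building the annotation command.  The genome name is hypothetical.
def _example_check_snpeff_database():
    databases = list_snpeff_databases()
    assert "GRCh37.75" in databases, "snpEff database not found."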
def _make_analysis_directory(
        analysis_path, config_file, reference_fa, normal_bam, tumor_bam):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filelib.assert_exists_nz(config_file)
    filelib.assert_exists_nz(reference_fa)
    filelib.assert_exists_nz(normal_bam)
    filelib.assert_exists_nz(tumor_bam)

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    config_pl = os.path.join(
        strelka_path, "bin", "configureStrelkaWorkflow.pl")
    filelib.assert_exists_nz(config_pl)

    # $STRELKA/bin/configureStrelkaWorkflow.pl \
    #   --normal=../test31.bam --tumor=../test32.bam \
    #   --ref=../genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   --config=./config.ini --output-dir=./myAnalysis
    sq = mlib.sq
    cmd = [
        sq(config_pl),
        "--normal", sq(normal_bam),
        "--tumor", sq(tumor_bam),
        "--ref", sq(reference_fa),
        "--config", sq(config_file),
        "--output-dir", sq(analysis_path),
        ]
    cmd = " ".join(cmd)
    parallel.sshell(cmd)
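
# Hedged sketch of how the Strelka helpers _make_config_file and
# _make_analysis_directory might be combined (all paths are hypothetical):
# first write the config file, then configure the analysis directory with it.
def _example_configure_strelka():
    _make_config_file("config.ini", skip_depth_filter=True)
    _make_analysis_directory(
        "myAnalysis", "config.ini", "Homo_sapiens_assembly19.fa",
        "normal.bam", "tumor.bam")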
def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    metadata = {}

    center_alg = {
        'mean': 'a',
        'median': 'm',
        }
    assert "gene_center" in out_attributes
    center = out_attributes['gene_center']
    assert center in center_alg, "Invalid center option: %s" % center
    center_parameter = center_alg[center]

    cluster = mlib.get_config("cluster", which_assert_file=True)
    sq = parallel.quote
    cmd = [
        sq(cluster),
        "-f", sq(in_data.identifier),
        "-cg", center_parameter,
        "-u", outfile,
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)
    metadata["commands"] = [cmd]

    outputfile = outfile + '.nrm'
    filelib.assert_exists_nz(outputfile)
    os.rename(outputfile, outfile)
    return metadata
def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    #from genomicode import config
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    ref_node, gene_node = antecedents
    # Don't copy the whole path.  Just get the fasta file.
    #ref = alignlib.standardize_reference_genome(
    #    ref_node.identifier, out_path, use_symlinks=True)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    gtf_file = gene_node.identifier
    filelib.assert_exists_nz(gtf_file)

    # Symlink the fasta file into the out path.
    filelib.safe_mkdir(out_path)
    x = os.path.join(out_path, ref.fasta_file)
    os.symlink(ref.fasta_file_full, x)

    # rsem-prepare-reference --bowtie --bowtie2 --gtf gtf02.gtf
    #   <reference.fa> <reference_name>
    #
    # <reference_name>.[1234].ebwt      # Bowtie1.
    # <reference_name>.rev.[12].ebwt
    # <reference_name>.[1234].bt2       # Bowtie2.
    # <reference_name>.rev.[12].bt2
    # <reference_name>.chrlist          # RSEM.
    # <reference_name>.grp
    # <reference_name>.idx.fa
    # <reference_name>.n2g.idx.fa
    # <reference_name>.seq
    # <reference_name>.ti
    # <reference_name>.transcripts.fa
    # chrLength.txt                     # STAR
    # chrNameLength.txt
    # chrName.txt
    # chrStart.txt
    # exonGeTrInfo.tab
    # exonInfo.tab
    # gencode.vM8.annotation.gtf
    # geneInfo.tab
    # Genome
    # genomeParameters.txt
    # SA
    # SAindex
    # sjdbInfo.txt
    # sjdbList.fromGTF.out.tab
    # sjdbList.out.tab
    # transcriptInfo.tab
    rsem_prepare = mlib.get_config("rsem_prepare", which_assert_file=True)
    bowtie = mlib.get_config("bowtie", which_assert_file=True)
    bowtie2 = mlib.get_config("bowtie2", which_assert_file=True)
    STAR = mlib.get_config("STAR", which_assert_file=True)

    # RSEM wants the path that contains the executables.
    bowtie = os.path.split(bowtie)[0]
    bowtie2 = os.path.split(bowtie2)[0]
    STAR = os.path.split(STAR)[0]

    sq = parallel.quote
    cmd = [
        sq(rsem_prepare),
        "--num-threads", num_cores,
        "--bowtie", "--bowtie-path", sq(bowtie),
        "--bowtie2", "--bowtie2-path", sq(bowtie2),
        "--star", "--star-path", sq(STAR),
        "--gtf", sq(gtf_file),
        sq(ref.fasta_file_full),
        ref.name,
        ]
    parallel.sshell(cmd, path=out_path)

    # Copy the GTF file into the output path.
    shutil.copy2(gtf_file, out_path)

    assembly = ref.name
    # Check to make sure index was created successfully.
    x1 = ["%s.%d.ebwt" % (assembly, i+1) for i in range(4)]
    x2 = ["%s.rev.%d.ebwt" % (assembly, i+1) for i in range(2)]
    x3 = ["%s.%d.bt2" % (assembly, i+1) for i in range(4)]
    x4 = ["%s.rev.%d.bt2" % (assembly, i+1) for i in range(2)]
    x5 = [
        "%s.chrlist" % assembly,
        "%s.grp" % assembly,
        "%s.idx.fa" % assembly,
        "%s.n2g.idx.fa" % assembly,
        "%s.seq" % assembly,
        "%s.ti" % assembly,
        "%s.transcripts.fa" % assembly,
        ]
    x6 = [
        "chrLength.txt",
        "chrNameLength.txt",
        "chrName.txt",
        "chrStart.txt",
        "exonGeTrInfo.tab",
        "exonInfo.tab",
        "gencode.vM8.annotation.gtf",
        "geneInfo.tab",
        "Genome",
        "genomeParameters.txt",
        "SA",
        "SAindex",
        "sjdbInfo.txt",
        "sjdbList.fromGTF.out.tab",
        "sjdbList.out.tab",
        "transcriptInfo.tab",
        ]
    x = x1 + x2 + x3 + x4 + x5 + x6
    index_files = [os.path.join(out_path, x) for x in x]
    filelib.assert_exists_nz_many(index_files)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out version.

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
    #   vcf_outfile)
    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile
        jobs.append(x)

    # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
    #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   test31/tumor.bam test31/normal.bam test41.vcf
    somaticsniper = mlib.get_config("somaticsniper", which_assert_file=True)

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile = x
        x = [
            sq(somaticsniper),
            "-q", 1,
            "-Q", 15,
            "-G",
            "-L",
            "-F", "vcf",
            "-f", sq(ref.fasta_file_full),
            sq(cancer_bamfile),
            sq(normal_bamfile),
            sq(vcf_outfile),
            ]
        x = " ".join(map(str, x))
        commands.append(x)

    # Not sure how much RAM this takes.
    nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # SomaticSniper names the samples "NORMAL" and "TUMOR".
    # Replace them with the actual names.
    for x in jobs:
        normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
            vcf_outfile = x
        call_somatic_varscan._fix_normal_cancer_names(
            vcf_outfile, normal_sample, cancer_sample)

    x = [x[-1] for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import StringIO
    import arrayio
    from genomicode import arrayplatformlib
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from Betsy import module_utils as mlib

    M = arrayio.read(in_data.identifier)
    metadata = {}

    # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
    # platforms provide each one of these.
    CATEGORIES = [
        arrayplatformlib.GENE_ID,
        arrayplatformlib.GENE_SYMBOL,
        # biomaRt doesn't convert description.  So just ignore it
        # for now.
        # TODO: implement DESCRIPTION.
        #arrayplatformlib.DESCRIPTION,
        ]

    #all_platforms = arrayplatformlib.identify_all_platforms_of_matrix(M)
    #assert all_platforms, "Unknown platform: %s" % in_data.identifier
    #header, platform_name = all_platforms[0]
    scores = arrayplatformlib.score_matrix(M)
    scores = [x for x in scores if x.max_score >= 0.75]
    assert scores, "I could not identify any platforms."

    # Find all the platforms not in the matrix.
    platforms = [
        arrayplatformlib.find_platform_by_name(x.platform_name)
        for x in scores]
    categories = [x.category for x in platforms]
    missing = [x for x in CATEGORIES if x not in categories]

    score = scores[0]
    platform = platforms[0]
    to_add = []  # list of platform names
    for category in missing:
        x = arrayplatformlib.PLATFORMS
        x = [x for x in x if x.category == category]
        x = [x for x in x if x.bm_organism == platform.bm_organism]
        x = [x for x in x if x.name != score.platform_name]
        # Take the first one, if any.
        if x:
            to_add.append(x[0].name)

    if to_add:
        annotate = mlib.get_config(
            "annotate_matrix", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            "python", sq(annotate),
            "--no_na",
            "--header", sq(score.header),
            ]
        for x in to_add:
            x = ["--platform", sq(x)]
            cmd.extend(x)
        cmd.append(in_data.identifier)
        cmd = " ".join(cmd)
        data = parallel.sshell(cmd)
        metadata["commands"] = [cmd]
        assert data.find("Traceback") < 0, data
    else:
        data = open(in_data.identifier).read()

    # Clean up the headers.
    platform2pretty = {
        "Entrez_ID_human": "Gene ID",
        "Entrez_Symbol_human": "Gene Symbol",
        "Entrez_ID_mouse": "Gene ID",
        "Entrez_Symbol_mouse": "Gene Symbol",
        }
    handle = open(outfile, 'w')
    header_written = False
    for cols in filelib.read_cols(StringIO.StringIO(data)):
        if not header_written:
            cols = [platform2pretty.get(x, x) for x in cols]
            cols = AnnotationMatrix.uniquify_headers(cols)
            header_written = True
        print >> handle, "\t".join(cols)
    return metadata