def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import sortlib
    from Betsy import module_utils as mlib

    # Should be a folder of fastqc results.
    fastqc_path = in_data.identifier

    # Find all the FASTQC results.
    x = filelib.list_files_in_path(fastqc_path, endswith="summary.txt")
    x = [os.path.split(x)[0] for x in x]
    paths = x
    assert paths, "No FASTQC files found."

    # Read the results.
    all_results = [read_fastqc_results(x) for x in paths]
    assert all_results

    # Make a table where the rows are the samples and the columns
    # are the statistics.
    sample2results = {}
    for x in all_results:
        assert x.sample not in sample2results
        sample2results[x.sample] = x
    all_statistics = all_results[0].statistics_order
    all_samples = sortlib.sort_natural(sample2results)

    table = []
    header = [
        "Sample", "Total Sequences", "Filtered Sequences",
        "Sequence length", "GC"] + all_statistics
    table.append(header)
    for sample in all_samples:
        results = sample2results[sample]
        x1 = [sample]
        x2 = [
            results.total_sequences, results.filtered_sequences,
            results.sequence_length, results.percent_gc]
        x3 = [results.statistics[x] for x in all_statistics]
        x = x1 + x2 + x3
        assert len(x) == len(header)
        table.append(x)

    # Write out the table as a text file.
    TXT_FILE = "fastqc_summary.txt"
    handle = open(TXT_FILE, 'w')
    for x in table:
        print >> handle, "\t".join(map(str, x))
    handle.close()

    # Convert the text file to an Excel file.
    x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
    os.system("%s -b %s > %s" % (x, TXT_FILE, outfile))
    filelib.assert_exists_nz(outfile)
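
# A minimal sketch of the read_fastqc_results helper used above, assuming
# the standard FastQC output layout: each results folder contains a
# summary.txt (<status>\t<statistic>\t<filename> lines) and a
# fastqc_data.txt whose "Basic Statistics" module holds the counts.  The
# class name and attributes mirror how run() uses the returned object; the
# actual helper in this module may differ.
class FastQCResults:
    def __init__(self):
        self.sample = None
        self.total_sequences = None
        self.filtered_sequences = None
        self.sequence_length = None
        self.percent_gc = None
        self.statistics_order = []   # statistic names, in file order
        self.statistics = {}         # statistic name -> PASS/WARN/FAIL

def read_fastqc_results(path):
    import os
    results = FastQCResults()
    # summary.txt gives the PASS/WARN/FAIL call for each FastQC module.
    for line in open(os.path.join(path, "summary.txt")):
        status, name, filename = line.rstrip("\r\n").split("\t")
        results.statistics_order.append(name)
        results.statistics[name] = status
    # fastqc_data.txt gives the basic counts as two-column lines.
    for line in open(os.path.join(path, "fastqc_data.txt")):
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) != 2:
            continue
        key, value = cols
        if key == "Filename":
            results.sample = os.path.splitext(value)[0]
        elif key == "Total Sequences":
            results.total_sequences = value
        elif key == "Filtered Sequences":
            results.filtered_sequences = value
        elif key == "Sequence length":
            results.sequence_length = value
        elif key == "%GC":
            results.percent_gc = value
    return results
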
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
    from genomicode import filelib

    vcf_node = in_data
    # Some callers, like jointsnvmix, will create vcf files for
    # each chromosome.  To avoid picking these up, only accept
    # .vcf files from the top level.
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", toplevel_only=True)
    assert vcf_filenames, "No .vcf files: %s" % vcf_node.identifier
    metadata = {}

    tmp_path = "indexed.vcf"
    m = merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path)
    metadata.update(m)

    filelib.assert_exists(out_filename)   # may be size 0
    return metadata
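
# A minimal sketch of what merge_vcf_files might do, written as a
# simplified pure-Python concatenation: keep the header lines from the
# first file and append the records from every file.  The signature
# matches the call above; the tmp_path and num_cores arguments suggest the
# real implementation compresses and indexes the files (e.g. with
# bgzip/tabix) before merging, which this sketch does not attempt.
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    handle = open(out_filename, 'w')
    for i, filename in enumerate(vcf_filenames):
        for line in open(filename):
            # Write the meta-data and header lines only once.
            if line.startswith("#") and i > 0:
                continue
            handle.write(line)
    handle.close()
    return {"num_cores": num_cores}
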
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel

    vcf_node = in_data
    vcf_files = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", case_insensitive=True)
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []   # in_vcf_filename, out_vcf_filename
    for vcf_file in vcf_files:
        path, file_ = os.path.split(vcf_file)
        out_vcf_file = os.path.join(out_path, file_)
        x = vcf_file, out_vcf_file
        jobs.append(x)

    # Figure out whether the user wants SNPs or INDELs.
    assert "vartype" in out_attributes
    vartype = out_attributes["vartype"]
    assert vartype in ["all", "snp", "indel"]

    # Generate the commands.
    commands = []
    for x in jobs:
        in_vcf_file, out_vcf_file = x
        args = vartype, in_vcf_file, out_vcf_file
        x = filter_by_vartype, args, {}
        commands.append(x)

    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    x = [x[-1] for x in jobs]
    filelib.assert_exists_many(x)
    return metadata
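
# A minimal sketch of the filter_by_vartype helper scheduled above.  It
# assumes the usual VCF convention that a SNP has REF and ALT alleles of
# length 1 and anything else is an indel; multi-allelic records are kept
# if any ALT allele qualifies.  The real helper may classify variants
# differently.
def filter_by_vartype(vartype, in_vcf_file, out_vcf_file):
    assert vartype in ["all", "snp", "indel"]
    out = open(out_vcf_file, 'w')
    for line in open(in_vcf_file):
        # Always keep the header; keep everything if vartype is "all".
        if line.startswith("#") or vartype == "all":
            out.write(line)
            continue
        cols = line.rstrip("\r\n").split("\t")
        ref, alt = cols[3], cols[4]
        is_snp = any(
            len(ref) == 1 and len(a) == 1 for a in alt.split(","))
        if (vartype == "snp") == is_snp:
            out.write(line)
    out.close()
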
def find_vcf_files(vcf_path):
    # Return list of (<sample>, <filename>).
    import os
    from genomicode import filelib
    #from genomicode import vcflib

    filenames = filelib.list_files_in_path(
        vcf_path, endswith=".vcf", case_insensitive=True)

    # Format:
    # <path>/<sample>.vcf
    vcf_files = []
    for filename in filenames:
        p, f = os.path.split(filename)
        sample = os.path.splitext(f)[0]
        #caller = vcflib.identify_caller(filename)
        #assert caller is not None, "Unknown caller: %s" % filename
        x = sample, filename
        vcf_files.append(x)
    return vcf_files
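
# Example usage of find_vcf_files, for a hypothetical directory "calls/"
# containing sample1.vcf and sample2.vcf:
#   >>> find_vcf_files("calls/")
#   [('sample1', 'calls/sample1.vcf'), ('sample2', 'calls/sample2.vcf')]
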
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils

    path = module_utils.unzip_if_zip(in_data.identifier)
    x = filelib.list_files_in_path(path)
    x = [x for x in x if x.lower().endswith(".idat")]
    assert x, "No idat files."
    in_filenames = x

    if not os.path.exists(out_path):
        os.mkdir(out_path)

    for in_filename in in_filenames:
        in_path, in_file = os.path.split(in_filename)
        file_, ext = os.path.splitext(in_file)
        # Strip the "_Grn" suffix from the file name, e.g.
        # <sample>_Grn.idat -> <sample>.idat.
        if file_.endswith("_Grn"):
            file_ = file_[:-4]
        out_file = "%s%s" % (file_, ext)
        out_filename = os.path.join(out_path, out_file)
        shutil.copyfile(in_filename, out_filename)
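
# A minimal sketch of what module_utils.unzip_if_zip is assumed to do: if
# the identifier points to a zip archive, extract it to a temporary
# directory and return that directory; otherwise return the path
# unchanged.  This is an assumption based on the name and usage; the real
# Betsy helper may behave differently (e.g. cache the extracted files).
def unzip_if_zip(path):
    import tempfile
    import zipfile
    if not zipfile.is_zipfile(path):
        return path
    out_path = tempfile.mkdtemp()
    zf = zipfile.ZipFile(path)
    zf.extractall(out_path)
    zf.close()
    return out_path
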
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import vcflib
    from Betsy import module_utils as mlib

    vcf_node, nc_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf")
    assert vcf_filenames, "No .vcf files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}

    # Filenames:
    # <caller>.vcf

    wgs_or_wes = mlib.get_user_option(
        user_options, "wgs_or_wes", not_empty=True,
        allowed_values=["wgs", "wes"])
    genome = mlib.get_user_option(
        user_options, "snpeff_genome", not_empty=True)
    databases = list_snpeff_databases()
    assert genome in databases, "Unknown genome database: %s" % genome

    # For each caller, do the snpEff calls.  Some callers include
    # the somatic information, others do not.  If germline samples
    # are present, then run with -cancer.  Otherwise, do not.
    # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
    #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

    # Don't bother annotating positions that do not pass filter.
    # Filter them out first based on FILTER column.

    opj = os.path.join
    jobs = []
    for in_filename in vcf_filenames:
        path, stem, ext = mlib.splitpath(in_filename)
        samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
        filtered_filename = opj(out_path, "%s.filtered_input" % stem)
        out_filename = opj(out_path, "%s.vcf" % stem)
        log_filename = opj(out_path, "%s.log" % stem)
        x = filelib.GenericObject(
            in_filename=in_filename,
            stem=stem,
            samples_file=samples_file,
            filtered_filename=filtered_filename,
            out_filename=out_filename,
            log_filename=log_filename)
        jobs.append(x)

    # First, filter each of the VCF files.
    commands = []
    for j in jobs:
        # For debugging.  If this file exists, don't filter it again.
        if os.path.exists(j.filtered_filename):
            continue
        args = j.in_filename, j.filtered_filename, wgs_or_wes
        x = vcflib.filter_vcf_file, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)

    # Make the cancer_samples files.
    for j in jobs:
        # Will generate this if there are cancer samples.
        make_cancer_samples_file(
            j.filtered_filename, nc_match, j.samples_file)

    # Make a list of commands.
    commands = []
    for j in jobs:
        cancer = False
        if os.path.exists(j.samples_file):
            cancer = True
        x = make_snpeff_command(
            j.filtered_filename, genome, j.out_filename, j.log_filename,
            is_cancer=cancer, cancer_samples_file=j.samples_file)
        commands.append(x)

    nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["commands"] = commands
    metadata["num_cores"] = nc

    # Make sure the analysis completed successfully.
    x = [x.out_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Log files should be empty.
    for j in jobs:
        filelib.assert_exists(j.log_filename)
        assert not filelib.exists_nz(j.log_filename), \
               "Error with %s.\n%s" % (j.stem, j.log_filename)
        filelib.safe_unlink(j.log_filename)

    return metadata
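
# A minimal sketch of make_snpeff_command, built directly from the example
# command in the comment above.  Taking the jar path from the $SNPEFF
# environment variable follows that example; the real helper may locate
# the jar and set the memory limit differently.
def make_snpeff_command(in_filename, genome, out_filename, log_filename,
                        is_cancer=False, cancer_samples_file=None):
    import os
    from Betsy import module_utils as mlib
    snpeff_jar = os.environ["SNPEFF"]
    sq = mlib.sq
    cmd = ["java", "-Xmx16g", "-jar", sq(snpeff_jar), "-v"]
    if is_cancer:
        assert cancer_samples_file
        cmd += ["-cancer", "-cancerSamples", sq(cancer_samples_file)]
    cmd += [sq(genome), sq(in_filename)]
    # Annotated VCF goes to stdout; progress messages go to stderr.
    cmd += ["1>", sq(out_filename), "2>", sq(log_filename)]
    return " ".join(cmd)
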
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # Since this is I/O heavy, don't use so many cores.  Also,
    # takes 4-5 Gb RAM per process.
    MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)

    fastq_node, sample_node, summary_node = antecedents
    fastq_path = fastq_node.identifier
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_path)
    assert fastq_files, "I could not find any FASTQ files."
    summary_filenames = filelib.list_files_in_path(
        summary_node.identifier, endswith=".matches.txt")
    assert summary_filenames, "No .matches.txt files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    num_mismatches = mlib.get_user_option(
        user_options, "num_mismatches", type=int)
    assert num_mismatches >= 0 and num_mismatches < 25
    metadata["num_mismatches"] = num_mismatches

    sample2summary = {}   # sample -> summary_filename
    for filename in summary_filenames:
        # <sample>.matches.txt
        p, f = os.path.split(filename)
        assert f.endswith(".matches.txt")
        sample = f.replace(".matches.txt", "")
        assert sample not in sample2summary
        sample2summary[sample] = filename

    # list of (sample, fastq_file1, fastq_file2, summary_filename,
    #   out_file1, out_file2, subtracted_file1, subtracted_file2)
    jobs = []
    for x in fastq_files:
        sample, pair1_fastq, pair2_fastq = x
        assert sample in sample2summary, \
               "Missing summary for sample: %s" % sample
        p1, f1 = os.path.split(pair1_fastq)
        if pair2_fastq:
            p2, f2 = os.path.split(pair2_fastq)
            assert p1 == p2
        out1_fastq = os.path.join(out_path, f1)
        sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
        out2_fastq = None
        sub2_fastq = None
        if pair2_fastq:
            out2_fastq = os.path.join(out_path, f2)
            sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
        x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
            out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
        jobs.append(x)

    jobs2 = []   # list of (function, args, keywds)
    for x in jobs:
        sample, pair1_fastq, pair2_fastq, summary_file, \
                out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
        x = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
            num_mismatches
        x = subtract_mouse_reads, x, {}
        jobs2.append(x)
        if pair2_fastq:
            x = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                num_mismatches
            x = subtract_mouse_reads, x, {}
            jobs2.append(x)

    nc = min(MAX_CORES, num_cores)
    results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
    assert len(results) == len(jobs2)
    metadata["num_cores"] = nc

    # Make sure the fastq files were generated.
    x1 = [x[4] for x in jobs]
    x2 = [x[5] for x in jobs]
    x = x1 + x2
    x = [x for x in x if x]
    # BUG: If all reads were removed, then this will fail incorrectly.
    filelib.assert_exists_nz_many(x)

    return metadata
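
# A minimal sketch of subtract_mouse_reads, under the assumption that each
# .matches.txt file is a tab-delimited table of (read name, number of
# mismatches against the mouse genome).  Reads that match mouse with at
# most num_mismatches mismatches are diverted to the .subtracted file;
# everything else is kept.  The actual file format and policy in this
# module may differ.
def subtract_mouse_reads(summary_file, in_fastq, out_fastq, sub_fastq,
                         num_mismatches):
    # Collect the names of the reads to subtract.
    mouse_reads = set()
    for line in open(summary_file):
        cols = line.rstrip("\r\n").split("\t")
        name, mismatches = cols[0], int(cols[1])
        if mismatches <= num_mismatches:
            mouse_reads.add(name)

    out = open(out_fastq, 'w')
    sub = open(sub_fastq, 'w')
    handle = open(in_fastq)
    while True:
        # Each FASTQ record is 4 lines: @name, sequence, +, quality.
        record = [handle.readline() for i in range(4)]
        if not record[0]:
            break
        name = record[0][1:].split()[0]
        h = sub if name in mouse_reads else out
        h.writelines(record)
    out.close()
    sub.close()
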
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    import os
    from genomicode import filelib
    from genomicode import alignlib

    count_path = in_data.identifier
    assert os.path.exists(count_path)
    assert os.path.isdir(count_path)

    result_files = filelib.list_files_in_path(
        count_path, endswith=".count")
    assert result_files, "No .count files found."

    # Parse the count files.
    name2results = {}
    for filename in result_files:
        x = os.path.split(filename)[1]
        x = os.path.splitext(x)[0]
        name = x
        assert name not in name2results
        x = alignlib.parse_htseq_count_output(filename)
        name2results[name] = x
    assert name2results, "No samples"

    # Assemble into a summary matrix.
    # Rows:
    # no_feature
    # ambiguous
    # too_low_aQual
    # not_aligned
    # alignment_not_unique
    # total_mapped
    # total_fragments
    # perc_mapped
    # perc_no_feature
    # perc_ambiguous
    ROWS = [
        "no_feature", "ambiguous", "too_low_aQual", "not_aligned",
        "alignment_not_unique",
        ]

    all_names = sorted(name2results)
    matrix = []
    header = ["Feature"] + all_names
    matrix.append(header)
    for rn in ROWS:
        x = [rn] + [getattr(name2results[n], rn) for n in all_names]
        assert len(x) == len(header)
        matrix.append(x)

    # Count the total_mapped and total_fragments.
    total_mapped = []
    total_fragments = []
    perc_mapped = []
    perc_no_feature = []
    perc_ambiguous = []
    for n in all_names:
        # Sum up the counts.
        results = name2results[n]
        tm, tf, pm = "", "", ""
        pnf, pamb = "", ""
        if not results.errors:
            x1 = sum(results.counts.values())
            x2 = 0
            for rn in ROWS:
                x2 += getattr(results, rn)
            tm = x1
            tf = x1 + x2
            pm = tm / float(tf)
            pnf = results.no_feature / float(tf)
            pamb = results.ambiguous / float(tf)
        total_mapped.append(tm)
        total_fragments.append(tf)
        perc_mapped.append(pm)
        perc_no_feature.append(pnf)
        perc_ambiguous.append(pamb)
    x1 = ["total_mapped"] + total_mapped
    x2 = ["total_fragments"] + total_fragments
    x3 = ["perc_mapped"] + perc_mapped
    x4 = ["perc_no_feature"] + perc_no_feature
    x5 = ["perc_ambiguous"] + perc_ambiguous
    assert len(x1) == len(header)
    assert len(x2) == len(header)
    assert len(x3) == len(header)
    assert len(x4) == len(header)
    assert len(x5) == len(header)
    matrix.append(map(str, x1))
    matrix.append(map(str, x2))
    matrix.append(map(str, x3))
    matrix.append(map(str, x4))
    matrix.append(map(str, x5))

    # Write the data file.
    handle = open(outfile, 'w')
    for x in matrix:
        print >> handle, "\t".join(map(str, x))
    handle.close()
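
# A minimal sketch of what alignlib.parse_htseq_count_output is assumed to
# return, based on how run() uses it.  htseq-count writes a two-column
# table of (feature, count) followed by special __no_feature, __ambiguous,
# __too_low_aQual, __not_aligned, and __alignment_not_unique rows; the
# genomicode implementation may also capture error messages from the run.
def parse_htseq_count_output(filename):
    from genomicode import filelib
    results = filelib.GenericObject(
        counts={}, errors=[], no_feature=0, ambiguous=0, too_low_aQual=0,
        not_aligned=0, alignment_not_unique=0)
    for line in open(filename):
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) != 2:
            # Anything that is not a (feature, count) pair is an error.
            results.errors.append(line.strip())
            continue
        name, count = cols[0], int(cols[1])
        if name.startswith("__"):
            # e.g. "__no_feature" -> attribute "no_feature".
            setattr(results, name[2:], count)
        else:
            results.counts[name] = count
    return results
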
def run(self, network, antecedents, out_attributes, user_options,
        num_cores, outfile):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    svm_node, vcf_node = antecedents
    vcf_filenames = filelib.list_files_in_path(
        vcf_node.identifier, endswith=".vcf", not_empty=True)
    metadata = {}

    # 1.  vcf_filenames
    # 2.  parsed_snpeff_files    one for each VCF file
    # 3.  merged_snpeff_file     just one file
    # 4.  cleaned_snpeff_file    clean up the annotations to final form
    # 5.  outfile

    merged_snpeff_file = "snpeff.merged.txt"
    cleaned_snpeff_file = "snpeff.clean.txt"

    jobs = []
    for vcf_filename in vcf_filenames:
        path, caller, ext = mlib.splitpath(vcf_filename)
        parsed_snpeff_file = "%s.parsed.txt" % caller
        j = filelib.GenericObject(
            caller=caller,
            vcf_filename=vcf_filename,
            parsed_snpeff_file=parsed_snpeff_file,
            )
        jobs.append(j)

    # Parse each of the snpEff files.
    commands = []
    for j in jobs:
        args = j.vcf_filename, j.parsed_snpeff_file
        # Debugging.  If this file exists, do not generate it again.
        if os.path.exists(j.parsed_snpeff_file):
            continue
        x = parse_snpeff_file, args, {}
        commands.append(x)
    parallel.pyfun(commands, num_procs=num_cores)
    metadata["num_cores"] = num_cores

    # Merge the parsed files.
    x = [j.parsed_snpeff_file for j in jobs]
    x = [x for x in x if os.path.exists(x)]
    parsed_files = x
    # For debugging, don't regenerate if I don't need to.
    if not filelib.exists_nz(merged_snpeff_file):
        merge_parsed_files(parsed_files, merged_snpeff_file)

    # Clean up the snpEff file.  Coordinates should be unique.
    # For debugging, don't regenerate if I don't need to.
    if not filelib.exists_nz(cleaned_snpeff_file):
        clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

    # Merge the snpEff annotations into the SimpleVariantMatrix.
    add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)

    return metadata
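
# A minimal sketch of merge_parsed_files, assuming each parsed snpEff file
# is a tab-delimited table with a single header line: keep the header from
# the first file and concatenate the data rows from all of them.  The real
# helper may also reconcile differing columns across callers.
def merge_parsed_files(parsed_files, out_filename):
    handle = open(out_filename, 'w')
    for i, filename in enumerate(parsed_files):
        for j, line in enumerate(open(filename)):
            if j == 0 and i > 0:
                continue   # skip repeated header lines
            handle.write(line)
    handle.close()
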
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    fastq_node, sai_node, orient_node, sample_node, reference_node = \
        antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    sai_path = sai_node.identifier
    assert filelib.dir_exists(sai_path)
    orient = mlib.read_orientation(orient_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

    # Technically, doesn't need the SampleGroupFile, since that's
    # already reflected in the sai data.  But better, because the
    # sai data might not always be generated by BETSY.

    # Find the sai files.
    sai_filenames = filelib.list_files_in_path(
        sai_path, endswith=".sai", case_insensitive=True)
    assert sai_filenames, "No .sai files."

    bwa = mlib.findbin("bwa")

    # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
    # bwa sampe -f <output.sam> <reference.fa> <input_1.sai> <input_2.sai>
    #   <input_1.fq> <input_2.fq>

    # list of (sample, pair1.fq, pair1.sai, pair2.fq, pair2.sai,
    #   output.sam, output.log); all full paths
    jobs = []
    for x in fastq_files:
        sample, pair1_fq, pair2_fq = x

        # The sai file should be in the format:
        # <sai_path>/<sample>.sai      Single end read
        # <sai_path>/<sample>_1.sai    Paired end read
        # <sai_path>/<sample>_2.sai    Paired end read
        # Look for pair1_sai and pair2_sai.
        pair1_sai = pair2_sai = None
        for sai_filename in sai_filenames:
            p, s, e = mlib.splitpath(sai_filename)
            assert e == ".sai"
            if s == sample:
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_1" % (sample):
                assert not pair1_sai
                pair1_sai = sai_filename
            elif s == "%s_2" % (sample):
                assert not pair2_sai
                pair2_sai = sai_filename
        assert pair1_sai, "Missing .sai file: %s" % sample
        if pair2_fq:
            assert pair2_sai, "Missing .sai file 2: %s" % sample
        if pair2_sai:
            assert pair2_fq, "Missing .fq file 2: %s" % sample

        sam_filename = os.path.join(out_path, "%s.sam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)

        x = sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
            sam_filename, log_filename
        jobs.append(x)

    orientation = orient.orientation
    #orientation = sample_node.data.attributes["orientation"]
    assert orientation in ["single", "paired_fr", "paired_rf"]

    # Make a list of bwa commands.
    sq = mlib.sq
    commands = []
    for x in jobs:
        sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
                sam_filename, log_filename = x
        if orientation == "single":
            assert not pair2_fq
            assert not pair2_sai

        samse = "samse"
        if orientation.startswith("paired"):
            samse = "sampe"

        x = [
            sq(bwa),
            samse,
            "-f", sq(sam_filename),
            sq(ref.fasta_file_full),
            ]
        if orientation == "single":
            x += [
                sq(pair1_sai),
                sq(pair1_fq),
                ]
        else:
            y = [
                sq(pair1_sai),
                sq(pair2_sai),
                sq(pair1_fq),
                sq(pair2_fq),
                ]
            # For rf orientation, swap the order of the pairs.
            if orientation == "paired_rf":
                y = [
                    sq(pair2_sai),
                    sq(pair1_sai),
                    sq(pair2_fq),
                    sq(pair1_fq),
                    ]
            x += y
        x += [
            ">&", sq(log_filename),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x = [x[-2] for x in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
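
# For reference, the command generated above for a hypothetical paired,
# fr-oriented sample "S1" would look like:
#   bwa sampe -f out/S1.sam ref.fa S1_1.sai S1_2.sai \
#     S1_1.fq S1_2.fq >& out/S1.log
# (">&" redirects both stdout and stderr to the log, which is why the SAM
# output must go through -f rather than stdout.)
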