def run( self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib #from genomicode import parallel from genomicode import hashlib from Betsy import module_utils as mlib # TODO: Merge with merge_variants_snp.py. #CALLERS = [ # "gatk", "platypus", "varscan", # ] vcf_paths = [x.identifier for x in antecedents] nodes = [x.data for x in antecedents] CALLERS = [x.attributes["caller"] for x in nodes] assert len(CALLERS) == len(vcf_paths) filelib.safe_mkdir(out_path) metadata = {} # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file) jobs = [] for i, caller in enumerate(CALLERS): inpath = vcf_paths[i] caller_h = hashlib.hash_var(caller) vcf_files = filelib.list_files_in_path( inpath, endswith=".vcf", toplevel_only=True) for file_ in vcf_files: # IN_FILE: <inpath>/<sample>.vcf # OUT_FILE: <out_path>/<caller>.vcf/<sample>.vcf p, sample, e = mlib.splitpath(file_) assert e == ".vcf" out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h) out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample) x = filelib.GenericObject( sample=sample, caller=caller, out_vcf_path=out_vcf_path, in_vcf_file=file_, out_vcf_file=out_vcf_file) jobs.append(x) # Make sure the same samples are found in all callers. caller2samples = {} for j in jobs: if j.caller not in caller2samples: caller2samples[j.caller] = [] caller2samples[j.caller].append(j.sample) comp_samples = None for caller, samples in caller2samples.iteritems(): samples = sorted(samples) if comp_samples is None: comp_samples = samples assert comp_samples == samples, "%s %s" % (comp_samples, samples) for j in jobs: filelib.safe_mkdir(j.out_vcf_path) os.symlink(j.in_vcf_file, j.out_vcf_file) return metadata
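# A minimal, standalone sketch of the sample-consistency check above: group
# sample names by caller and assert that every caller produced the same set
# of samples.  This assumes job objects with .caller and .sample attributes,
# as built above; the helper name is hypothetical.
def assert_same_samples_per_caller(jobs):
    caller2samples = {}
    for j in jobs:
        caller2samples.setdefault(j.caller, []).append(j.sample)
    comp_samples = None
    for caller, samples in sorted(caller2samples.items()):
        samples = sorted(samples)
        if comp_samples is None:
            comp_samples = samples
        assert comp_samples == samples, "%s: %s != %s" % (
            caller, comp_samples, samples)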
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from Betsy import module_utils as mlib import merge_vcf_folder vcffolders_node = antecedents filelib.safe_mkdir(out_path) metadata = {} x = os.listdir(vcffolders_node.identifier) x = [x for x in x if x.endswith(".vcf")] assert x, "No VCF folders found: %s" % vcffolders_node.identifier x = [os.path.join(vcffolders_node.identifier, x) for x in x] vcf_folders = x jobs = [] for folder in vcf_folders: path, root, ext = mlib.splitpath(folder) assert ext == ".vcf" caller = root vcf_filenames = filelib.list_files_in_path(folder, endswith=".vcf", toplevel_only=True) assert vcf_filenames, "No .vcf files: %s" % folder out_filename = os.path.join(out_path, "%s.vcf" % root) tmp_path = "%s.indexed.vcf" % caller x = filelib.GenericObject(caller=caller, vcf_filenames=vcf_filenames, out_filename=out_filename, tmp_path=tmp_path) jobs.append(x) for j in jobs: m = merge_vcf_folder.merge_vcf_files(j.vcf_filenames, j.out_filename, num_cores, j.tmp_path) if "commands" not in metadata: metadata["commands"] = [] metadata["commands"].extend(m["commands"]) x = [x.out_filename for x in jobs] filelib.assert_exists_many(x) return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel import filter_variants_GATK vcf_node = in_data vcf_filenames = filelib.list_files_in_path(vcf_node.identifier, endswith=".vcf", not_empty=True) assert vcf_filenames, "No VCF files found." filelib.safe_mkdir(out_path) metadata = {} # Figure out whether the user wants SNPs or INDELs. assert "vartype" in out_attributes vartype = out_attributes["vartype"] assert vartype in ["snp", "indel"] metadata["filter"] = vartype jobs = [] # list of filelib.GenericObject for in_filename in vcf_filenames: p, f = os.path.split(in_filename) out_filename = os.path.join(out_path, f) x = filelib.GenericObject(in_filename=in_filename, out_filename=out_filename) jobs.append(x) # Filter each of the VCF files. jobs2 = [] for j in jobs: args = vartype, j.in_filename, j.out_filename x = filter_variants_GATK.filter_by_vartype, args, {} jobs2.append(x) parallel.pyfun(jobs2, num_procs=num_cores) metadata["num_cores"] = num_cores return metadata
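# The filtering above delegates to filter_variants_GATK.filter_by_vartype.
# As a rough, hypothetical illustration (an assumption, not that module's
# actual logic): a VCF record can be treated as a SNP when REF and every ALT
# allele are single bases, and as an indel otherwise.
def classify_vartype(ref, alt):
    # ref is the REF column, alt the comma-separated ALT column of a record.
    alleles = alt.split(",")
    if len(ref) == 1 and all(len(a) == 1 for a in alleles):
        return "snp"
    return "indel"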
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib bam_node, ref_node, insert_size_node, alignment_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # ./pindel -f <reference.fa> -i <bam_configuration_file> # -c <chromosome_name> -o <out_prefix> # -T <num threads> # # Creates files: # <out_prefix>_D Deletion # <out_prefix>_SI Short insertion # <out_prefix>_LI Long insertion # <out_prefix>_INV Inversion # <out_prefix>_TD Tandem deletion # <out_prefix>_BP Breakpoint # <out_prefix>_RP ??? read pair??? # <out_prefix>_CloseEndMapped Only on end could be mapped. # Pindel cannot handle spaces in the BAM filenames (because of # the config file). Symlink the file to a local directory to make # sure there are no spaces. bam_path = "bam" opj = os.path.join jobs = [] # list of filelib.GenericObject for bam_filename in bam_filenames: p, f = os.path.split(bam_filename) sample, ext = os.path.splitext(f) bai_filename = "%s.bai" % bam_filename filelib.assert_exists_nz(bai_filename) x = sample.replace(" ", "_") local_bam = opj(bam_path, "%s.bam" % x) local_bai = opj(bam_path, "%s.bam.bai" % x) config_filename = opj(out_path, "%s.config.txt" % sample) out_prefix = opj(out_path, sample) log_filename = opj(out_path, "%s.log" % sample) x = filelib.GenericObject(sample=sample, bam_filename=bam_filename, bai_filename=bai_filename, local_bam=local_bam, local_bai=local_bai, config_filename=config_filename, out_prefix=out_prefix, log_filename=log_filename) jobs.append(x) filelib.safe_mkdir(bam_path) for j in jobs: assert " " not in j.local_bam filelib.assert_exists_nz(j.bam_filename) filelib.assert_exists_nz(j.bai_filename) if not os.path.exists(j.local_bam): os.symlink(j.bam_filename, j.local_bam) if not os.path.exists(j.local_bai): os.symlink(j.bai_filename, j.local_bai) # Read the insert sizes. summary_file = opj(insert_size_node.identifier, "summary.txt") filelib.assert_exists_nz(summary_file) sample2size = _read_insert_sizes(summary_file) # Make sure all the samples have inserts. for j in jobs: assert j.sample in sample2size, \ "Missing in insert size file: %s" % j.sample # Read the fragment sizes. summary_file = opj(alignment_node.identifier, "summary.txt") filelib.assert_exists_nz(summary_file) sample2readlen = _read_fragment_sizes(summary_file) # Make sure all the samples have read lengths. for j in jobs: assert j.sample in sample2readlen, \ "Missing in alignment summary file: %s" % j.sample # Make the config file. for j in jobs: # <insert size> is the whole length to be sequenced, including # the length of the pair of reads. Picard only counts the # sequence between the reads. size = sample2size[j.sample] read_length = sample2readlen[j.sample] insert_size = size + read_length * 2 handle = open(j.config_filename, 'w') print >> handle, "%s %s %s" % (j.local_bam, insert_size, j.sample) handle.close() # Make a list of commands. 
pindel = mlib.get_config("pindel", which_assert_file=True) sq = parallel.quote commands = [] for j in jobs: cmd = [ sq(pindel), "-f", sq(ref.fasta_file_full), "-i", sq(j.config_filename), "-c", "ALL", "-T", 1, "-o", sq(j.out_prefix), ] cmd = " ".join(map(str, cmd)) cmd = "%s >& %s" % (cmd, j.log_filename) commands.append(cmd) parallel.pshell(commands, max_procs=num_cores) metadata["num_cores"] = num_cores metadata["commands"] = commands # Make sure the analysis completed successfully. If not, try # to diagnose. x = [x.log_filename for x in jobs] filelib.assert_exists_nz_many(x) x1 = ["%s_D" % x.out_prefix for x in jobs] x2 = ["%s_SI" % x.out_prefix for x in jobs] x3 = ["%s_LI" % x.out_prefix for x in jobs] x4 = ["%s_INV" % x.out_prefix for x in jobs] x5 = ["%s_TD" % x.out_prefix for x in jobs] x6 = ["%s_BP" % x.out_prefix for x in jobs] x = x1 + x2 + x3 + x4 + x5 + x6 filelib.assert_exists_many(x) return metadata
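# Pindel writes one output file per variant class using the suffixes listed
# in the comments above.  A small sketch (helper name hypothetical) that
# builds the filenames checked at the end of the module:
PINDEL_SUFFIXES = ["_D", "_SI", "_LI", "_INV", "_TD", "_BP"]

def expected_pindel_files(out_prefix):
    return ["%s%s" % (out_prefix, suffix) for suffix in PINDEL_SUFFIXES]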
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib bam_node, nc_node, ref_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." nc_match = mlib.read_normal_cancer_file(nc_node.identifier) ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # TODO: Figure out version. # Figure out whether the user wants SNPs or INDELs. #assert "vartype" in out_attributes #vartype = out_attributes["vartype"] #assert vartype in ["all", "snp", "indel"] # sample -> bam filename sample2bamfile = mlib.root2filename(bam_filenames) # Make sure files exist for all the samples. mlib.assert_normal_cancer_samples(nc_match, sample2bamfile) # list of (cancer_sample, normal_bamfile, tumor_bamfile, orig_outfile, # fixed_outfile, filtered_outfile) opj = os.path.join jobs = [] for (normal_sample, cancer_sample) in nc_match: normal_bamfile = sample2bamfile[normal_sample] cancer_bamfile = sample2bamfile[cancer_sample] path, sample, ext = mlib.splitpath(cancer_bamfile) orig_outfile = opj(out_path, "%s.raw" % sample) fix_outfile = opj(out_path, "%s.vcf" % sample) #filter_outfile = opj(out_path, "%s.vcf" % sample) x = cancer_sample, normal_bamfile, cancer_bamfile, \ orig_outfile, fix_outfile x = filelib.GenericObject(cancer_sample=cancer_sample, normal_bamfile=normal_bamfile, cancer_bamfile=cancer_bamfile, orig_outfile=orig_outfile, fix_outfile=fix_outfile) jobs.append(x) # python /usr/local/museq/classify.py \ # normal:test31/normal.bam tumour:test31/tumor.bam \ # reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \ # model:/usr/local/museq/model_v4.1.2.npz \ # --config /usr/local/museq/metadata.config \ # -o test51.vcf opj = os.path.join museq = mlib.get_config("museq", assert_exists=True) classify_py = opj(museq, "classify.py") model_file = opj(museq, "model_v4.1.2.npz") config_file = opj(museq, "metadata.config") filelib.assert_exists_nz(classify_py) filelib.assert_exists_nz(model_file) filelib.assert_exists_nz(config_file) # museq's config file generates a broken VCF file. Fix it. fixed_config_file = "fixed.config" fix_config_file(config_file, fixed_config_file) # Generate the commands. sq = mlib.sq commands = [] for j in jobs: #cancer_sample, normal_bamfile, cancer_bamfile, \ # raw_outfile, fix_outfile, vcf_outfile = x x = [ "python", # should allow user to specify python sq(classify_py), sq("normal:%s" % j.normal_bamfile), sq("tumour:%s" % j.cancer_bamfile), sq("reference:%s" % ref.fasta_file_full), sq("model:%s" % model_file), "--config", sq(fixed_config_file), "-o", sq(j.orig_outfile), ] x = " ".join(map(str, x)) commands.append(x) # Not sure how much RAM this takes. On Thunderbolts test, # took < 1 Gb. nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores) parallel.pshell(commands, max_procs=nc) metadata["num_cores"] = nc metadata["commands"] = commands # JointSNVMix produces non-standard VCF files. Fix this so it # will work with other programs downstream. for j in jobs: #cancer_sample, normal_bamfile, cancer_bamfile, \ # raw_outfile, fix_outfile, vcf_outfile = x fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile) # Filter each of the VCF files. 
    #for j in jobs:
    #    cancer_sample, normal_bamfile, cancer_bamfile, \
    #        raw_outfile, fix_outfile, vcf_outfile = x
    #    filter_by_vartype(vartype, fix_outfile, vcf_outfile)
    #metadata["filter"] = vartype

    #x = [x[-1] for x in jobs]
    x = [j.fix_outfile for j in jobs]
    filelib.assert_exists_many(x)

    return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils

    bam_filenames = module_utils.find_bam_files(in_data.identifier)
    assert bam_filenames, "No .bam files."
    filelib.safe_mkdir(out_path)
    metadata = {}

    jobs = []
    for in_filename in bam_filenames:
        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        sample = hashlib.hash_var(s)
        log_filename = os.path.join(out_path, "%s.log" % s)
        out_filename = os.path.join(out_path, f)
        x = filelib.GenericObject(
            in_filename=in_filename, sample=sample,
            log_filename=log_filename, out_filename=out_filename)
        jobs.append(x)

    gid = "group1"
    library = "library"
    platform_unit = "platform"
    #sample = "sample"
    platform = "illumina"

    # java -Xmx5g -jar AddOrReplaceReadGroups.jar
    #   I=<input.sam or .bam> O=<output.bam> ID=<group ID>
    #   LB=<group library> PU=<platform unit> SM=<group sample name>
    #   PL=<platform> CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT
    picard_jar = alignlib.find_picard_jar("picard")

    # Make a list of commands.
    sq = parallel.quote
    commands = []
    for j in jobs:
        x = [
            "java", "-Xmx5g", "-jar", sq(picard_jar),
            "AddOrReplaceReadGroups",
            "I=%s" % sq(j.in_filename),
            "O=%s" % sq(j.out_filename),
            "ID=%s" % gid,
            "LB=%s" % library,
            "PU=%s" % platform_unit,
            "SM=%s" % j.sample,
            "PL=%s" % platform,
            #"CREATE_INDEX=true",
            "VALIDATION_STRINGENCY=LENIENT",
            ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(j.log_filename))
        commands.append(x)

    parallel.pshell(commands, max_procs=num_cores)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    # Make sure the analysis completed successfully.
    # Make sure outfiles exist.
    out_filenames = [j.out_filename for j in jobs]
    filelib.assert_exists_nz_many(out_filenames)

    # Check the log files to make sure there are no errors.
    for j in jobs:
        check_log_file(j.log_filename)

    return metadata
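# Standalone sketch of the AddOrReplaceReadGroups call assembled above, using
# only the standard library.  The jar path, filenames, and read-group values
# are placeholders taken from the defaults in the module.
import subprocess

def add_read_groups(picard_jar, in_bam, out_bam, sample, gid="group1",
                    library="library", platform_unit="platform",
                    platform="illumina"):
    cmd = [
        "java", "-Xmx5g", "-jar", picard_jar, "AddOrReplaceReadGroups",
        "I=%s" % in_bam, "O=%s" % out_bam,
        "ID=%s" % gid, "LB=%s" % library, "PU=%s" % platform_unit,
        "SM=%s" % sample, "PL=%s" % platform,
        "VALIDATION_STRINGENCY=LENIENT",
        ]
    subprocess.check_call(cmd)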
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib #import call_variants_GATK bam_node, ref_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # Figure out whether the user wants SNPs or INDELs. #assert "vartype" in out_attributes #vartype = out_attributes["vartype"] #assert vartype in ["all", "snp", "indel"] # Platypus generates an error if there are spaces in the BAM # filename. Symlink the file to a local directory to make # sure there are no spaces. bam_path = "bam" jobs = [] # list of filelib.GenericObject for bam_filename in bam_filenames: p, f = os.path.split(bam_filename) sample, ext = os.path.splitext(f) bai_filename = "%s.bai" % bam_filename filelib.assert_exists_nz(bai_filename) x = sample.replace(" ", "_") local_bam = os.path.join(bam_path, "%s.bam" % x) local_bai = os.path.join(bam_path, "%s.bam.bai" % x) log_filename = os.path.join(out_path, "%s.log" % sample) err_filename = os.path.join(out_path, "%s.err" % sample) # Unfiltered file. #raw_filename = os.path.join(out_path, "%s.raw" % sample) # Final VCF file. out_filename = os.path.join(out_path, "%s.vcf" % sample) x = filelib.GenericObject(bam_filename=bam_filename, bai_filename=bai_filename, local_bam=local_bam, local_bai=local_bai, log_filename=log_filename, err_filename=err_filename, out_filename=out_filename) jobs.append(x) filelib.safe_mkdir(bam_path) for j in jobs: assert " " not in j.local_bam filelib.assert_exists_nz(j.bam_filename) filelib.assert_exists_nz(j.bai_filename) if not os.path.exists(j.local_bam): os.symlink(j.bam_filename, j.local_bam) if not os.path.exists(j.local_bai): os.symlink(j.bai_filename, j.local_bai) # TODO: Keep better track of the metadata. buffer_size = 100000 max_reads = 5E6 # Running into errors sometimes, so increase these numbers. # WARNING - Too many reads (5000000) in region # 1:500000-600000. Quitting now. Either reduce --bufferSize or # increase --maxReads. buffer_size = buffer_size * 10 max_reads = max_reads * 10 # Make a list of commands. commands = [] for j in jobs: #nc = max(1, num_cores/len(jobs)) x = alignlib.make_platypus_command(bam_file=j.local_bam, ref_file=ref.fasta_file_full, log_file=j.log_filename, out_file=j.out_filename, buffer_size=buffer_size, max_reads=max_reads) x = "%s >& %s" % (x, j.err_filename) commands.append(x) #for x in commands: # print x #import sys; sys.exit(0) parallel.pshell(commands, max_procs=num_cores) # Make sure the analysis completed successfully. If not, try # to diagnose. for j in jobs: if filelib.exists_nz(j.out_filename): continue for line in open(j.err_filename): if line.find("WARNING - Too many reads") >= 0: print line, x = [j.out_filename for j in jobs] filelib.assert_exists_nz_many(x) # Filter each of the VCF files. #for j in jobs: # call_variants_GATK.filter_by_vartype( # vartype, j.raw_filename, j.out_filename) #metadata["filter"] = vartype return metadata
def run( self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib bam_node, ref_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # java -jar picard.jar CollectAlignmentSummaryMetrics \ # R=reference_sequence.fasta \ # I=input.bam \ # O=output.txt opj = os.path.join jobs = [] # list of filelib.GenericObject for bam_filename in bam_filenames: # <in_path>/<sample>.bam in_path, sample, ext = mlib.splitpath(bam_filename) assert ext == ".bam" out_filename = opj(out_path, "%s.alignment_metrics.txt" % sample) log_filename = opj(out_path, "%s.log" % sample) x = filelib.GenericObject( sample=sample, bam_filename=bam_filename, out_filename=out_filename, log_filename=log_filename) jobs.append(x) # Make the commands to run picard. picard_jar = alignlib.find_picard_jar("picard") sq = parallel.quote commands = [] for j in jobs: # Should have better way of getting java path. cmd = [ "java", "-Xmx10g", "-jar", sq(picard_jar), "CollectAlignmentSummaryMetrics", "I=%s" % sq(j.bam_filename), "R=%s" % sq(ref.fasta_file_full), "O=%s" % sq(j.out_filename), ] cmd = " ".join(cmd) cmd = "%s >& %s" % (cmd, sq(j.log_filename)) commands.append(cmd) metadata["commands"] = commands parallel.pshell(commands, max_procs=num_cores) x = [x.out_filename for x in jobs] filelib.assert_exists_nz_many(x) # Summarize the insert size files. outfile = opj(out_path, "summary.txt") _summarize_alignment_summary_metrics(jobs, outfile) filelib.assert_exists_nz(outfile) return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import parallel from genomicode import filelib from genomicode import alignlib from Betsy import module_utils as mlib fastq_node, sample_node, ref_node = antecedents fastq_files = mlib.find_merged_fastq_files(sample_node.identifier, fastq_node.identifier) ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) # Do a quick check to make sure the reference is correct. # Otherwise, error may be hard to disgnose. alignlib.assert_is_STAR_reference(ref.path) metadata = {} metadata["tool"] = "STAR %s" % alignlib.get_STAR_version() # Figure out the strandedness. is_stranded = False # STAR --runThreadN 40 --genomeDir test05 \ # --readFilesIn test.fastq/test03_R1_001.fastq \ # test.fastq/test03_R2_001.fastq --outFileNamePrefix test06. # If unstranded, add --outSAMstrandField intronMotif # Make a list of the jobs to run. jobs = [] # list of filelib.GenericObject objects for x in fastq_files: sample, pair1, pair2 = x out_prefix = "%s." % sample bam_filename = os.path.join(out_path, "%sAligned.out.bam" % out_prefix) log_filename = os.path.join(out_path, "%s.log" % sample) x = filelib.GenericObject( sample=sample, pair1=pair1, pair2=pair2, out_prefix=out_prefix, bam_filename=bam_filename, log_filename=log_filename, ) jobs.append(x) # Run pass 1. commands = [] for j in jobs: x = os.path.join(out_path, j.out_prefix) cmd = alignlib.make_STAR_command(ref.path, x, num_cores, is_stranded, j.pair1, j.pair2, j.log_filename) # For debugging. If this file already exists, skip it. if not filelib.exists_nz(j.bam_filename): parallel.sshell(cmd, path=out_path) filelib.assert_exists_nz(j.bam_filename) commands.append(cmd) metadata["commands"] = commands metadata["num_cores"] = num_cores return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores, out_path): import os from genomicode import config from genomicode import filelib from genomicode import parallel from genomicode import alignlib #from genomicode import hashlib from Betsy import module_utils in_filenames = module_utils.find_bam_files(in_data.identifier) assert in_filenames, "No .bam files." filelib.safe_mkdir(out_path) metadata = {} metadata["tool"] = "samtools %s" % alignlib.get_samtools_version() jobs = [] #seen = {} for i, in_filename in enumerate(in_filenames): p, f = os.path.split(in_filename) temp_prefix = "temp_%s" % f #temp_prefix = "temp_%s" % hashlib.hash_var(f) # Make sure no duplicates. #assert temp_prefix not in seen #seen[temp_prefix] = 1 #temp_outfilename = "%d.bam" % i out_filename = os.path.join(out_path, f) x = filelib.GenericObject( in_filename=in_filename, temp_prefix=temp_prefix, #temp_outfilename=temp_outfilename, out_filename=out_filename) jobs.append(x) samtools = filelib.which_assert(config.samtools) # Calculate the number of threads per process. nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores) num_threads = max(nc / len(jobs), 1) # Make a list of samtools commands. # Without -m, takes ~1 Gb per process. sq = parallel.quote commands = [] for j in jobs: # Usage has changed. Below no longer valid. # samtools sort <in_filename> <out_filestem> # .bam automatically added to <out_filestem>, so don't # need it. #x = out_filename #assert x.endswith(".bam") #x = x[:-4] #out_filestem = x x = [ sq(samtools), "sort", "-O", "bam", "-T", sq(j.temp_prefix), "-m", "4G", # Crashing, so try increasing memory. sq(j.in_filename), #"-o", sq(j.temp_outfilename), "-o", sq(j.out_filename), ] if num_threads > 1: x += ["-@", num_threads] x = " ".join(map(str, x)) commands.append(x) metadata["commands"] = commands metadata["num_cores"] = nc parallel.pshell(commands, max_procs=nc) #for cmd in commands: # parallel.sshell(cmd) #for j in jobs: # # Move the temporary files to the final location. # shutil.move(j.temp_outfilename, j.out_filename) # Make sure the analysis completed successfully. x = [j.out_filename for j in jobs] filelib.assert_exists_nz_many(x) return metadata
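# Rough sketch of the resource arithmetic above: cap the number of concurrent
# sorts by RAM (about 4 Gb each, matching -m 4G), then split those processes
# across the jobs to pick a thread count for -@.  The total-RAM figure is a
# placeholder; the real module uses module_utils.calc_max_procs_from_ram.
def calc_procs_and_threads(num_cores, num_jobs, gb_per_proc=4, total_gb=64):
    max_by_ram = max(1, int(total_gb // gb_per_proc))
    num_procs = max(1, min(num_cores, max_by_ram))
    threads_per_job = max(1, num_procs // num_jobs)
    return num_procs, threads_per_job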
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import vcflib from Betsy import module_utils as mlib vcf_node, nc_node = antecedents vcf_filenames = filelib.list_files_in_path(vcf_node.identifier, endswith=".vcf") assert vcf_filenames, "No .vcf files." nc_match = mlib.read_normal_cancer_file(nc_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # Filenames: # <caller>.vcf wgs_or_wes = mlib.get_user_option(user_options, "wgs_or_wes", not_empty=True, allowed_values=["wgs", "wes"]) genome = mlib.get_user_option(user_options, "snpeff_genome", not_empty=True) databases = list_snpeff_databases() assert genome in databases, "Unknown genome database: %s" % genome # For each caller, do the SnpEFF calls. Some callers include # the somatic information, others do not. If germline samples # are present, then do with _cancer. Otherwise, do not. # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt # GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log # Don't bother annotating positions that do not pass filter. # Filter them out first based on FILTER column. opj = os.path.join jobs = [] for in_filename in vcf_filenames: path, stem, ext = mlib.splitpath(in_filename) samples_file = opj(out_path, "%s.cancerSamples.txt" % stem) filtered_filename = opj(out_path, "%s.filtered_input" % stem) out_filename = opj(out_path, "%s.vcf" % stem) log_filename = opj(out_path, "%s.log" % stem) x = filelib.GenericObject(in_filename=in_filename, samples_file=samples_file, filtered_filename=filtered_filename, out_filename=out_filename, log_filename=log_filename) jobs.append(x) # First, filter each of the VCF files. commands = [] for j in jobs: # For debugging. If this file exists, don't filter it again. if os.path.exists(j.filtered_filename): continue args = j.in_filename, j.filtered_filename, wgs_or_wes x = vcflib.filter_vcf_file, args, {} commands.append(x) parallel.pyfun(commands, num_procs=num_cores) # Make the cancer_samples files. for j in jobs: # Will generate this if there are cancer samples. make_cancer_samples_file(j.filtered_filename, nc_match, j.samples_file) # Make a list of commands. commands = [] for j in jobs: cancer = False if os.path.exists(j.samples_file): cancer = True x = make_snpeff_command(j.filtered_filename, genome, j.out_filename, j.log_filename, is_cancer=cancer, cancer_samples_file=j.samples_file) commands.append(x) nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores) parallel.pshell(commands, max_procs=nc) metadata["commands"] = commands metadata["num_cores"] = nc # Make sure the analysis completed successfully. x = [x.out_filename for x in jobs] filelib.assert_exists_nz_many(x) # Log files should be empty. for j in jobs: filelib.assert_exists(j.log_filename) assert not filelib.exists_nz(j.log_filename), \ "Error with %s.\n%s" % (j.stem, j.log_filename) filelib.safe_unlink(j.log_filename) return metadata
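# Sketch of the snpEff command built by make_snpeff_command, based on the
# example in the comments above.  The jar path, genome name, and filenames
# are placeholders; -cancer/-cancerSamples are added only when a cancer
# samples file was generated.
def snpeff_command(snpeff_jar, genome, in_vcf, out_vcf, log_file,
                   cancer_samples_file=None):
    cmd = ["java", "-Xmx16g", "-jar", snpeff_jar, "-v"]
    if cancer_samples_file:
        cmd += ["-cancer", "-cancerSamples", cancer_samples_file]
    cmd += [genome, in_vcf]
    return "%s 1> %s 2> %s" % (" ".join(cmd), out_vcf, log_file)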
def calc_gsea(expression_file, class_label_file, user_options, num_cores,
              out_path, permutation_type, database):
    import os
    import arrayio
    from genomicode import parallel
    from genomicode import arraysetlib
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import genesetlib
    from Betsy import module_utils as mlib

    names, classes = arraysetlib.read_cls_file(class_label_file)
    assert names
    assert len(names) >= 2, ("At least 2 classes needed for GSEA analysis. "
                             "Found only: %s" % (names[0]))

    # Make sure there are the same number of samples in the class
    # label file as in the gene expression file.
    MATRIX = arrayio.read(expression_file)
    assert MATRIX.ncol() == len(classes), (
        "Mismatch: expression (%d) classes (%d)" % (
            MATRIX.ncol(), len(classes)))
    # Make sure classes go from [0, len(names))
    for i in classes:
        assert i >= 0 and i < len(names)

    fdr_cutoff = mlib.get_user_option(
        user_options, "gsea_fdr_cutoff", not_empty=True, type=float)
    assert fdr_cutoff > 0 and fdr_cutoff <= 1

    # Find all combinations of names and classes.
    opj = os.path.join
    jobs = []
    for i1 in range(len(names) - 1):
        for i2 in range(i1 + 1, len(names)):
            N1 = names[i1]
            N2 = names[i2]
            # Indexes should be 1-based.
            I1 = [i + 1 for i in range(len(classes)) if classes[i] == i1]
            I2 = [i + 1 for i in range(len(classes)) if classes[i] == i2]
            N1_h = hashlib.hash_var(N1)
            N2_h = hashlib.hash_var(N2)
            stem = "%s.vs.%s" % (N1_h, N2_h)
            gsea_path = opj(out_path, "%s.%s.gsea" % (stem, database))
            x = filelib.GenericObject(
                N1=N1, N2=N2, I1=I1, I2=I2, stem=stem, gsea_path=gsea_path)
            jobs.append(x)

    permutation_types = {}
    commands = []
    for j in jobs:
        # Need at least 3 samples for "phenotype" permutations.  If
        # there are fewer samples, then set to "gene_set".
        if len(j.I1) < 3 or len(j.I2) < 3:
            permutation_type = "gene_set"
        permutation_types[permutation_type] = 1
        cmd = make_gsea_command(
            expression_file, class_label_file, j.gsea_path, j.N1, j.N2,
            j.I1, j.I2, permutation_type, database)
        commands.append(cmd)
    for cmd in commands:
        parallel.sshell(cmd)

    # Summarize results.
    # Make a geneset file.
    significant = []
    for j in jobs:
        x = find_significant_gene_sets(j.gsea_path, j.N1, j.N2, fdr_cutoff)
        significant.append(x)

    genesets = []
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        gs_name1 = "%s_%s" % (j.stem, j.N1)
        gs_name2 = "%s_%s" % (j.stem, j.N2)
        gs1 = genesetlib.GeneSet(gs_name1, "", genes1)
        gs2 = genesetlib.GeneSet(gs_name2, "", genes2)
        genesets.extend([gs1, gs2])
    x = "genesets.fdr_%g.gmt" % fdr_cutoff
    geneset_file = opj(out_path, x)
    genesetlib.write_gmt(geneset_file, genesets)

    # Count the number of significant gene sets.
    x = "num_genesets.fdr_%g.txt" % fdr_cutoff
    summary_file = opj(out_path, x)
    handle = open(summary_file, 'w')
    header = "Group 1", "Group 2", "Gene Sets in Group 1", \
             "Gene Sets in Group 2"
    print >> handle, "\t".join(header)
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        x = j.N1, j.N2, len(genes1), len(genes2)
        assert len(x) == len(header)
        print >> handle, "\t".join(map(str, x))
    handle.close()

    return commands, sorted(permutation_types)
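# Standalone sketch of the pairwise enumeration above: for every pair of
# class names, collect the 1-based column indexes of the samples in each
# class (GSEA expects 1-based indexes).  The helper name is hypothetical.
def pairwise_comparisons(names, classes):
    pairs = []  # list of (name1, name2, indexes1, indexes2)
    for i1 in range(len(names) - 1):
        for i2 in range(i1 + 1, len(names)):
            I1 = [i + 1 for i in range(len(classes)) if classes[i] == i1]
            I2 = [i + 1 for i in range(len(classes)) if classes[i] == i2]
            pairs.append((names[i1], names[i2], I1, I2))
    return pairs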
def run(self, network, antecedents, out_attributes, user_options, num_cores, outfile): import os from genomicode import filelib from genomicode import parallel from Betsy import module_utils as mlib svm_node, vcf_node = antecedents vcf_filenames = filelib.list_files_in_path(vcf_node.identifier, endswith=".vcf", not_empty=True) metadata = {} # 1. vcf_filenames # 2. parsed_snpeff_files one for each VCF file # 3. merged_snpeff_file just one file # 4. clean_snpeff_file clean up the annotations to final form # 5. outfile merged_snpeff_file = "snpeff.merged.txt" cleaned_snpeff_file = "snpeff.clean.txt" jobs = [] for vcf_filename in vcf_filenames: path, caller, ext = mlib.splitpath(vcf_filename) parsed_snpeff_file = "%s.parsed.txt" % caller j = filelib.GenericObject( caller=caller, vcf_filename=vcf_filename, parsed_snpeff_file=parsed_snpeff_file, ) jobs.append(j) # Parse each of the snpeff files. commands = [] for j in jobs: args = j.vcf_filename, j.parsed_snpeff_file # Debugging. If this file exists, do not generate it # again. if os.path.exists(j.parsed_snpeff_file): continue x = parse_snpeff_file, args, {} commands.append(x) parallel.pyfun(commands, num_procs=num_cores) metadata["num_cores"] = num_cores # Merge the parsed files. x = [j.parsed_snpeff_file for j in jobs] x = [x for x in x if os.path.exists(x)] parsed_files = x # For debugging, don't regenerate if I don't need to. if not filelib.exists_nz(merged_snpeff_file): merge_parsed_files(parsed_files, merged_snpeff_file) # Clean up the snpEff file. Coordinates should be unique. # For debugging, don't regenerate if I don't need to. if not filelib.exists_nz(cleaned_snpeff_file): clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file) # Merge the snpEff annotations into the SimpleVariantMatrix. add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile) return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, orient_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    ref = alignlib.create_reference_genome(reference_node.identifier)
    assert os.path.exists(ref.fasta_file_full)
    orient = mlib.read_orientation(orient_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

    # Bowtie2 doesn't handle files with spaces in them.  Make
    # temporary files without spaces.

    # Make a list of the jobs to run.
    jobs = []
    for i, x in enumerate(fastq_files):
        sample, pair1, pair2 = x
        bam_filename = os.path.join(out_path, "%s.bam" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        sample_h = hashlib.hash_var(sample)
        temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
        temp_pair2 = None
        if pair2:
            temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
        j = filelib.GenericObject(
            sample=sample, pair1=pair1, pair2=pair2,
            temp_pair1=temp_pair1, temp_pair2=temp_pair2,
            bam_filename=bam_filename, log_filename=log_filename)
        jobs.append(j)

    for j in jobs:
        os.symlink(j.pair1, j.temp_pair1)
        if j.pair2:
            os.symlink(j.pair2, j.temp_pair2)

    # Generate bowtie2 commands for each of the files.
    attr2orient = {
        "single": None,
        "paired_fr": "fr",
        "paired_rf": "rf",
        "paired_ff": "ff",
        }
    orientation = attr2orient[orient.orientation]
    #x = sample_node.data.attributes["orientation"]
    #orientation = attr2orient[x]

    # Takes ~4 Gb per job.
    samtools = mlib.findbin("samtools")
    sq = parallel.quote
    commands = []
    for j in jobs:
        #sample, pair1, pair2, bam_filename, log_filename = x
        nc = max(1, num_cores / len(jobs))

        # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
        #   2> test.log | samtools view -bS -o test.bam -
        x1 = alignlib.make_bowtie2_command(
            ref.fasta_file_full, j.temp_pair1, fastq_file2=j.temp_pair2,
            orientation=orientation, num_threads=nc)
        x2 = [
            sq(samtools),
            "view",
            "-bS",
            "-o", sq(j.bam_filename),
            "-",
            ]
        x2 = " ".join(x2)
        x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
        #x = "%s >& %s" % (x, sq(log_filename))
        commands.append(x)
    metadata["commands"] = commands

    parallel.pshell(commands, max_procs=num_cores)

    # Make sure the analysis completed successfully.
    x1 = [x.bam_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)

    return metadata
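# Sketch of the pipeline built above as one shell string: align with bowtie2
# and convert SAM to BAM on the fly with samtools view, keeping stderr (the
# alignment summary) in a log file.  Paths are placeholders; the real module
# builds the bowtie2 part with alignlib.make_bowtie2_command.
def bowtie2_to_bam_command(index, fq1, fq2, bam_file, log_file,
                           orientation="fr", num_threads=1):
    align = "bowtie2 -p %d -x %s -1 %s -2 %s --%s" % (
        num_threads, index, fq1, fq2, orientation)
    convert = "samtools view -bS -o %s -" % bam_file
    return "%s 2> %s | %s" % (align, log_file, convert)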
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import ngslib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils as mlib bam_node, ref_node = antecedents bam_filenames = mlib.find_bam_files(bam_node.identifier) ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} features_bed = mlib.get_user_option(user_options, "features_bed", check_file=True) if features_bed: metadata["features_bed"] = features_bed # Applies to genomecov. min_coverage = user_options.get("ignore_coverage_below") if min_coverage == "": min_coverage = None if min_coverage is not None: min_coverage = int(min_coverage) assert min_coverage >= 0 metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version() metadata["num_cores"] = num_cores metadata["commands"] = [] # Set up the filenames. # list of ( # sample, # orig_bam_filename, Original bam filename. # bam_filename, bam file, after filtering out unmapped reads. # genomecov_filename, Generated by genomecov. Histogram. # histo_datafile, Data file to generate histogram (from cov). # histo_plotfile, Histogram plot. # histo_prismfile, To make histogram in PRISM. # # ONLY USED IF features_bed # intervallist_file, Made from BED file. # cov_filename, Generated by Picard. # targetcov_filename, Generated by Picard. Per target coverage. # log_filename, Output from Picard. # ) opj = os.path.join jobs = [] # list of filelib.GenericObject for bam_filename in bam_filenames: # <in_path>/<sample>.bam in_path, sample, ext = mlib.splitpath(bam_filename) assert ext == ".bam" clean_bam_filename = opj(out_path, "%s.bam" % sample) assert clean_bam_filename != bam_filename genomecov_filename = opj(out_path, "%s.genomecov.txt" % sample) histo_datafile = opj(out_path, "%s.histo.txt" % sample) histo_plotfile = opj(out_path, "%s.histo.png" % sample) histo_prismfile = opj(out_path, "%s.prism.txt" % sample) intervallist_file = opj(out_path, "%s.interval.txt" % sample) cov_filename = opj(out_path, "%s.coverage.txt" % sample) targetcov_filename = opj(out_path, "%s.targetcov.txt" % sample) log_filename = opj(out_path, "%s.picard.log" % sample) x = filelib.GenericObject(sample=sample, orig_bam_filename=bam_filename, bam_filename=clean_bam_filename, genomecov_filename=genomecov_filename, histo_datafile=histo_datafile, histo_plotfile=histo_plotfile, histo_prismfile=histo_prismfile, intervallist_file=intervallist_file, cov_filename=cov_filename, targetcov_filename=targetcov_filename, log_filename=log_filename) #x = sample, bam_filename, genomecov_filename, \ # histo_datafile, histo_plotfile, histo_prismfile, \ # intervallist_file, cov_filename, targetcov_filename, \ # log_filename jobs.append(x) # Remove unmapped reads from the BAM files. # Need to remove the unmapped reads or Picard might complain: # Exception in thread "main" # htsjdk.samtools.SAMFormatException: SAM validation error: # ERROR: Record 154286082, Read name # DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0 # for unmapped read. # # This can happen with BWA generated alignments. cmds = [] for x in jobs: x = _make_samtools_filter_cmd(x.orig_bam_filename, x.bam_filename) cmds.append(x) parallel.pshell(cmds, max_procs=num_cores) x = [x.bam_filename for x in jobs] filelib.assert_exists_nz_many(x) # Generate the intervallist_file(s). 
if features_bed: cmds = [] for x in jobs: args = x.intervallist_file, features_bed, x.bam_filename x = _make_intervallist_file, args, {} cmds.append(x) parallel.pyfun(cmds, num_procs=num_cores) # Make the commands to run picard. if features_bed: commands = [] for x in jobs: x = _make_calculatehsmetrics_command( x.intervallist_file, x.bam_filename, x.cov_filename, x.targetcov_filename, ref.fasta_file_full, x.log_filename) commands.append(x) metadata["commands"].append(commands) parallel.pshell(commands, max_procs=num_cores) x1 = [x.cov_filename for x in jobs] x2 = [x.targetcov_filename for x in jobs] filelib.assert_exists_nz_many(x1 + x2) # Use genomecov to count read depth. x = _run_genomecov(jobs, ref_node.identifier, num_cores) metadata["commands"].append(x) # Summarize the average read depth. summary_file = opj(out_path, "summary.xls") _summarize_average_read_depth(jobs, min_coverage, summary_file) # Make histograms of the distribution of the read depth for # each sample. for x in jobs: _make_histo_file(x.genomecov_filename, x.histo_datafile) # Delete the filtered BAM files to save space. for x in jobs: filelib.assert_exists_nz(x.bam_filename) os.unlink(x.bam_filename) return metadata
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path): # Put indexed files in tmp_path. import os import stat import shutil from genomicode import filelib from genomicode import hashlib from genomicode import parallel from Betsy import module_utils as mlib # TODO: find the version number of these tools. bgzip = mlib.findbin("bgzip") tabix = mlib.findbin("tabix") bcftools = mlib.findbin("bcftools") sq = parallel.quote tmp_path = os.path.realpath(tmp_path) filelib.safe_mkdir(tmp_path) # Keep track of all commands run. metadata = {} metadata["commands"] = [] # Ignore VCF files that don't have any variants. vcf_filenames = [x for x in vcf_filenames if os.stat(x)[stat.ST_SIZE] > 0] # If there are no VCF files with any variants, then just create an # empty outfile and return. if not vcf_filenames: open(out_filename, 'w') return # 1. Copy VCF files to temporary directory. tmp_filename # 2. Fix VCF files (e.g. NextGENe, JointSNVMix broken) # 3. Sort the VCF files (needed for tabix) # 4. Compress (bgzip) # 5. Index (tabix) # 6. Merge jobs = [] for in_filename in vcf_filenames: path, root, ext = mlib.splitpath(in_filename) sample = root x = "%s%s" % (hashlib.hash_var(root), ext) tmp_filename = os.path.join(tmp_path, x) x = filelib.GenericObject( sample=sample, in_filename=in_filename, tmp_filename=tmp_filename, ) jobs.append(x) # Make sure temporary files are unique. seen = {} for j in jobs: assert j.tmp_filename not in seen seen[j.tmp_filename] = 1 # Merge them in order of sample. The germline sample will be # duplicated, and we will know the order of the germline sample. schwartz = [(x.sample, x) for x in jobs] schwartz.sort() jobs = [x[-1] for x in schwartz] # Copy all the VCF files to a temporary directory. for j in jobs: shutil.copy2(j.in_filename, j.tmp_filename) #for j in jobs: # make_file_smaller(j.tmp_filename, 1000) for j in jobs: # NextGENe creates broken VCF files. Fix them. fix_nextgene_vcf(j.tmp_filename) # JointSNVMix creates broken VCF files. Fix them. fix_jointsnvmix_vcf(j.tmp_filename) for j in jobs: sort_vcf_file(j.tmp_filename) ## # Since we are merging the files, we need to make sure that ## # each file has a unique name. If the names aren't unique, ## # then make them unique by adding the name of the file. ## all_unique = True ## seen = {} ## for x in jobs: ## sample, in_filename, tmp_filename = x ## samples = _get_samples_from_vcf(tmp_filename) ## for s in samples: ## if s in seen: ## all_unique = False ## break ## seen[s] = 1 ## if not all_unique: ## break ## if not all_unique: ## for x in jobs: ## sample, in_filename, tmp_filename = x ## _uniquify_samples_in_vcf(tmp_filename, sample) # Compress the VCF files. # bgzip file.vcf commands = [] for j in jobs: x = "%s %s" % (sq(bgzip), sq(j.tmp_filename)) commands.append(x) parallel.pshell(commands, max_procs=num_cores, path=tmp_path) metadata["commands"].extend(commands) metadata["num_cores"] = num_cores x = ["%s.gz" % x.tmp_filename for x in jobs] filelib.assert_exists_nz_many(x) # Index the VCF files. # tabix -p vcf file.vcf.gz commands = [] for j in jobs: x = "%s -p vcf %s.gz" % (sq(tabix), sq(j.tmp_filename)) commands.append(x) parallel.pshell(commands, max_procs=num_cores, path=tmp_path) metadata["commands"].extend(commands) x = ["%s.gz.tbi" % j.tmp_filename for j in jobs] filelib.assert_exists_nz_many(x) # Run bcftools ## For VCF files from somatic calls, the germline sample will ## be duplicated. Add --force-samples to make sure this is ## still merged. 
# Since we need to append all the VCF files, it's easy to run # into error: # OSError: [Errno 7] Argument list too long # # To reduce the chance of this, figure out the path of the # tmp_filename, and run the analysis in that path so we can # use relative filenames. tmp_path = None for j in jobs: path, file_ = os.path.split(j.tmp_filename) if tmp_path is None: tmp_path = path assert path == tmp_path cmd = [ sq(bcftools), "merge", "-o %s" % sq(out_filename), "-O v", "--force-samples", ] for j in jobs: path, file_ = os.path.split(j.tmp_filename) assert path == tmp_path cmd.append("%s.gz" % file_) x = " ".join(cmd) parallel.sshell(x, path=tmp_path) metadata["commands"].append(x) return metadata
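# The compress/index/merge sequence above, condensed into a standalone sketch
# with subprocess.  Filenames are placeholders; the flags mirror the commands
# built above (bgzip, tabix -p vcf, bcftools merge -O v --force-samples).
import subprocess

def compress_index_merge(vcf_files, out_vcf):
    for f in vcf_files:
        subprocess.check_call(["bgzip", f])                    # writes f.gz
        subprocess.check_call(["tabix", "-p", "vcf", f + ".gz"])
    cmd = ["bcftools", "merge", "-o", out_vcf, "-O", "v", "--force-samples"]
    cmd += [f + ".gz" for f in vcf_files]
    subprocess.check_call(cmd)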
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import parallel from genomicode import filelib from genomicode import alignlib from Betsy import module_utils as mlib fastq_node, sample_node, strand_node, ref_node = antecedents fastq_files = mlib.find_merged_fastq_files(sample_node.identifier, fastq_node.identifier) ref = alignlib.create_reference_genome(ref_node.identifier) stranded = mlib.read_stranded(strand_node.identifier) filelib.safe_mkdir(out_path) # Do a quick check to make sure the reference is correct. # Otherwise, error may be hard to disgnose. alignlib.assert_is_STAR_reference(ref.path) metadata = {} metadata["tool"] = "STAR %s" % alignlib.get_STAR_version() x = mlib.get_user_option(user_options, "two_pass", allowed_values=["no", "yes"]) two_pass = (x == "yes") # Figure out the strandedness. is_stranded = stranded.stranded != "unstranded" # STAR --runThreadN 40 --genomeDir test05 \ # --readFilesIn test.fastq/test03_R1_001.fastq \ # test.fastq/test03_R2_001.fastq --outFileNamePrefix test06. # If unstranded, add --outSAMstrandField intronMotif # Make a list of the jobs to run. jobs = [] # list of filelib.GenericObject objects for x in fastq_files: sample, pair1, pair2 = x pass1_out_prefix = "p1.%s." % sample pass2_out_prefix = "%s." % sample pass1_bam_filename = os.path.join( out_path, "%sAligned.out.bam" % pass1_out_prefix) pass2_bam_filename = os.path.join( out_path, "%sAligned.out.bam" % pass2_out_prefix) sjdb_filename = os.path.join(out_path, "p1.%s.SJ.out.tab" % sample) log1_filename = os.path.join(out_path, "p1.%s.log" % sample) log2_filename = os.path.join(out_path, "%s.log" % sample) x = filelib.GenericObject( sample=sample, pair1=pair1, pair2=pair2, pass1_out_prefix=pass1_out_prefix, pass2_out_prefix=pass2_out_prefix, pass1_bam_filename=pass1_bam_filename, pass2_bam_filename=pass2_bam_filename, sjdb_filename=sjdb_filename, log1_filename=log1_filename, log2_filename=log2_filename, ) jobs.append(x) # Run pass 1. commands = [] for j in jobs: x = os.path.join(out_path, j.pass1_out_prefix) cmd = alignlib.make_STAR_command(ref.path, x, num_cores, is_stranded, j.pair1, j.pair2, j.log1_filename) # For debugging. If this file already exists, skip it. if not filelib.exists_nz(j.pass1_bam_filename): parallel.sshell(cmd, path=out_path) filelib.assert_exists_nz(j.pass1_bam_filename) commands.append(cmd) if two_pass: # Make a new index with the splice junction information. sj_index = os.path.join(out_path, "genome.2pass") x = [x.sjdb_filename for x in jobs] filelib.assert_exists_nz_many(x) x = alignlib.make_STAR_index_command(ref.fasta_file_full, sj_index, sjdb_files=x, num_cores=num_cores) x = "%s >& genome.2pass.log" % x commands.append(x) # For debugging. If this file already exists, skip it. if not filelib.exists_nz("genome.2pass.log"): parallel.sshell(x, path=out_path) alignlib.assert_is_STAR_reference(sj_index) # Run pass 2. for j in jobs: # For debugging. If this file already exists, skip it. 
if os.path.exists(j.pass2_bam_filename): continue if two_pass: x = os.path.join(out_path, j.pass2_out_prefix) cmd = alignlib.make_STAR_command(sj_index, x, num_cores, is_stranded, j.pair1, j.pair2, j.log2_filename) parallel.sshell(cmd, path=out_path) commands.append(cmd) else: # link pass1_bam_filename to pass2_bam_filename os.symlink(j.pass1_bam_filename, j.pass2_bam_filename) continue filelib.assert_exists_nz(j.pass2_bam_filename) metadata["commands"] = commands metadata["num_cores"] = num_cores # STAR takes 28 Gb per process. Make sure we don't use up # more memory than is available on the machine. # Defaults: # --limitGenomeGenerateRAM 31000000000 # --outFilterMismatchNmax 10 Num mismatches. #nc = mlib.calc_max_procs_from_ram(50, buffer=100, upper_max=num_cores) #metadata["num_cores"] = nc #parallel.pshell(commands, max_procs=nc, path=out_path) # Make sure the analysis completed successfully. #x = [x[-2] for x in jobs] # sam_filename #filelib.assert_exists_nz_many(x) return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import config from genomicode import parallel from genomicode import alignlib from genomicode import filelib from Betsy import module_utils bam_node, ref_node, pos_node = antecedents bam_filenames = module_utils.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # Positions file has 0-based coordinates (like BAM files). # But samtools requires 1-based coordinates. Convert to # 1-based coordinates. positions_filename = "positions.txt" outhandle = open(positions_filename, 'w') for x in filelib.read_cols(pos_node.identifier): assert len(x) == 2 chrom, pos = x pos = int(pos) + 1 # convert from 0- to 1-based coords. x = chrom, pos print >> outhandle, "\t".join(map(str, x)) outhandle.close() # list of (in_filename, err_filename, out_filename) jobs = [] for in_filename in bam_filenames: p, f = os.path.split(in_filename) sample, ext = os.path.splitext(f) err_filename = os.path.join(out_path, "%s.log" % sample) out_filename = os.path.join(out_path, "%s.pileup" % sample) x = filelib.GenericObject(in_filename=in_filename, err_filename=err_filename, out_filename=out_filename) jobs.append(x) ## Get possible positions file. #positions_filename = module_utils.get_user_option( # user_options, "positions_file", check_file=True) # Figure out whether the purpose is to get coverage. Change # the parameters if it is. assert "vartype" in out_attributes vartype = out_attributes["vartype"] assert vartype in ["all", "snp", "indel", "consensus"] #if cov == "yes": # assert positions_filename, "Missing: positions_file" # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \ # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \ # $i > $j" samtools = filelib.which_assert(config.samtools) # Get an error if the BAM files are not indexed. # [W::bam_hdr_read] EOF marker is absent. The input is probably # truncated. #if vartype == "consensus": # args = [ # "-R", # Ignore read group tags. # "-B", # Disable BAQ (base quality) computation. # "-q", 0, # Skip bases with mapQ smaller than this. # "-Q", 0, # Skip bases with BAQ smaller than this. # "-d10000000", # Allow deep reads. # ] #else: # raise NotImplementedError args = [ "-R", # Ignore read group tags. "-B", # Disable BAQ (base quality) computation. "-q", 0, # Skip bases with mapQ smaller than this. "-Q", 0, # Skip bases with BAQ smaller than this. "-d10000000", # Allow deep reads. ] sq = parallel.quote commands = [] for j in jobs: x = [ sq(samtools), "mpileup", "-f", sq(ref.fasta_file_full), ] if positions_filename: x.extend(["-l", positions_filename]) x.extend(args) x.append(sq(j.in_filename)) x = " ".join(map(str, x)) x = "%s 2> %s 1> %s" % (x, j.err_filename, j.out_filename) commands.append(x) #for x in commands: # print x parallel.pshell(commands, max_procs=num_cores) metadata["commands"] = commands # File may be empty if there are no reads. x = [x.out_filename for x in jobs] filelib.assert_exists_many(x) # Make sure there's no errors in the log files. for j in jobs: check_log_file(j.err_filename) return metadata
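# Standalone sketch of the coordinate conversion above: the positions file
# uses 0-based coordinates, while samtools mpileup -l expects 1-based ones.
def convert_positions_to_1based(in_file, out_file):
    with open(in_file) as inh, open(out_file, "w") as outh:
        for line in inh:
            cols = line.rstrip("\r\n").split("\t")
            assert len(cols) == 2
            chrom, pos = cols
            outh.write("%s\t%d\n" % (chrom, int(pos) + 1))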
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import alignlib from genomicode import parallel from genomicode import hashlib from Betsy import module_utils as mlib fastq_node, sample_node, strand_node, reference_node = antecedents fastq_files = mlib.find_merged_fastq_files(sample_node.identifier, fastq_node.identifier) assert fastq_files, "I could not find any FASTQ files." ref = alignlib.create_reference_genome(reference_node.identifier) stranded = mlib.read_stranded(strand_node.identifier) filelib.safe_mkdir(out_path) metadata = {} metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version() # Figure out whether to align to genome or transcriptome. x = out_attributes["align_to"] assert x in ["genome", "transcriptome"] align_to_genome = (x == "genome") # RSEM makes files: # <sample_name>.genome.bam # <sample_name>.transcript.bam # <sample_name>.genes.results # <sample_name>.isoforms.results # <sample_name>.stat # # Does not work right if there is a space in the sample name. # Therefore, give a hashed sample name, and then re-name # later. # Make a list of the jobs to run. jobs = [] for x in fastq_files: sample, pair1, pair2 = x sample_h = hashlib.hash_var(sample) x1, x2, x3 = mlib.splitpath(pair1) x = "%s%s" % (hashlib.hash_var(x2), x3) pair1_h = os.path.join(out_path, x) if pair2: x1, x2, x3 = mlib.splitpath(pair2) x = "%s%s" % (hashlib.hash_var(x2), x3) pair2_h = os.path.join(out_path, x) results_filename = os.path.join(out_path, "%s.genes.results" % sample) log_filename = os.path.join(out_path, "%s.log" % sample) x = filelib.GenericObject(sample=sample, sample_h=sample_h, pair1=pair1, pair2=pair2, pair1_h=pair1_h, pair2_h=pair2_h, results_filename=results_filename, log_filename=log_filename) jobs.append(x) # Make sure hashed samples are unique. seen = {} for j in jobs: assert j.sample_h not in seen, \ "Dup (%d): %s" % (len(jobs), j.sample_h) assert j.pair1_h not in seen assert j.pair2_h not in seen seen[j.sample_h] = 1 seen[j.pair1_h] = 1 seen[j.pair2_h] = 1 # Symlink the fastq files. for j in jobs: os.symlink(j.pair1, j.pair1_h) if j.pair2: os.symlink(j.pair2, j.pair2_h) s2fprob = { "unstranded": None, "firststrand": 0.0, "secondstrand": 1.0, } assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \ stranded.stranded forward_prob = s2fprob[stranded.stranded] # How much memory for bowtie. May need to increase this if # there are lots of memory warnings in the log files: # Warning: Exhausted best-first chunk memory for read # ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1 # (patid 2076693); skipping read # Default is 64. # Seems like too high a value can cause problems. #chunkmbs = 4*1024 # Generates warnings. chunkmbs = 512 # Get lots of warnings with bowtie: # Warning: Detected a read pair whose two mates have different names # Use STAR aligner instead. use_STAR = True sq = parallel.quote commands = [] for j in jobs: # Debug: If the results file exists, don't run it again. if filelib.exists_nz(j.results_filename) and \ filelib.exists(j.log_filename): continue # If using the STAR aligner, then most memory efficient # way is to let STAR take care of the multiprocessing. 
nc = max(1, num_cores / len(jobs)) if use_STAR: nc = num_cores keywds = {} if use_STAR: keywds["align_with_star"] = True else: keywds["align_with_bowtie2"] = True x = alignlib.make_rsem_command(ref.fasta_file_full, j.sample_h, j.pair1_h, fastq_file2=j.pair2_h, forward_prob=forward_prob, output_genome_bam=align_to_genome, bowtie_chunkmbs=chunkmbs, num_threads=nc, **keywds) x = "%s >& %s" % (x, sq(j.log_filename)) commands.append(x) metadata["commands"] = commands metadata["num cores"] = num_cores # Need to run in out_path. Otherwise, files will be everywhere. nc = num_cores if use_STAR: nc = 1 parallel.pshell(commands, max_procs=nc, path=out_path) # Rename the hashed sample names back to the original unhashed # ones. files = os.listdir(out_path) rename_files = [] # list of (src, dst) for j in jobs: if j.sample == j.sample_h: continue for f in files: if not f.startswith(j.sample_h): continue src = os.path.join(out_path, f) x = j.sample + f[len(j.sample_h):] dst = os.path.join(out_path, x) rename_files.append((src, dst)) for src, dst in rename_files: filelib.assert_exists(src) os.rename(src, dst) # Delete the symlinked fastq files. for j in jobs: filelib.safe_unlink(j.pair1_h) filelib.safe_unlink(j.pair2_h) # Make sure the analysis completed successfully. x1 = [x.results_filename for x in jobs] x2 = [x.log_filename for x in jobs] filelib.assert_exists_nz_many(x1 + x2) return metadata
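# Standalone sketch of the rename step above: RSEM output was generated under
# a hashed (space-free) sample name and is renamed back to the original name.
# The helper name is hypothetical.
import os

def rename_hashed_outputs(out_path, sample, sample_h):
    if sample == sample_h:
        return
    for f in os.listdir(out_path):
        if not f.startswith(sample_h):
            continue
        src = os.path.join(out_path, f)
        dst = os.path.join(out_path, sample + f[len(sample_h):])
        os.rename(src, dst)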
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path): import os from genomicode import filelib from genomicode import parallel from genomicode import alignlib from Betsy import module_utils bam_node, ref_node = antecedents bam_filenames = module_utils.find_bam_files(bam_node.identifier) assert bam_filenames, "No .bam files." ref = alignlib.create_reference_genome(ref_node.identifier) filelib.safe_mkdir(out_path) metadata = {} # TODO: Figure out GATK version. ## Figure out whether the user wants SNPs or INDELs. #assert "vartype" in out_attributes #vartype = out_attributes["vartype"] #assert vartype in ["all", "snp", "indel"] jobs = [] for bam_filename in bam_filenames: p, f = os.path.split(bam_filename) sample, ext = os.path.splitext(f) #raw_outfile = os.path.join(out_path, "%s.raw" % sample) vcf_outfile = os.path.join(out_path, "%s.vcf" % sample) log_filename = os.path.join(out_path, "%s.log" % sample) x = filelib.GenericObject(bam_filename=bam_filename, vcf_outfile=vcf_outfile, log_filename=log_filename) jobs.append(x) # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar # -T HaplotypeCaller -R ucsc.hg19.fasta # -dontUseSoftClippedBases -stand_call_conf 20.0 # -stand_emit_conf 20.0 -I $i -o $j # Make a list of commands. commands = [] for j in jobs: # For debugging. If exists, don't do it again. #if filelib.exists_nz(j.raw_outfile): if filelib.exists_nz(j.vcf_outfile): continue x = alignlib.make_GATK_command(T="HaplotypeCaller", R=ref.fasta_file_full, dontUseSoftClippedBases=None, stand_call_conf=20.0, stand_emit_conf=20.0, I=j.bam_filename, o=j.vcf_outfile) x = "%s >& %s" % (x, j.log_filename) commands.append(x) parallel.pshell(commands, max_procs=num_cores) # Filter each of the VCF files. #for j in jobs: # filter_by_vartype(vartype, j.raw_outfile, j.vcf_outfile) #metadata["filter"] = vartype # Make sure the analysis completed successfully. x = [j.vcf_outfile for j in jobs] filelib.assert_exists_nz_many(x) return metadata
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    import os
    import shutil
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import alignlib
    from Betsy import module_utils as mlib

    bam_filenames = mlib.find_bam_files(in_data.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    metadata["tool"] = "bam2fastx (unknown version)"

    # Somehow bam2fastx doesn't work if there are spaces in the
    # filename.  Make a temporary filename with no spaces, and
    # then rename it later.
    # Actually, may not be bam2fastx's fault.
    jobs = []
    for i, bam_filename in enumerate(bam_filenames):
        p, f, e = mlib.splitpath(bam_filename)
        #bai_filename = alignlib.find_bai_file(bam_filename)
        #assert bai_filename, "Missing index for: %s" % bam_filename
        #temp_bam_filename = "%d.bam" % i
        #temp_bai_filename = "%d.bam.bai" % i
        #temp_fa_filename = "%d.fa" % i
        fa_filename = os.path.join(out_path, "%s.fa" % f)
        x = filelib.GenericObject(
            bam_filename=bam_filename,
            #bai_filename=bai_filename,
            #temp_bam_filename=temp_bam_filename,
            #temp_bai_filename=temp_bai_filename,
            #temp_fa_filename=temp_fa_filename,
            fa_filename=fa_filename)
        jobs.append(x)

    bam2fastx = mlib.findbin("bam2fastx")

    # Link all the bam files.
    #for j in jobs:
    #    assert not os.path.exists(j.temp_bam_filename)
    #    #assert not os.path.exists(j.temp_bai_filename)
    #    os.symlink(j.bam_filename, j.temp_bam_filename)
    #    #os.symlink(j.bai_filename, j.temp_bai_filename)

    commands = []
    for j in jobs:
        # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
        x = [
            mlib.sq(bam2fastx),
            "-A",
            "--fasta",
            #"-o", mlib.sq(j.temp_fa_filename),
            #mlib.sq(j.temp_bam_filename),
            "-o", mlib.sq(j.fa_filename),
            mlib.sq(j.bam_filename),
            ]
        x = " ".join(x)
        commands.append(x)
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores

    parallel.pshell(commands, max_procs=num_cores)

    #for j in jobs:
    #    # Move the temporary files to the final location.
    #    shutil.move(j.temp_fa_filename, j.fa_filename)
    #    # Remove the link to the BAM file.
    #    os.unlink(j.temp_bam_filename)

    x = [j.fa_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    return metadata
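# Sketch of the bam2fastx call above, based on the example in the comments
# ("bam2fastx -A --fasta -o rqc14.fa rqc11.bam").  Filenames are placeholders.
import subprocess

def bam_to_fasta(bam_file, fa_file):
    subprocess.check_call(
        ["bam2fastx", "-A", "--fasta", "-o", fa_file, bam_file])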
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import alignlib
    from Betsy import module_utils as mlib
    import call_somatic_varscan

    bam_node, nc_node, ref_node, interval_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    assert bam_filenames, "No .bam files."
    nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
    ref = alignlib.create_reference_genome(ref_node.identifier)
    filelib.assert_exists_nz(interval_node.identifier)
    filelib.safe_mkdir(out_path)
    metadata = {}
    # TODO: Figure out GATK version.

    # Make sure intervals file ends with:
    # .bed, .list, .picard, .interval_list, or .intervals
    x, x, ext = mlib.splitpath(interval_node.identifier)
    assert ext in [
        ".bed", ".list", ".picard", ".interval_list", ".intervals"]

    cosmic_file = mlib.get_user_option(
        user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
    dbsnp_file = mlib.get_user_option(
        user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

    # sample -> bam filename
    sample2bamfile = mlib.root2filename(bam_filenames)
    # Make sure files exist for all the samples.
    mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

    opj = os.path.join
    jobs = []
    for (normal_sample, cancer_sample) in nc_match:
        normal_bamfile = sample2bamfile[normal_sample]
        cancer_bamfile = sample2bamfile[cancer_sample]
        path, sample, ext = mlib.splitpath(cancer_bamfile)
        vcf_outfile = opj(out_path, "%s.vcf" % sample)
        log_outfile = opj(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            normal_sample=normal_sample,
            cancer_sample=cancer_sample,
            normal_bamfile=normal_bamfile,
            cancer_bamfile=cancer_bamfile,
            vcf_outfile=vcf_outfile,
            log_outfile=log_outfile)
        jobs.append(x)

    # java -jar GenomeAnalysisTK.jar \
    #   -T MuTect2 \
    #   -R reference.fasta \
    #   -I:tumor tumor.bam \
    #   -I:normal normal.bam \
    #   [--dbsnp dbSNP.vcf] \
    #   [--cosmic COSMIC.vcf] \
    #   [-L targets.interval_list] \
    #   -o output.vcf

    # Generate the commands.
    sq = mlib.sq
    commands = []
    for j in jobs:
        UNHASHABLE = [
            ("I:normal", sq(j.normal_bamfile)),
            ("I:tumor", sq(j.cancer_bamfile)),
            # --dbsnp and --cosmic use two dashes, for some reason.
            # Since make_GATK_command only uses one dash, add one
            # manually.
            ("-dbsnp", sq(dbsnp_file)),
            ("-cosmic", sq(cosmic_file)),
            ]
        x = alignlib.make_GATK_command(
            T="MuTect2",
            R=sq(ref.fasta_file_full),
            L=sq(interval_node.identifier),
            o=sq(j.vcf_outfile),
            _UNHASHABLE=UNHASHABLE,
            )
        x = "%s >& %s" % (x, j.log_outfile)
        commands.append(x)
    assert len(commands) == len(jobs)

    nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
    parallel.pshell(commands, max_procs=nc)
    metadata["num_cores"] = nc
    metadata["commands"] = commands

    # Make sure log files have no errors.  Check the log files before
    # the VCF files.  If there's an error, the VCF files may not be
    # created.
    # ##### ERROR -------------------------------------------------------
    # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
    # ##### ERROR
    # ##### ERROR Please visit the wiki to see if this is a known problem
    # ##### ERROR If not, please post the error, with stack trace, to the
    # ##### ERROR Visit our website and forum for extensive documentation
    # ##### ERROR commonly asked questions http://www.broadinstitute.org/
    # ##### ERROR
    # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
    # ##### ERROR -------------------------------------------------------
    for i, j in enumerate(jobs):
        # Pull out the error lines.
        x = [x for x in open(j.log_outfile)]
        x = [x for x in x if x.startswith("##### ERROR")]
        x = "".join(x)
        msg = "MuTect2 error [%s]:\n%s\n%s" % (
            j.cancer_sample, commands[i], x)
        assert not x, msg

    # Make sure output VCF files exist.
    x = [x.vcf_outfile for x in jobs]
    filelib.assert_exists_many(x)

    # MuTect2 names the samples "NORMAL" and "TUMOR".  Replace them
    # with the actual names.
    for j in jobs:
        call_somatic_varscan._fix_normal_cancer_names(
            j.vcf_outfile, j.normal_sample, j.cancer_sample)
    return metadata
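The log check above is also handy on its own when re-running failed MuTect2 jobs by hand. Here is a standalone version of the same idea; the log file name in the usage line is hypothetical.

def check_gatk_log(log_filename, command=None):
    # Pull out the "##### ERROR" lines that GATK writes on failure and
    # raise with the offending command if any are found.
    x = [x for x in open(log_filename) if x.startswith("##### ERROR")]
    if x:
        msg = "GATK error [%s]:\n%s\n%s" % (
            log_filename, command or "", "".join(x))
        raise AssertionError(msg)

if __name__ == "__main__":
    check_gatk_log("sample01.log")   # hypothetical log file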
def run(self, network, antecedents, out_attributes, user_options, num_cores, outfile): import os import arrayio from genomicode import filelib from Betsy import bie3 from Betsy import rulebase from Betsy import read_label_file cls_node, data_node = antecedents M = arrayio.read(data_node.identifier) x = read_label_file.read(cls_node.identifier) a, training_label, second_line = x predict_model = __import__( 'Betsy.modules.' + 'classify_with_random_forest', globals(), locals(), ['classify_with_random_forest'], -2) evaluate_model = __import__('Betsy.modules.' + 'evaluate_prediction', globals(), locals(), ['evaluate_prediction'], -2) full_index = range(M.ncol()) f = file(outfile, 'w') f.write('\t'.join([ 'sample_name', 'Predicted_class', 'Confidence', 'Actual_class', 'Correct?' ])) f.write('\n') for i in range(M.ncol()): # Make filenames # gene expression for N samples. merge_file = 'merge' + '_' + str(i) # class label file for the training samples (samples 1-(N-1)). train_label = 'train_label' + '_' + str(i) # class label file for the test sample (sample N). test_label = 'test_label' + '_' + str(i) # Save the output of the prediction and evaluation. predict_file = "predict.txt" evaluate_file = "evaluate.txt" test_index = i train_index = full_index[:] train_index.remove(test_index) merge_index = train_index + [test_index] y_training = [training_label[x] for x in train_index] y_test = [training_label[test_index]] # Write the files for this iteration. M_merge = M.matrix(None, merge_index) arrayio.gct_format.write(M_merge, open(merge_file, 'w')) read_label_file.write(train_label, second_line, y_training) read_label_file.write(test_label, second_line, y_test[0]) # Make objects to be used in this analysis. x = rulebase.SignalFile.output(format='gct', contents='class0,class1,test') merge_data = bie3.IdentifiedDataNode(x, identifier=merge_file) x = rulebase.ClassLabelFile.output(contents='class0,class1') train_label_data = bie3.IdentifiedDataNode(x, identifier=train_label) x = rulebase.ClassLabelFile.output(contents='test') test_label_data = bie3.IdentifiedDataNode(x, identifier=test_label) # Make a fake object to pass to evaluate_model.run. out_node = filelib.GenericObject() out_node.identifier = predict_file # Run the predictions. x = train_label_data, merge_data predict_model.Module().run(network, x, out_attributes, user_options, num_cores, predict_file) # Run the evaluation. new_parameters = out_attributes.copy() x = test_label_data, out_node evaluate_model.Module().run(network, x, new_parameters, user_options, num_cores, evaluate_file) # Is this the right line? lines = open(evaluate_file).readlines() f.write(lines[1]) os.remove(merge_file) os.remove(train_label) os.remove(test_label) os.remove(predict_file) os.remove(evaluate_file) f.close()
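The index bookkeeping in the loop above is the part that is easiest to get wrong, so here is a small sketch of just the leave-one-out splitting, written in the module's Python 2 style. The function name is made up for illustration.

def leave_one_out_indexes(num_samples):
    # For each sample i, train on every other sample and test on i.
    # The merged matrix is written with the training samples first and
    # the single test sample last, which is the order assumed above.
    full_index = range(num_samples)
    for test_index in full_index:
        train_index = [x for x in full_index if x != test_index]
        merge_index = train_index + [test_index]
        yield train_index, test_index, merge_index

if __name__ == "__main__":
    for train, test, merge in leave_one_out_indexes(4):
        print train, test, merge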
def main(): import os import sys import time import argparse import shutil from genomicode import parselib from genomicode import filelib from Betsy import config from Betsy import rule_engine from Betsy import module_utils as mlib parser = argparse.ArgumentParser() parser.add_argument("-v", "--verbose", default=0, action="count") parser.add_argument("--running", "--run", dest="running", action="store_true", help="Show only running processes.") parser.add_argument("--broken", action="store_true", help="Show only broken processes.") #parser.add_argument( # "--clean_broken", action="store_true", # help="Remove all broken analyses.") parser.add_argument( "--clear_cache", help="Clear out old analyses in the cache. Argument is the " "amount of bytes to be cleared. Examples: 1000, 1Tb, 500G, 1024Mb.") parser.add_argument( "--dry_run", action="store_true", help="Used for --clear_acche. Just show the directories to clear " "rather than actually clearing them.") parser.add_argument( "--ls", action="store_true", help="Show the modules in the BETSY cache, sorted by decreasing " "modification time.") parser.add_argument( "--cd", help="Show a current working directory to a module in the BETSY " "cache. If the argument is a number (e.g. --goto <num>), will set " "the directory to the <num>th most recently created module. " "If the argument is a string, will set the directory to the most " "recently created module whose directory name contains that string.") args = parser.parse_args() args.clean_broken = False output_path = config.CACHE_PATH if not os.path.exists(output_path): return output_path = os.path.realpath(output_path) assert not (args.ls and args.cd) if args.ls: list_directory(output_path) return if args.cd: change_directory(output_path, args.cd) return print "BETSY cache path: %s" % output_path print bytes_to_clear = None if args.clear_cache: bytes_to_clear = parse_clear_cache(args.clear_cache) #print "Clearing %d bytes" % bytes_to_clear # GenericObject with path (full path), size, status, last_accessed. path_info = [] # Don't sort. Just print as it comes out for speed. for x in os.listdir(output_path): if x.startswith("tmp"): # OBSOLETE continue x = os.path.join(output_path, x) if not os.path.isdir(x): continue path = x p, f = os.path.split(path) # index_bam_folder__B006__617b92ee4d313bcd0148b1ab6a91b12f x = f.split("__") if len(x) != 3: print "Unrecognized path: %s" % path continue module_name, version, hash_ = x # Format the directory size. size = mlib.get_dirsize(path) # See if this module is still running. f = os.path.join(path, rule_engine.IN_PROGRESS_FILE) IN_PROGRESS = os.path.exists(f) if args.running and not IN_PROGRESS: continue # Read the parameter file. params = {} x = os.path.join(path, rule_engine.BETSY_PARAMETER_FILE) if os.path.exists(x): params = rule_engine._read_parameter_file(x) assert params.get("module_name", module_name) == module_name # Figure out the state of this module. status = None start_time = None if params: status = S_DONE start_time = params.get("start_time") assert start_time, "Missing: start_time" time_ = time.strptime(start_time, rule_engine.TIME_FMT) #time_str = time.strftime("%a %m/%d %I:%M %p", start_time) run_time = params.get("elapsed_pretty") if not run_time: run_time = "unknown" #assert run_time, "Missing elapsed_pretty: %s" % path #if run_time == "instant": # x = "ran instantly" #else: # x = "took %s" % run_time elif IN_PROGRESS: status = S_RUNNING # Get time that path was created. 
time_ = time.localtime(os.path.getctime(path)) run_time = None #time_str = time.strftime("%a %m/%d %I:%M %p", x) else: # Get time that path was created. status = S_BROKEN time_ = time.localtime(os.path.getctime(path)) #time_ = time.localtime(create_time) #time_ = time.strftime("%a %m/%d %I:%M %p", x) run_time = None if args.broken and status != S_BROKEN: continue # Figure out the last accessed time. last_accessed = None # seconds since epoch x = os.path.join(path, rule_engine.LAST_ACCESSED_FILE) if os.path.exists(x): last_accessed = os.path.getmtime(x) # If I can't find the LAST_ACCESSED_FILE, then use the # parameters file. x = os.path.join(path, rule_engine.BETSY_PARAMETER_FILE) if not last_accessed and os.path.exists(x): last_accessed = os.path.getmtime(x) # Otherwise, use the path time. if not last_accessed: last_accessed = os.path.getmtime(path) # Update sizes. x = filelib.GenericObject(module_name=module_name, path=path, time_=time_, size=size, status=status, last_accessed=last_accessed, hash_=hash_, run_time=run_time) path_info.append(x) # Print out the time stamp and state. if not args.clear_cache: x = format_module_summary(x) parselib.print_split(x, prefixn=2) if status == S_DONE and args.verbose >= 1: # Print out the has stuff. hash_lines = [] for name, value in params["hash"]: x = "%s=%s" % (name, value) hash_lines.append(x) if hash_lines: print " HASH:" for x in hash_lines: parselib.print_split(x, prefix1=4, prefixn=6) if status == S_RUNNING and args.verbose >= 1: # Print out the files in the directory. for x in os.walk(path): dirpath, dirnames, filenames = x filenames = [os.path.join(dirpath, x) for x in filenames] all_files = [] # tuple of (mod time, relative_file, filename) for filename in filenames: file_ = os.path.relpath(filename, path) if file_ == rule_engine.IN_PROGRESS_FILE: continue mtime = os.path.getmtime(filename) all_files.append((mtime, file_, filename)) # Sort by decreasing modification time. schwartz = [(-x[0], x) for x in all_files] schwartz.sort() all_files = [x[-1] for x in schwartz] for (mtime, relfile, filename) in all_files: x = time.localtime(mtime) mtime = time.strftime("%a %m/%d %I:%M %p", x) x = os.path.getsize(filename) size = parselib.pretty_filesize(x) x = "[%s] %s (%s)" % (mtime, relfile, size) parselib.print_split(x, prefix1=2, prefixn=4) # Print out the metadata. metadata = params.get("metadata", {}) if args.verbose >= 1: for key, value in metadata.iteritems(): if key in ["commands"]: continue x = "%s: %s" % (key.upper(), value) parselib.print_split(x, prefix1=2, prefixn=4) if args.verbose >= 2: for x in metadata.get("commands", []): x = "COMMAND: %s" % x parselib.print_split(x, prefix1=2, prefixn=4) #print " %s" % x if status == S_BROKEN and args.clean_broken: shutil.rmtree(path) sys.stdout.flush() # Figure out which paths to delete. if args.clear_cache: assert bytes_to_clear # Figure out which paths symlink into other paths. real2links = {} # real path -> list of symlinks that point to it for p in path_info: # Make a list of all the files under this path. all_filenames = [] for x in os.walk(p.path): dirpath, dirnames, files = x x = [os.path.join(dirpath, x) for x in files] all_filenames.extend(x) # Follow the symlinks. all_filenames = [x for x in all_filenames if os.path.islink(x)] all_filenames = [os.path.realpath(x) for x in all_filenames] # Look at whether any of these files are in other paths. 
for filename in all_filenames: for x in path_info: if x == p: continue if not filename.startswith(x.path): continue if x.path not in real2links: real2links[x.path] = [] if p.path not in real2links[x.path]: real2links[x.path].append(p.path) # Make a list of the paths that we can't delete. # Don't delete any path that is running. cant_delete = [x for x in path_info if x.status == S_RUNNING] # If we can't delete a path, then we also can't delete any # path with a real file that it symlinks into (because then # this path would be broken). for real_path, linked_paths in real2links.iteritems(): if real_path in cant_delete: continue p = [x for x in linked_paths if x in cant_delete] if p: cant_delete.append(real_path) # Sort the paths by priority. x = path_info x = [x for x in x if x not in cant_delete] schwartz = [(get_clear_priority(x), x) for x in x] schwartz.sort() x = [x[-1] for x in schwartz] prioritized = x # Add up the sizes until I reach the desired output. to_delete = [] num_bytes = 0 for i in range(len(prioritized)): if num_bytes >= bytes_to_clear: break to_delete.append(prioritized[i]) num_bytes += prioritized[i].size # Delete the directories. paths_to_delete = [] for info in to_delete: x = format_module_summary(info) parselib.print_split(x, prefixn=2) if not args.dry_run: shutil.rmtree(info.path) i = path_info.index(info) path_info.pop(i) # Also delete an path with symlinks into here. x = real2links.get(info.path, []) paths_to_delete.extend(x) # Delete any of the extra paths (from symlinks). for path in paths_to_delete: found = False for i in range(len(path_info)): if path_info[i].path == path: found = True break # If already deleted, then ignore. if not found: continue if not args.dry_run: shutil.rmtree(path_info[i].path) path_info.pop(i) # BUG: Does not account for size in tmp directories. x = [x.size for x in path_info] total_size = sum(x) x = parselib.pretty_filesize(total_size) print "Used: %s" % x x = os.statvfs(output_path) free_size = x.f_bavail * x.f_frsize x = parselib.pretty_filesize(free_size) print "Free: %s" % x
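parse_clear_cache is defined elsewhere; as a rough guide to what the --clear_cache argument looks like, here is one way such a size string could be turned into a byte count. This is only a sketch under the assumption that the suffixes behave like the examples in the help text ("1000", "1Tb", "500G", "1024Mb"); the real parser may differ.

def parse_size(size_str):
    # Interpret a size like "500G" or "1024Mb" as a number of bytes.
    multipliers = {
        "": 1, "k": 1024, "m": 1024**2, "g": 1024**3, "t": 1024**4}
    x = size_str.strip().lower()
    # Strip a trailing "b" ("Mb", "Gb", ...), then the unit letter.
    if x.endswith("b"):
        x = x[:-1]
    unit = ""
    if x and x[-1] in multipliers:
        unit = x[-1]
        x = x[:-1]
    assert x, "Unparseable size: %s" % size_str
    return int(float(x) * multipliers[unit])

if __name__ == "__main__":
    for x in ["1000", "1Tb", "500G", "1024Mb"]:
        print x, parse_size(x)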