def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run spp (pyspp) peak calling on a treatment/control pair of BAM files.

    Requires both a treatment and a control sample.  Writes the spp
    output files plus a <experiment_name>.log file into out_path and
    verifies that they exist and are non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Both a treatment and a control sample are required here.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)

    # Build the experiment name from the sanitized sample names.
    experiment_name = "%s_vs_%s" % (
        hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

    # Every requested sample must appear in the sample group file.
    known_samples = [x[1] for x in sample_groups]
    for sample in [treat_sample, control_sample]:
        assert sample in known_samples, "Unknown sample: %s" % sample

    # Locate the BAM file for each sample.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    # Run the command, capturing stdout+stderr into the log file.
    log_file = "%s.log" % experiment_name
    x = make_pyspp_command(
        treat_filename, control_filename, out_path, num_procs=num_cores)
    cmd = "%s >& %s" % (x, log_file)
    parallel.sshell(cmd, path=out_path)

    # Verify that the expected output files were generated.
    files = [
        "binding.positions.txt",
        #"broadPeak",
        "crosscorrelation.pdf",
        "density.wig",
        "enrichment.estimates.wig",
        "enrichment.wig",
        #"narrowPeak",   # might be empty if no peaks found
        log_file,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run MACS 1.4 peak calling on a BAM file, with an optional control.

    Writes <name>_peaks.xls and <name>_summits.bed (plus bedgraph
    output) into out_path, where <name> is derived from the sanitized
    sample names, and verifies those files exist and are non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.  BUGFIX: use the sanitized (hashed) treatment name
    # in the "<treat>_vs_<control>" form.  Previously the raw
    # treat_sample was interpolated while the control half was hashed,
    # which was inconsistent and could put unsafe characters into the
    # output file names.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name,
        genome_size=genome_size, shiftsize=shiftsize,
        save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        cmd = [parallel.quote(Rscript), model_file]
        parallel.sshell(cmd, path=out_path)

    # Verify the key output files were generated.
    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run PeakSeq on a treatment/control pair of BAM files.

    Both the treatment and control samples are required: the code
    unconditionally builds "<treat>_vs_<control>" names and asserts on
    the control BAM file, so a missing control can only ever fail --
    now it fails early with a clear message instead of a confusing
    "Missing bam file" assertion downstream.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  BUGFIX: control_sample is required (everything
    # below uses it unconditionally), so demand it up front.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)
    fragment_length = module_utils.get_user_option(
        user_options, "peakseq_fragment_length", not_empty=True, type=int)
    mappability_file = module_utils.get_user_option(
        user_options, "mappability_file", not_empty=True, check_file=True)
    assert fragment_length > 0 and fragment_length < 1000

    # Set the experiment name from the sanitized sample names.
    name1 = hashlib.hash_var(treat_sample)
    name2 = hashlib.hash_var(control_sample)
    experiment_name = "%s_vs_%s" % (name1, name2)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    assert control_sample in samples, "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    # Run PeakSeq, capturing stdout+stderr into the log file.
    cmd = make_peakseq_command(
        treat_filename, control_filename, out_path, experiment_name,
        fragment_length, mappability_file)
    log_file = "%s.log" % experiment_name
    cmd = "%s >& %s" % (cmd, log_file)
    parallel.sshell(cmd, path=out_path)

    # Verify the expected output files were generated.
    files = [
        "config.dat",
        log_file,
        "%s.txt" % experiment_name,
        # Can be length 0, if no peaks found.
        #"%s_narrowPeak.txt" % experiment_name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Merge per-lane fastq files into one fastq per sample (per pair).

    Output files in out_path are named:
      <Sample>.fastq          if single end
      <Sample>_<Pair>.fastq   if paired end
    Returns a metadata dict (records the number of cores used).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils
    # This is I/O heavy, don't use so many cores.
    MAX_CORES = 2
    fastq_node, group_node = antecedents
    fastq_path = fastq_node.identifier
    sample_group_file = group_node.identifier
    filelib.safe_mkdir(out_path)
    metadata = {}
    module_utils.assert_sample_group_file(sample_group_file, fastq_path)
    x = module_utils.read_sample_group_file(group_node.identifier)
    # Resolve the file names in the sample group table against fastq_path.
    x = module_utils.fix_sample_group_filenames(x, fastq_path)
    sample_groups = x

    # For merging, the order of the files in the sample_group_file
    # must be maintained.  Otherwise, will be merged out of order.
    # The new files should be named:
    # <Sample>.fastq          # if single end
    # <Sample>_<Pair>.fastq   # if paired end
    # jobs holds tuples of (in_filename, sample, pair, out_filename).
    jobs = []
    for x in sample_groups:
        in_filename, sample, pair = x
        #in_filename = os.path.join(fastq_path, file_)
        assert os.path.exists(in_filename)
        out_file = "%s.fastq" % sample
        if pair:
            out_file = "%s_%s.fastq" % (sample, pair)
        out_filename = os.path.join(out_path, out_file)
        x = in_filename, sample, pair, out_filename
        jobs.append(x)

    # Group the input files by the output file they merge into.
    # Within each list, input order follows the sample group file.
    out2ins = {}  # out_filename -> list of in_filenames
    for x in jobs:
        in_filename, sample, pair, out_filename = x
        if out_filename not in out2ins:
            out2ins[out_filename] = []
        out2ins[out_filename].append(in_filename)

    # Build one merge task per output file and run them in parallel.
    commands = []
    for out_filename, in_filenames in out2ins.iteritems():
        # Debugging.  Don't merge again if it already exists.
        if os.path.exists(out_filename):
            continue
        args = in_filenames, out_filename
        keywds = {}
        x = merge_or_symlink_files, args, keywds
        commands.append(x)
    commands.sort()
    nc = min(MAX_CORES, num_cores)
    parallel.pyfun(commands, nc)
    metadata["num_cores"] = nc

    # If the files are paired, make sure they are paired correctly.
    sample2outfiles = {}  # sample -> list of out filenames
    for x in jobs:
        in_filename, sample, pair, out_filename = x
        if sample not in sample2outfiles:
            sample2outfiles[sample] = []
        if out_filename not in sample2outfiles[sample]:
            sample2outfiles[sample].append(out_filename)
    commands = []
    all_samples = sorted(sample2outfiles)
    for sample in all_samples:
        out_filenames = sorted(sample2outfiles[sample])
        # A single output file means single-end data; nothing to check.
        if len(out_filenames) == 1:
            continue
        # Make sure they are aligned.
        x = check_fastq_alignment, (sample, out_filenames), {}
        commands.append(x)
    commands.sort()
    # check_fastq_alignment returns a falsy value on success, or an
    # error string on failure; collect and report all failures.
    retvals = parallel.pyfun(commands, nc)
    assert len(retvals) == len(commands)
    errors = [x for x in retvals if x]
    assert not errors, "\n".join(errors)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run HOMER findPeaks on a tag directory, with an optional control.

    Writes <experiment_name>.peaks.txt (and a .log file) into out_path
    and verifies the peak file exists and is non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    tag_node, group_node = antecedents
    tag_path = module_utils.check_inpath(tag_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")

    # Set the experiment name.
    experiment_name = treat_sample
    if control_sample:
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

    # Make sure the samples exist.  BUGFIX: only check the control
    # sample if one was given.  Previously the assertion ran
    # unconditionally, so a run without a control always failed even
    # though the control is optional everywhere else in this module.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the tag directories.
    treat_path = os.path.join(tag_path, treat_sample)
    assert os.path.exists(treat_path)
    if control_sample:
        control_path = os.path.join(tag_path, control_sample)
        assert os.path.exists(control_path)

    # Get the command.
    homer_path = filelib.which_assert(config.homer_path)
    x = os.path.join(homer_path, "bin", "findPeaks")
    assert filelib.exists_nz(x)
    find_peaks = x

    log_file = "%s.log" % experiment_name
    peak_file = "%s.peaks.txt" % experiment_name
    sq = parallel.quote
    cmd = [
        sq(find_peaks),
        sq(treat_path),
        "-style", "factor",
        ]
    if control_sample:
        cmd += ["-i", control_path]
    cmd = " ".join(cmd)
    # findPeaks writes peaks to stdout; send stderr to the log file.
    cmd = "%s 2> %s 1> %s" % (cmd, log_file, peak_file)
    parallel.sshell(cmd, path=out_path)

    x = os.path.join(out_path, peak_file)
    filelib.assert_exists_nz(x)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run MACS2 peak calling on a BAM file, with an optional control.

    Writes <name>_peaks.xls (plus bedgraph output) into out_path,
    where <name> is derived from the sanitized sample names, and
    verifies that file exists and is non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    x = module_utils.get_user_option(
        user_options, "broad_peaks", allowed_values=["no", "yes"])
    broad_peaks = (x == "yes")
    x = module_utils.get_user_option(
        user_options, "macs_paired", allowed_values=["no", "yes"])
    is_paired = (x == "yes")

    # Set the name.  BUGFIX: use the sanitized (hashed) treatment name
    # in the "<treat>_vs_<control>" form.  Previously the raw
    # treat_sample was interpolated while the control half was hashed,
    # which was inconsistent and could put unsafe characters into the
    # output file names.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = run_MACS14.find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs2_command(
        treat_filename, control_filename=control_filename,
        genome_size=genome_size, save_bedgraph_file=True, name=name,
        normalize_read_counts=True, paired=is_paired,
        broad_peak_calling=broad_peaks)
    parallel.sshell(cmd, path=out_path)

    # Verify the key output file was generated.
    files = [
        "%s_peaks.xls" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)