def _run(self, _config, temp):
    """Merge all input tables and publish the combined table.

    Every file in ``self.input_files`` is read into one shared dictionary
    via ``read_table``; the result is written to the output file's
    counterpart inside ``temp`` and then moved to ``self._output_file``.
    """
    merged = {}
    for input_filename in self.input_files:
        read_table(merged, input_filename)

    staged_output = reroot_path(temp, self._output_file)
    _write_table(merged, staged_output)
    move_file(staged_output, self._output_file)
def _teardown(self, config, temp):
    """Delete the staged input (plus its ".bai") and publish the output.

    After the cleanup, the output file is moved from ``temp`` to its final
    location and the base ``Node._teardown`` is invoked.
    """
    staged_input = reroot_path(temp, self._input_file)
    for stale_path in (staged_input, staged_input + ".bai"):
        os.remove(stale_path)

    staged_output = reroot_path(temp, self._output_file)
    move_file(staged_output, self._output_file)

    Node._teardown(self, config, temp)
def _run(self, _config, temp):
    """Merge all input coverage tables and publish the combined table.

    Each file in ``self.input_files`` is accumulated into a single
    dictionary with ``coverage.read_table``; the result is written inside
    ``temp`` and then moved to ``self._output_file``.
    """
    merged = {}
    for input_filename in self.input_files:
        coverage.read_table(merged, input_filename)

    staged_output = reroot_path(temp, self._output_file)
    coverage.write_table(merged, staged_output)
    move_file(staged_output, self._output_file)
def _run(self, _config, temp):
    """Collect per-readgroup tables from the staged BAM and write them out.

    The BAM file is opened from its location inside ``temp``; intervals and
    readgroups are derived from it, the tables are filled by
    ``read_records``, filtered by ``filter_readgroups`` and finally written
    to the output file's counterpart inside ``temp``.
    """
    staged_bam = reroot_path(temp, self._input_file)
    staged_output = reroot_path(temp, self._output_file)

    with pysam.Samfile(staged_bam) as handle:
        intervals = self._get_intervals(handle,
                                        self._intervals_file,
                                        self._max_contigs)
        readgroups = self._get_readgroups(handle)
        tables, mapping = self._initialize_tables(self._target_name,
                                                  intervals,
                                                  readgroups)
        self.read_records(handle, intervals, mapping)

    filtered = self.filter_readgroups(tables)
    _write_table(filtered, staged_output)
def _teardown(self, config, temp):
    """Promote the staged alignment/partition files to their output names.

    For each of "ALIGNMENT" and "PARTITION" the files form a chain
    (temporary input -> ".reduced" file -> output), all located inside
    ``temp``. Walking the chain pairwise, a file is copied onto its
    successor only when the successor does not exist yet, and is removed
    afterwards either way.
    """
    for postfix in ("ALIGNMENT", "PARTITION"):
        names = (self._kwargs["TEMP_IN_" + postfix],
                 self._kwargs["TEMP_IN_" + postfix] + ".reduced",
                 self._kwargs["OUT_" + postfix])
        chain = [fileutils.reroot_path(temp, name) for name in names]

        for (current, successor) in zip(chain[:-1], chain[1:]):
            if not os.path.exists(successor):
                fileutils.copy_file(current, successor)
            os.remove(current)

    CommandNode._teardown(self, config, temp)
def _run(self, config, temp):
    """Run the command, tolerating failures caused by unresolved sequences.

    If the command fails with a NodeError, the captured "template.stdout"
    is inspected. Unless it contains the "no resolved nucleotides" message
    the error is re-raised; otherwise a warning is appended to the
    ".codeml" output and echoed on stderr.
    """
    try:
        CommandNode._run(self, config, temp)
    except NodeError as error:
        # Allow failures due to low coverage
        stdout_path = fileutils.reroot_path(temp, "template.stdout")
        with open(stdout_path) as handle:
            captured = handle.read()

        marker = "sequences do not have any resolved nucleotides. Giving up."
        if marker not in captured:
            raise error

        codeml_path = fileutils.reroot_path(temp,
                                            self._output_prefix + ".codeml")
        with open(codeml_path, "a") as handle:
            handle.write("\nWARNING: No resolved nucleotides found, could not process gene.\n")

        import sys
        sys.stderr.write("WARNING: No resolved nucleotides in " + self._output_prefix + "\n")
def _teardown(self, config, temp):
    """Check that the alignment written by MAFFT can be parsed as an MSA.

    Raises:
        NodeError: if ``MSA.from_file`` rejects the staged output file.
    """
    # NOTE(review): no call to a parent _teardown is visible in this block;
    # confirm that is intended.
    staged_output = reroot_path(temp, self._output_file)
    try:
        MSA.from_file(staged_output)
    except MSAError as error:
        message = "Invalid MSA produced by MAFFT:\n%s" % (error,)
        raise NodeError(message)
def _check_output_files(cls, output_files):
    """Yield an error message for every output file created by several nodes.

    ``output_files`` maps filenames to the nodes producing them. The
    directory part of each path is resolved with ``os.path.realpath`` so
    that the same file reached via different paths (relative/absolute
    paths, symbolic links) is detected as a clash. Since output files are
    replaced rather than modified in place, it is not necessary to compare
    the files themselves.
    """
    resolved_dirs = {}
    producers = {}
    for (filename, nodes) in output_files.iteritems():
        dirpath = os.path.dirname(filename)
        try:
            real_dirpath = resolved_dirs[dirpath]
        except KeyError:
            # Resolve every distinct directory only once
            real_dirpath = resolved_dirs[dirpath] = os.path.realpath(dirpath)

        real_filename = reroot_path(real_dirpath, filename)
        producers.setdefault(real_filename, []).extend(nodes)

    for (filename, nodes) in producers.iteritems():
        if len(nodes) <= 1:
            continue

        summary = _summarize_nodes(nodes)
        yield ("Multiple nodes create the same (clobber) output-file:"
               "\n\tFilename: %s\n\tNodes: %s"
               % (filename, "\n\t ".join(summary)))
def _run(self, _config, temp):
    """Assemble the argument list in ``self.inputs`` and run ``gccorrect.main``.

    The list starts with the BAM path and the destination inside ``temp``,
    is extended by ``self._add_options('GCcorrect')``, and ends with the
    read-length and half-resolution options.
    """
    bam_path = self.d_bam.baminfo["BamPath"]
    self.inputs = [bam_path, reroot_path(temp, self.dest)]
    self._add_options('GCcorrect')

    read_length = str(self.rl)
    self.inputs.extend(["--ReadLength", read_length])
    half_resolution = str(self.halfresolution)
    self.inputs.extend(["--HalfResolution", half_resolution])

    gccorrect.main(self.inputs)
def _run(self, _config, temp):
    """Build the argument list and hand it to ``merge_datafiles.main``.

    The arguments are the analysis name, the destination inside ``temp``
    and the input files; '--merge' is appended for the 'Phasogram'
    analysis.
    """
    staged_dest = reroot_path(temp, self.dest)
    arguments = [self.anal, staged_dest] + self.infiles
    assert len(arguments) > 2, 'Need at least one output and one input'

    needs_merge = (self.anal == 'Phasogram')
    if needs_merge:
        arguments.append('--merge')

    merge_datafiles.main(arguments)
def _setup(self, config, temp):
    """See CommandNode._setup.

    Symlinks the absolute path of the input file into ``temp`` before
    delegating to the parent implementation.
    """
    os.symlink(os.path.abspath(self._infile),
               reroot_path(temp, self._infile))

    CommandNode._setup(self, config, temp)
def _teardown(self, config, temp):
    """Drop the RAxML info file and rename the parsimony tree.

    "RAxML_info.output" is deleted from ``temp`` and
    "RAxML_parsimonyTree.output.0" is moved to the output tree's
    counterpart inside ``temp`` before the parent teardown runs.
    """
    info_file = os.path.join(temp, "RAxML_info.output")
    os.remove(info_file)

    fileutils.move_file(os.path.join(temp, "RAxML_parsimonyTree.output.0"),
                        fileutils.reroot_path(temp, self._output_tree))

    CommandNode._teardown(self, config, temp)
def _teardown(self, config, temp):
    """Promote the staged alignment/partition files to their output names.

    For each of "ALIGNMENT" and "PARTITION" the files form a chain
    (temporary input -> ".reduced" file -> output), all located inside
    ``temp``. Walking the chain pairwise, a file is copied onto its
    successor only when the successor does not exist yet, and is removed
    afterwards either way.
    """
    for postfix in ("ALIGNMENT", "PARTITION"):
        names = (self._kwargs["TEMP_IN_" + postfix],
                 self._kwargs["TEMP_IN_" + postfix] + ".reduced",
                 self._kwargs["OUT_" + postfix])
        chain = [fileutils.reroot_path(temp, name) for name in names]

        for (current, successor) in zip(chain[:-1], chain[1:]):
            if not os.path.exists(successor):
                fileutils.copy_file(current, successor)
            os.remove(current)

    CommandNode._teardown(self, config, temp)
def _run(self, _config, temp):
    """Apply every configured singleton filter and publish the alignment.

    The input MSA is passed through ``filter_singletons`` once per entry
    of ``self._filter_by``, written to the output file's counterpart
    inside ``temp``, and then moved to ``self._output_file``.
    """
    staged_output = fileutils.reroot_path(temp, self._output_file)

    filtered = MSA.from_file(self._input_file)
    for (to_filter, groups) in self._filter_by.iteritems():
        filtered = filtered.filter_singletons(to_filter, groups)

    with open(staged_output, "w") as output:
        filtered.to_file(output)

    fileutils.move_file(staged_output, self._output_file)
def _teardown(self, config, temp):
    """Rename each bootstrap result inside ``temp`` to its output name.

    The (source, destination) pairs come from ``self._bootstraps``; sources
    are looked up directly inside ``temp`` while destinations are rerooted
    into it. The parent teardown runs afterwards.
    """
    renames = self._bootstraps(self._output_template,
                               self._bootstrap_num,
                               self._bootstrap_start)

    for (src_name, dst_name) in renames:
        fileutils.move_file(os.path.join(temp, src_name),
                            fileutils.reroot_path(temp, dst_name))

    CommandNode._teardown(self, config, temp)
def _run(self, config, temp):
    """Create the intermediate tables, merge them and write the result.

    Each entry of ``self._tables`` holds a (filename, handle) pair; the
    handle is closed before the file is read back into the merged table,
    which is then written to the output file's counterpart inside ``temp``.
    """
    region_names = self._create_tables(config, temp)

    merged = {}
    for (key, entry) in self._tables.iteritems():
        filename, handle = entry
        handle.close()
        self._read_table(key, merged, filename)

    self._write_table(merged,
                      reroot_path(temp, self._output_file),
                      region_names)
def _run(self, config, temp):
    """Run the command; on failure, enrich the error from captured stdout.

    When the command's return codes equal ``[1, None]`` and the last line
    of "template.stdout" contains "Giving up.", that line is appended to
    the NodeError message. The error is raised in every case.
    """
    try:
        CommandNode._run(self, config, temp)
    except NodeError as error:
        if self._command.join() != [1, None]:
            raise error

        stdout_path = fileutils.reroot_path(temp, "template.stdout")
        with open(stdout_path) as handle:
            lines = handle.readlines()

        if lines and ("Giving up." in lines[-1]):
            raise NodeError("%s\n\n%s" % (error, lines[-1]))
        raise error
def _run(self, _config, temp):
    """Write an interleaved PHYLIP file and the matching partition file.

    Every input MSA is split by its "partition_by" setting (falling back
    to ``self._part_by``), excluded groups are removed, and the parts are
    joined into one alignment written to "<prefix>.phy". The
    "<prefix>.partitions" file lists the column range of every part.
    """
    named_parts = []
    for filename in sorted(self._infiles):
        settings = self._infiles[filename]
        split_by = settings.get("partition_by", self._part_by)
        parts = split_msa(read_msa(filename), split_by)

        for (key, part) in sorted(parts.items()):
            for excluded_group in self._excluded:
                part.pop(excluded_group)

            part_name = "%s_%s" % (self._infiles[filename]["name"], key)
            named_parts.append((part_name, part))

    joined = join_msa(*(part for (_, part) in named_parts))

    phy_path = reroot_path(temp, self._out_prefix + ".phy")
    with open(phy_path, "w") as output:
        output.write(interleaved_phy(joined, add_flag=self._add_flag))

    partitions_path = reroot_path(temp, self._out_prefix + ".partitions")
    with open(partitions_path, "w") as output:
        offset = 0
        for (part_name, part) in named_parts:
            # All sequences in a part are taken to share the length of the
            # first one returned by the dictionary.
            length = len(next(part.itervalues()))
            first, last = offset + 1, offset + length
            output.write("DNA, %s = %i-%i\n" % (part_name, first, last))
            offset = last
def _run(self, _config, temp):
    """Write all input MSAs as sequential PHYLIP blocks to one file.

    The blocks are separated by an empty line and written to the output
    file's counterpart inside ``temp``.
    """
    msas = []
    for filename in sorted(self.input_files):
        msas.append(read_msa(filename))

    # Called only for its checks (that the MSAs share groups); the joined
    # result itself is not used.
    join_msa(*msas)

    blocks = [sequential_phy(msa, add_flag=self._add_flag) for msa in msas]

    staged_output = reroot_path(temp, self._out_phy)
    with open(staged_output, "w") as output:
        output.write("\n\n".join(blocks))
def _run(self, _config, temp):
    """Write all input MSAs as sequential PHYLIP blocks to one file.

    The MSAs are validated together with ``MSA.validate`` first; the
    blocks are separated by an empty line and written to the output
    file's counterpart inside ``temp``.
    """
    msas = []
    for filename in sorted(self.input_files):
        msas.append(MSA.from_file(filename))

    # Check that the MSAs share groups before anything is written
    MSA.validate(*msas)

    blocks = [sequential_phy(msa, add_flag=self._add_flag) for msa in msas]

    staged_output = reroot_path(temp, self._out_phy)
    with open(staged_output, "w") as output:
        output.write("\n\n".join(blocks))
def _run(self, config, temp):
    """Run the command, tolerating failures caused by unresolved sequences.

    If the command fails with a NodeError, the captured "template.stdout"
    is inspected. Unless it contains the "no resolved nucleotides" message
    the error is re-raised; otherwise a warning is appended to the
    ".codeml" output and echoed on stderr.
    """
    try:
        CommandNode._run(self, config, temp)
    except NodeError as error:
        # Allow failures due to low coverage
        stdout_path = fileutils.reroot_path(temp, "template.stdout")
        with open(stdout_path) as handle:
            captured = handle.read()

        marker = "sequences do not have any resolved nucleotides. Giving up."
        if marker not in captured:
            raise error

        codeml_path = fileutils.reroot_path(temp,
                                            self._output_prefix + ".codeml")
        with open(codeml_path, "a") as handle:
            handle.write("\nWARNING: No resolved nucleotides found, could not process gene.\n")

        import sys
        sys.stderr.write("WARNING: No resolved nucleotides in " + self._output_prefix + "\n")
def _run(self, _config, temp):
    """Merge partitioned MSAs into "<prefix>.phy" and "<prefix>.partitions".

    For every named input, each file is split by the input's partitions
    (after dropping excluded groups, if any) and the parts are collected
    per partition key. The "X" partition is discarded, the remaining parts
    are joined per key (and optionally reduced), and the surviving MSAs
    are joined into one interleaved PHYLIP file. The partition file lists
    the column range occupied by every merged MSA.
    """
    named_msas = []
    for (name, files_dd) in sorted(self._infiles.iteritems()):
        partitions = files_dd["partitions"]

        parts_by_key = {}
        for key in partitions:
            parts_by_key[key] = []

        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)

            for (key, msa_part) in msa.split(partitions).iteritems():
                parts_by_key[key].append(msa_part)

        # Parts assigned to the "X" key are never written
        parts_by_key.pop("X", None)

        for (key, msa_parts) in sorted(parts_by_key.iteritems()):
            merged = MSA.join(*msa_parts)
            if self._reduce:
                merged = merged.reduce()

            if merged is None:
                continue
            named_msas.append(("%s_%s" % (name, key), merged))

    phy_path = reroot_path(temp, self._out_prefix + ".phy")
    with open(phy_path, "w") as output_phy:
        final_msa = MSA.join(*(msa for (_, msa) in named_msas))
        output_phy.write(interleaved_phy(final_msa))

    offset = 0
    parts_path = reroot_path(temp, self._out_prefix + ".partitions")
    with open(parts_path, "w") as output_part:
        for (name, msa) in named_msas:
            first = offset + 1
            offset += msa.seqlen()
            output_part.write("DNA, %s = %i-%i\n" % (name, first, offset))
def __init__(self, description, destination, source_nodes):
    """Set up a node mirroring the outputs of ``source_nodes``.

    The node's input files are the output files of all source nodes; its
    output files are the same files rerooted into ``destination``. The
    (input, output) pairs are kept in ``self._files``.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = [fpath
                   for source_node in source_nodes
                   for fpath in source_node.output_files]

    output_files = []
    for fpath in input_files:
        output_files.append(reroot_path(destination, fpath))

    self._files = zip(input_files, output_files)

    label = "<Copy %s output to %r>" % (description, destination)
    Node.__init__(self,
                  description = label,
                  input_files = input_files,
                  output_files = output_files,
                  dependencies = source_nodes)
def __init__(self, input_files, destination, filter_by, dependencies=()):
    """Create one FilterSingletonsNode per input file.

    ``input_files`` maps each filename to the node it depends on; every
    output file is the input file rerooted into ``destination``. All
    sub-nodes share one dict copy of ``filter_by``.
    """
    shared_filter = dict(filter_by)

    subnodes = []
    for (filename, node) in input_files.iteritems():
        subnode = FilterSingletonsNode(
            input_file=filename,
            output_file=fileutils.reroot_path(destination, filename),
            filter_by=shared_filter,
            dependencies=node)
        subnodes.append(subnode)

    label = "<FilterSingleton: %i files -> '%s'>" % (len(subnodes),
                                                      destination)
    MetaNode.__init__(self,
                      description=label,
                      subnodes=subnodes,
                      dependencies=dependencies)
def _run(self, _config, temp):
    """Merge partitioned MSAs into "<prefix>.phy" and "<prefix>.partitions".

    For every named input, each file is split by the input's partitions
    (after dropping excluded groups, if any) and the parts are collected
    per partition key. The "X" partition is discarded, the remaining parts
    are joined per key (and optionally reduced), and the surviving MSAs
    are joined into one interleaved PHYLIP file. The partition file lists
    the column range occupied by every merged MSA.
    """
    named_msas = []
    for (name, files_dd) in sorted(self._infiles.iteritems()):
        partitions = files_dd["partitions"]

        parts_by_key = {}
        for key in partitions:
            parts_by_key[key] = []

        for filename in files_dd["filenames"]:
            msa = MSA.from_file(filename)
            if self._excluded:
                msa = msa.exclude(self._excluded)

            for (key, msa_part) in msa.split(partitions).iteritems():
                parts_by_key[key].append(msa_part)

        # Parts assigned to the "X" key are never written
        parts_by_key.pop("X", None)

        for (key, msa_parts) in sorted(parts_by_key.iteritems()):
            merged = MSA.join(*msa_parts)
            if self._reduce:
                merged = merged.reduce()

            if merged is None:
                continue
            named_msas.append(("%s_%s" % (name, key), merged))

    phy_path = reroot_path(temp, self._out_prefix + ".phy")
    with open(phy_path, "w") as output_phy:
        final_msa = MSA.join(*(msa for (_, msa) in named_msas))
        output_phy.write(interleaved_phy(final_msa))

    offset = 0
    parts_path = reroot_path(temp, self._out_prefix + ".partitions")
    with open(parts_path, "w") as output_part:
        for (name, msa) in named_msas:
            first = offset + 1
            offset += msa.seqlen()
            output_part.write("DNA, %s = %i-%i\n" % (name, first, offset))
def _run(self, _config, temp):
    """Mask singleton nucleotides in the filtered sequences with 'n'.

    For every (to_filter, groups) pair, a position in the ``to_filter``
    sequence is replaced by 'n' when its character is not one of "N", "n"
    or "-" and occurs exactly once among the group sequences at that
    column. The result is written inside ``temp`` and moved into place.
    """
    alignment = msa.read_msa(self._input_file)

    for (to_filter, groups) in self._filter_by.iteritems():
        columns = zip(*[alignment[group] for group in groups])
        masked = list(alignment[to_filter])

        for (index, column) in enumerate(columns):
            nucleotide = masked[index]
            if nucleotide in "Nn-":
                continue
            if column.count(nucleotide) == 1:
                masked[index] = 'n'

        alignment[to_filter] = "".join(masked)

    staged_output = fileutils.reroot_path(temp, self._output_file)
    msa.write_msa(alignment, staged_output)
    fileutils.move_file(staged_output, self._output_file)
def _run(self, _config, temp):
    """Write the input MSA as a sequence file with a count/length header.

    Excluded groups are removed first. The header holds the number of
    sequences and the length of one of them; every sequence follows,
    sorted by name, upper-cased, in rows of 60 characters split into
    blocks of 3.
    """
    alignment = read_msa(self._input_file)
    for excluded_group in self._excluded:
        alignment.pop(excluded_group)

    seq_length = len(next(alignment.itervalues()))
    rows = [" %i %i" % (len(alignment), seq_length)]

    for (name, seq) in sorted(alignment.iteritems()):
        rows.extend(("", name))
        rows.extend(" ".join(fragment(3, chunk))
                    for chunk in fragment(60, seq.upper()))

    staged_output = fileutils.reroot_path(temp, self._output_file)
    with open(staged_output, "w") as output:
        output.write("\n".join(rows))
def __init__(self, description, destination, source_nodes):
    """Set up a node mirroring the outputs of ``source_nodes``.

    The node's input files are the output files of all source nodes; its
    output files are the same files rerooted into ``destination``. The
    (input, output) pairs are kept in ``self._files``.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = [fpath
                   for source_node in source_nodes
                   for fpath in source_node.output_files]

    output_files = []
    for fpath in input_files:
        output_files.append(reroot_path(destination, fpath))

    self._files = zip(input_files, output_files)

    label = "<Copy %s output to %r>" % (description, destination)
    Node.__init__(self,
                  description=label,
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def _teardown(self, config, temp):
    """Publish the output table and remove the intermediate files.

    The output is moved out of ``temp`` first; then the pipe files, the
    per-table files and (if present) "intervals.bed" are deleted, the
    "cat" processes are committed, and the captured stdout file is removed
    unless ``self._print_stats`` is set.
    """
    move_file(reroot_path(temp, self._output_file), self._output_file)

    for pipe_filename in self._pipes.itervalues():
        os.remove(pipe_filename)

    for table_entry in self._tables.itervalues():
        (table_filename, _) = table_entry
        os.remove(table_filename)

    intervals_path = os.path.join(temp, "intervals.bed")
    if os.path.exists(intervals_path):
        os.remove(intervals_path)

    for proc in self._procs.get("cat", ()):
        proc.commit(temp)

    if not self._print_stats:
        stdout_name = "pipe_coverage_%i.stdout" % id(self)
        os.remove(os.path.join(temp, stdout_name))
def _run(self, _config, temp):
    """Write a bootstrapped copy of the input PHYLIP alignment.

    Partitions and sequences are read from ``self._input_part`` and
    ``self._input_phy``; ``self._bootstrap_sequences`` produces the
    resampled fragments for every sequence, which are written after the
    original header (one "name fragments..." line per sequence) to a file
    inside ``temp`` that is finally moved to ``self._output_phy``.
    """
    # Always create the generator: previously 'rng' was only bound when a
    # seed was given, so unseeded runs failed when 'rng' was used below.
    # A seed of None makes random.Random seed itself from system sources.
    rng = random.Random(self._seed)

    partitions = _read_partitions(self._input_part)
    header, names, sequences = _read_sequences(self._input_phy)
    bootstraps = self._bootstrap_sequences(sequences, partitions, rng)

    temp_fpath = reroot_path(temp, self._output_phy)
    with open(temp_fpath, "w") as output_phy:
        output_phy.write(header)

        for (name, fragments) in zip(names, bootstraps):
            output_phy.write(name)
            output_phy.write(" ")
            for sequence in fragments:
                output_phy.write(sequence)
            output_phy.write("\n")

    move_file(temp_fpath, self._output_phy)
def _run(self, _config, temp):
    """Write the partition file describing every input MSA's column range.

    The MSAs are laid out back to back; for each (group, offsets) pair of
    a file, either one plain "start-end" range (when there are exactly 3
    offsets) or one "start+offset-end\\3" range per offset is recorded
    under the name "<file name>_<group>". Entries are written sorted by
    name.
    """
    last = 0
    ranges_by_name = collections.defaultdict(list)

    for (filename, msa) in _read_sequences(self._infiles):
        first = last + 1
        last = last + msa.seqlen()

        for (group, offsets) in self._get_partition_by(filename):
            if len(offsets) == 3:
                ranges = ["%i-%i" % (first, last)]
            else:
                ranges = []
                for offset in offsets:
                    ranges.append("%i-%i\\3" % (first + offset, last))

            label = "%s_%s" % (self._infiles[filename]["name"], group)
            ranges_by_name[label].extend(ranges)

    staged_output = reroot_path(temp, self._out_part)
    with open(staged_output, "w") as part_file:
        for (label, ranges) in sorted(ranges_by_name.items()):
            part_file.write("DNA, %s = %s\n" % (label, ", ".join(ranges)))
def _run(self, _config, temp):
    """Write the partition file describing every input MSA's column range.

    The MSAs are laid out back to back, each as long as the first sequence
    returned by its dictionary; for each (group, offsets) pair of a file,
    either one plain "start-end" range (when there are exactly 3 offsets)
    or one "start+offset-end\\3" range per offset is recorded under the
    name "<file name>_<group>". Entries are written sorted by name.
    """
    last = 0
    ranges_by_name = collections.defaultdict(list)

    for (filename, msa) in _read_sequences(self._infiles):
        first = last + 1
        last = last + len(next(msa.itervalues()))

        for (group, offsets) in self._get_partition_by(filename):
            if len(offsets) == 3:
                ranges = ["%i-%i" % (first, last)]
            else:
                ranges = []
                for offset in offsets:
                    ranges.append("%i-%i\\3" % (first + offset, last))

            label = "%s_%s" % (self._infiles[filename]["name"], group)
            ranges_by_name[label].extend(ranges)

    staged_output = reroot_path(temp, self._out_part)
    with open(staged_output, "w") as part_file:
        for (label, ranges) in sorted(ranges_by_name.items()):
            part_file.write("DNA, %s = %s\n" % (label, ", ".join(ranges)))
def _run(self, config, temp):
    """Write the summary table, preceded by a commented provenance header.

    The header records the command line, the working directory and the
    makefile's filename, hash and mtime, followed by the genome and
    areas-of-interest sections. Every area of interest is then added to
    ``genomes`` (keyed by its label, holding its size) before the tables
    themselves are written.
    """
    rois = self._stat_areas_of_interest(self._prefixes)
    genomes = self._stat_prefixes(self._prefixes)

    staged_output = reroot_path(temp, self._output_file)
    with open(staged_output, "w") as table:
        emit = table.write

        emit("# Command:\n")
        emit("# %s\n" % (" ".join(sys.argv),))
        emit("#\n")
        emit("# Directory:\n")
        emit("# %s\n" % (os.getcwd(),))
        emit("#\n")
        emit("# Makefile:\n")
        emit("# Filename: %s\n" % (self._makefile["Filename"],))
        emit("# SHA1Sum: %s\n" % (self._makefile["Hash"],))
        emit("# MTime: %s\n" % (self._makefile["MTime"],))
        emit("#\n")
        self._write_genomes(table, genomes)
        emit("#\n")
        self._write_areas_of_interest(table, rois)
        emit("#\n#\n")

        for roi in rois.itervalues():
            genomes[roi["Label"]] = {"Size" : roi["Size"]}
        self._write_tables(table, genomes)
def _run(self, config, temp):
    """Write the summary table, preceded by a commented provenance header.

    The header records the command line, the working directory and the
    makefile's filename, hash and mtime, followed by the genome and
    areas-of-interest sections. Every area of interest is then added to
    ``genomes`` (keyed by its label, holding its size) before the tables
    themselves are written.
    """
    rois = self._stat_areas_of_interest(self._prefixes)
    genomes = self._stat_prefixes(self._prefixes)

    staged_output = reroot_path(temp, self._output_file)
    with open(staged_output, "w") as table:
        emit = table.write

        emit("# Command:\n")
        emit("# %s\n" % (" ".join(sys.argv),))
        emit("#\n")
        emit("# Directory:\n")
        emit("# %s\n" % (os.getcwd(),))
        emit("#\n")
        emit("# Makefile:\n")
        emit("# Filename: %s\n" % (self._makefile["Filename"],))
        emit("# SHA1Sum: %s\n" % (self._makefile["Hash"],))
        emit("# MTime: %s\n" % (self._makefile["MTime"],))
        emit("#\n")
        self._write_genomes(table, genomes)
        emit("#\n")
        self._write_areas_of_interest(table, rois)
        emit("#\n#\n")

        for roi in rois.itervalues():
            genomes[roi["Label"]] = {"Size": roi["Size"]}
        self._write_tables(table, genomes)
def _setup(self, _config, temp):
    """Symlink the input BAM and its ".bai" index into ``temp``.

    The index source is the BAM path with its extension swapped for
    ".bai"; the link inside ``temp`` is named "<bam link>.bai".
    """
    source_bam = os.path.abspath(self._input_file)
    source_index = swap_ext(source_bam, ".bai")
    link_bam = reroot_path(temp, source_bam)

    os.symlink(source_bam, link_bam)
    os.symlink(source_index, link_bam + ".bai")
def _teardown(self, _config, temp):
    """Move every output file from ``temp`` to its final location.

    The files are processed in sorted order of their destination paths.
    """
    destinations = sorted(self._outfiles)
    for final_path in destinations:
        staged_path = fileutils.reroot_path(temp, final_path)
        fileutils.move_file(staged_path, final_path)
def main(argv):
    """Convert the reads of one prefix and write a makefile per input makefile.

    For every makefile read by ``bam_pipeline.read_makefiles`` the pipeline
    is built, a new makefile (named with ``config.postfix`` and placed in
    ``config.destination``) is written, and for every target / sample /
    library belonging to ``config.prefix`` the lanes are passed to
    ``convert_reads``; the resulting sinks are listed in the new makefile.

    Returns:
        1 if the options could not be parsed, if the new makefile would
        overwrite its source, or if ``config.prefix`` is not among the
        makefile's prefixes; 0 otherwise.
    """
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args)
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        tasks = bam_pipeline.build_pipeline_full(bam_config, makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        # Refuse to clobber the makefile that is being read
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n"
                             % (makefile_path, ))
            sys.stderr.write(
                " Please set --destination and/or --output-name-postfix\n"
            )
            sys.stderr.write(" before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            # The message is kept on one physical line; a plain string
            # literal may not contain a raw line break.
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n"
                             % (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            bam_mkfile.print_header(dst=makefile_handle)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" % (_INDENTATION,
                                              target.name,
                                              target_name))

                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 0,
                                                     target_name))
                for prefix in target.prefixes:
                    # Only the requested prefix is converted
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2,
                                                sample.name))
                        makefile_handle.write('%s"%s":\n'
                                              % (_INDENTATION * 1,
                                                 sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" % (_INDENTATION * 3,
                                                     library.name))
                            makefile_handle.write(
                                '%s"%s":\n' % (_INDENTATION * 2,
                                               library.name))

                            sink_cache = {}
                            destination = os.path.join(target_name,
                                                       "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination,
                                              lane, sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write(
                                    '%s"%s":\n' % (_INDENTATION * 3,
                                                   lane_name))
                                for (reads_type, sink) in sorted(
                                        sink_cache[lane_name].items()):
                                    makefile_handle.write(
                                        '%s%s "%s"\n'
                                        % (_INDENTATION * 4,
                                           ("%s:" % (reads_type, )).ljust(20),
                                           sink.filename))
                                makefile_handle.write("\n")

        print("\tDone ...")
        print()

    return 0
def test_reroot_path__rel_rel__w_final_dash():
    """Relative root with trailing slash, relative file path."""
    result = reroot_path("etc/apt/", "tmp/sources.list")
    assert_equal(result, "etc/apt/sources.list")
def test_reroot_path__rel_abs__wo_final_dash():
    """Relative root without trailing slash, absolute file path."""
    result = reroot_path("etc/apt", "/tmp/sources.list")
    assert_equal(result, "etc/apt/sources.list")
def _teardown(self, _config, temp):
    """Move the output file from ``temp`` to its final location."""
    staged_output = reroot_path(temp, self._output_file)
    move_file(staged_output, self._output_file)
def _setup(self, _config, temp):
    """Symlink the input reference into ``temp``."""
    link_name = reroot_path(temp, self._in_reference)
    os.symlink(self._in_reference, link_name)
def test_reroot_path__abs_abs__w_final_dash():
    """Absolute root with trailing slash, absolute file path."""
    result = reroot_path("/etc/apt/", "/tmp/sources.list")
    assert_equal(result, "/etc/apt/sources.list")
def test_reroot_path__empty_path():
    """An empty file path yields the root followed by a slash."""
    result = reroot_path("/etc/apt", "")
    assert_equal(result, "/etc/apt/")
def _teardown(self, config, temp):
    """See CommandNode._teardown.

    Removes the input file's counterpart inside ``temp`` before delegating
    to the parent implementation.
    """
    staged_input = reroot_path(temp, self._infile)
    os.remove(staged_input)

    CommandNode._teardown(self, config, temp)