def __init__(self, config, dependencies=()):
    """Initialize the node described as '<SummaryReport -> ...>'.

    Arguments:
      config        -- Configuration object; its 'destination', 'database'
                       and 'samples' attributes are read here.
      dependencies  -- Nodes for this node to depend on.

    One AnalysisReport is created per sample (stored in self._reports), and
    the input files of all of these reports become the inputs of this node.
    """
    self._root = config.destination
    self._data = config.database
    self._samples = config.samples
    # Snapshot the keys as a list; on Python 3 'keys()' is only a view
    self._sample_keys = list(self._samples.keys())

    input_files = set()
    self._reports = {}
    # 'items()' is available on Python 2 and 3, 'iteritems()' only on 2
    for sample, info in self._samples.items():
        report = AnalysisReport(config=config,
                                root=os.path.join(self._root, sample),
                                has_nuc="Nuc" in info["Files"],
                                has_mt="Mito" in info["Files"])

        input_files.update(report.input_files())
        self._reports[sample] = report

    output_prefix = os.path.join(self._root, "summary")
    Node.__init__(self,
                  description="<SummaryReport -> %r>"
                  % (output_prefix + '.html',),
                  input_files=input_files,
                  output_files=(output_prefix + '.html',
                                output_prefix + '.css'),
                  dependencies=dependencies)
def __init__(self, replicates, output_root, dependencies=()):
    """Initialize the node described as '<SelectBestAdmixture -> ...>'.

    Arguments:
      replicates    -- Iterable of nodes; the basenames of each node's
                       'output_files' must form the same set.
      output_root   -- Folder joined with each basename to form the output
                       files of this node.
      dependencies  -- Additional nodes to depend on; the replicate nodes
                       are always appended to these.

    Raises ValueError if no replicates are given, and RuntimeError if the
    basenames of a replicate differ from those of the first replicate.
    """
    replicates = tuple(replicates)
    if not replicates:
        raise ValueError("No replicates passed to SelectBestAdmixture")

    expected = None
    gathered = []
    for replicate in replicates:
        basenames = frozenset(map(os.path.basename, replicate.output_files))
        if expected is None:
            # The first replicate defines the reference set of basenames
            expected = basenames
        elif expected != basenames:
            raise RuntimeError("Node %r does not contain expected files, "
                               "%r, vs %r"
                               % (replicate, expected, basenames))

        gathered.extend(replicate.output_files)

    destinations = [os.path.join(output_root, basename)
                    for basename in expected]

    self._ref_filenames = expected
    self._files = tuple(replicate.output_files for replicate in replicates)
    self._output_root = output_root

    Node.__init__(self,
                  description="<SelectBestAdmixture -> %r>" % (output_root,),
                  input_files=gathered,
                  output_files=destinations,
                  dependencies=tuple(dependencies) + tuple(replicates))
def __init__(self, fasta_files, sequences, destination, dependencies=()):
    """Initialize the node described as '<CollectSequences: ...>'.

    fasta_files -- { taxon_name_1 : filename_1, ... }
    sequences   -- { interval_name_1, ... }
    destination -- Folder in which one '<name>.fasta' output file is
                   expected per entry in 'sequences'.
    dependencies -- Nodes for this node to depend on.

    Every FASTA file is registered as an input together with a file of the
    same name plus a '.fai' suffix.
    """
    self._infiles = copy.deepcopy(fasta_files)
    self._sequences = utilities.safe_coerce_to_frozenset(sequences)
    self._destination = copy.copy(destination)
    self._outfiles = []
    for name in self._sequences:
        self._outfiles.append(os.path.join(destination, name + ".fasta"))

    # All FASTA files first, followed by their '.fai' companions
    fasta_paths = list(self._infiles.values())
    index_paths = [path + ".fai" for path in self._infiles.values()]
    input_files = fasta_paths + index_paths

    counts = (len(self._sequences), len(self._infiles), self._destination)
    desc = "<CollectSequences: %i sequences from %i files -> '%s'>" % counts

    Node.__init__(self,
                  description=desc,
                  input_files=input_files,
                  output_files=self._outfiles,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    """Initialize the node described as '<Detect Input Duplication: ...>'.

    Arguments:
      input_files   -- Input files; passed to Node and summarized in the
                       description via describe_files.
      output_file   -- Passed unchanged to Node as 'output_files'.
      dependencies  -- Nodes for this node to depend on.
    """
    summary = describe_files(input_files)
    description = "<Detect Input Duplication: %s>" % (summary)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, infiles, out_partitions, partition_by="123",
             dependencies=()):
    """Initialize the node described as '<FastaToPartitions ...>'.

    Arguments:
      infiles         -- Dictionary mapping input filenames to dictionaries
                         of per-file settings; each must contain 'name', may
                         contain a 3 entries long 'partition_by', and may
                         only use keys listed in _VALID_KEYS.
      out_partitions  -- Passed to Node as 'output_files'.
      partition_by    -- Default partitioning scheme; must be 3 entries long.
      dependencies    -- Nodes for this node to depend on.

    Raises TypeError if 'infiles' is not a dictionary of dictionaries, and
    ValueError if any of the remaining requirements above is violated.
    """
    if len(partition_by) != 3:
        raise ValueError("Default 'partition_by' must be 3 entires long!")
    elif not isinstance(infiles, dict):
        raise TypeError("'infiles' must be a dictionary")
    # Types are checked before any values are inspected, so that non-dict
    # values result in the intended TypeError rather than an AttributeError
    elif not all(isinstance(dd, dict) for dd in infiles.values()):
        raise TypeError("'infiles' must be a dictionary of dictionaries")
    elif any(len(dd.get("partition_by", "123")) != 3
             for dd in infiles.values()):
        raise ValueError("'partition_by' must be 3 entires long!")
    # Every (not just some) input file must specify a name
    elif not all(("name" in dd) for dd in infiles.values()):
        raise ValueError("'name' must be specified for all input files")

    # Collect invalid keys explicitly; the loop variable of a generator
    # expression is not available when building the error message
    invalid_keys = set()
    for dd in infiles.values():
        invalid_keys.update(set(dd) - _VALID_KEYS)

    if invalid_keys:
        raise ValueError("Invalid keys found: %s"
                         % ", ".join(sorted(map(str, invalid_keys))))

    self._infiles = infiles
    self._out_part = out_partitions
    self._part_by = partition_by

    description = "<FastaToPartitions (default: %s): %i file(s) -> '%s'>" % \
        (partition_by, len(infiles), out_partitions)

    Node.__init__(self,
                  description=description,
                  input_files=list(infiles.keys()),
                  output_files=out_partitions,
                  dependencies=dependencies)
def __init__(self, replicates, output_root, dependencies=()):
    """Initialize the node described as '<SelectBestAdmixture -> ...>'.

    Arguments:
      replicates    -- Iterable of nodes whose 'output_files' must share
                       the same set of basenames.
      output_root   -- Folder in which one output file is expected per
                       basename found in the replicates.
      dependencies  -- Additional nodes to depend on; the replicates are
                       added to these automatically.

    Raises ValueError for an empty set of replicates, and RuntimeError if
    a replicate does not provide the same basenames as the first one.
    """
    replicates = tuple(replicates)
    if not replicates:
        raise ValueError("No replicates passed to SelectBestAdmixture")

    reference = None
    all_inputs = []
    for current in replicates:
        names = frozenset(os.path.basename(path)
                          for path in current.output_files)

        if reference is None:
            # Nothing to compare against yet; this node sets the standard
            reference = names
        elif reference != names:
            message = ("Node %r does not contain expected files, "
                       "%r, vs %r")
            raise RuntimeError(message % (current, reference, names))

        all_inputs.extend(current.output_files)

    all_outputs = []
    for name in reference:
        all_outputs.append(os.path.join(output_root, name))

    self._ref_filenames = reference
    self._files = tuple(current.output_files for current in replicates)
    self._output_root = output_root

    description = "<SelectBestAdmixture -> %r>" % (output_root,)
    Node.__init__(self,
                  description=description,
                  input_files=all_inputs,
                  output_files=all_outputs,
                  dependencies=tuple(dependencies) + tuple(replicates))
def __init__(self, infiles, out_partitions, partition_by="123",
             dependencies=()):
    """Initialize the node described as '<FastaToPartitions ...>'.

    Arguments:
      infiles         -- Dictionary of dictionaries, keyed by input filename;
                         each inner dictionary must contain 'name', may
                         contain a 3 entries long 'partition_by', and may
                         only use keys found in _VALID_KEYS.
      out_partitions  -- Passed to Node as 'output_files'.
      partition_by    -- Default partitioning scheme; must be 3 entries long.
      dependencies    -- Nodes for this node to depend on.

    Raises TypeError if 'infiles' is not a dictionary of dictionaries, and
    ValueError if any other requirement listed above is not met.
    """
    if len(partition_by) != 3:
        raise ValueError("Default 'partition_by' must be 3 entires long!")
    if not isinstance(infiles, dict):
        raise TypeError("'infiles' must be a dictionary")

    settings = list(infiles.values())
    # The type check has to come first; otherwise the 'get' call below
    # fails with an AttributeError for values that are not dictionaries
    if not all(isinstance(dd, dict) for dd in settings):
        raise TypeError("'infiles' must be a dictionary of dictionaries")
    if any(len(dd.get("partition_by", "123")) != 3 for dd in settings):
        raise ValueError("'partition_by' must be 3 entires long!")
    # The name is required for each and every file, as the message states
    if not all(("name" in dd) for dd in settings):
        raise ValueError("'name' must be specified for all input files")

    # Gather the offending keys up front; a generator's loop variable
    # cannot be referenced from the error message
    invalid_keys = set()
    for dd in settings:
        invalid_keys.update(set(dd) - _VALID_KEYS)
    if invalid_keys:
        raise ValueError("Invalid keys found: %s"
                         % ", ".join(sorted(map(str, invalid_keys))))

    self._infiles = infiles
    self._out_part = out_partitions
    self._part_by = partition_by

    description = "<FastaToPartitions (default: %s): %i file(s) -> '%s'>" % \
        (partition_by, len(infiles), out_partitions)

    Node.__init__(self,
                  description=description,
                  input_files=list(infiles.keys()),
                  output_files=out_partitions,
                  dependencies=dependencies)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """Initialize the node described as '<FastaToPartitionedPhy ...>'.

    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Arguments:
      out_prefix         -- Prefix of the two output files, '<prefix>.phy'
                            and '<prefix>.partitions'.
      exclude_groups     -- Stored as a frozenset in self._excluded.
      reduce             -- Stored as a bool in self._reduce.
      dependencies       -- Nodes for this node to depend on.
      file_dependencies  -- Additional files to register as input files.

    Raises TypeError if 'infiles' is not a dictionary of dictionaries, and
    ValueError on keys not in _VALID_KEYS or if 'filenames' is not a list.
    """
    if not (isinstance(infiles, dict)
            and all(isinstance(dd, dict) for dd in infiles.values())):
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    # 'items()' is available on Python 2 and 3, 'iteritems()' only on 2
    for (name, subdd) in infiles.items():
        if set(subdd) - _VALID_KEYS:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(set(subdd) - _VALID_KEYS)))
        elif not isinstance(subdd["filenames"], list):
            raise ValueError("filenames must be a list of strings")
        input_filenames.extend(subdd["filenames"])
    # Optional file dependencies; used to depend on the list of sequences
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
        (" (reducing)" if reduce else "", len(infiles), out_prefix)

    Node.__init__(self,
                  description=description,
                  input_files=input_filenames,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)
def __init__(self, config, root, has_nuc, has_mt, dependencies=()):
    """Initialize the node described as '<Report -> ...>'.

    Arguments:
      config -- Config object generated using paleomix.tools.zonkey.config.
      root -- Root folder containing current analysis.
      has_nuc -- True if a nuclear BAM was provided.
      has_mt -- True if a mitochondrial BAM was provided.
      dependencies -- Nodes for ReportNode to depend on.
    """
    self._root = root
    self._data = copy.deepcopy(config.database)
    self._report = AnalysisReport(config, root, has_nuc, has_mt)
    self._has_nuc = bool(has_nuc)
    self._has_mt = bool(has_mt)
    self._treemix_outgroup = config.treemix_outgroup

    # A placeholder is stored when no value was set for 'treemix_k'
    treemix_k = config.treemix_k
    self._treemix_k = '<automatic>' if treemix_k is None else treemix_k

    html_path = os.path.join(self._root, "report.html")
    css_path = os.path.join(self._root, "report.css")

    Node.__init__(self,
                  description="<Report -> %r>" % (html_path,),
                  input_files=self._report.input_files(),
                  output_files=(html_path, css_path),
                  dependencies=dependencies)
def __init__(self, config, dependencies=()):
    """Initialize the node described as '<SummaryReport -> ...>'.

    Arguments:
      config        -- Configuration object; the attributes 'destination',
                       'database' and 'samples' are read from it.
      dependencies  -- Nodes for this node to depend on.

    An AnalysisReport is built for every sample, and the union of the
    input files of these reports is used as the input files of the node.
    """
    root = config.destination
    samples = config.samples

    self._root = root
    self._data = config.database
    self._samples = samples
    self._sample_keys = list(samples.keys())

    required_files = set()
    self._reports = reports = {}
    for name, info in samples.items():
        sample_report = AnalysisReport(
            config=config,
            root=os.path.join(root, name),
            has_nuc="Nuc" in info["Files"],
            has_mt="Mito" in info["Files"],
        )

        required_files.update(sample_report.input_files())
        reports[name] = sample_report

    prefix = os.path.join(root, "summary")
    html_path = prefix + ".html"
    css_path = prefix + ".css"

    Node.__init__(
        self,
        description="<SummaryReport -> %r>" % (html_path,),
        input_files=required_files,
        output_files=(html_path, css_path),
        dependencies=dependencies,
    )
def __init__(self, config, root, has_nuc, has_mt, dependencies=()):
    """Initialize the node described as '<Report -> ...>'.

    Arguments:
      config -- Config object generated using paleomix.tools.zonkey.config.
      root -- Root folder containing current analysis.
      has_nuc -- True if a nuclear BAM was provided.
      has_mt -- True if a mitochondrial BAM was provided.
      dependencies -- Nodes for ReportNode to depend on.
    """
    self._root = root
    self._data = copy.deepcopy(config.database)
    self._report = AnalysisReport(config, root, has_nuc, has_mt)
    self._has_nuc = bool(has_nuc)
    self._has_mt = bool(has_mt)
    self._treemix_outgroup = config.treemix_outgroup

    configured_k = config.treemix_k
    if configured_k is None:
        # No explicit value was set; store a placeholder instead
        configured_k = '<automatic>'
    self._treemix_k = configured_k

    report_html = os.path.join(self._root, "report.html")
    report_css = os.path.join(self._root, "report.css")
    description = "<Report -> %r>" % (report_html,)

    Node.__init__(self,
                  description=description,
                  input_files=self._report.input_files(),
                  output_files=(report_html, report_css),
                  dependencies=dependencies)
def __init__(self, config, output_file, dependencies=()):
    """Initialize the node described as '<WriteSampleList -> ...>'.

    Arguments:
      config        -- Configuration object; 'database.samples' is stored
                       and 'tablefile' is the only input file.
      output_file   -- The only output file of this node.
      dependencies  -- Nodes for this node to depend on.
    """
    self._samples = config.database.samples

    description = "<WriteSampleList -> %r>" % (output_file,)
    sources = (config.tablefile,)
    targets = (output_file,)

    Node.__init__(self,
                  description=description,
                  input_files=sources,
                  output_files=targets,
                  dependencies=dependencies)
def __init__(self, input_file, output_file, dependencies=()):
    """Initialize the node described as '<FreqToTreemix -> ...>'.

    Arguments:
      input_file    -- The only input file of this node.
      output_file   -- The only output file of this node.
      dependencies  -- Nodes for this node to depend on.
    """
    Node.__init__(
        self,
        # Closing '>' added; it was missing unlike for all other nodes
        description="<FreqToTreemix -> %r>" % (output_file,),
        input_files=(input_file,),
        output_files=(output_file,),
        dependencies=dependencies,
    )
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Initialize the node described as '<Validate FASTQ Files: ...>'.

    Arguments:
      input_files   -- Input files; passed to Node and summarized in the
                       description via describe_files.
      output_file   -- Passed unchanged to Node as 'output_files'.
      offset        -- Stored as self._offset for later use.
      dependencies  -- Nodes for this node to depend on.
    """
    self._offset = offset

    summary = describe_files(input_files)
    description = "<Validate FASTQ Files: %s>" % (summary)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, config, output_file, dependencies=()):
    """Initialize the node described as '<WriteSampleList -> ...>'.

    Arguments:
      config        -- Configuration object; provides the samples stored
                       in self._samples ('database.samples') and the input
                       file ('tablefile').
      output_file   -- The file registered as the output of this node.
      dependencies  -- Nodes for this node to depend on.
    """
    self._samples = config.database.samples

    node_options = {
        "description": "<WriteSampleList -> %r>" % (output_file,),
        "input_files": (config.tablefile,),
        "output_files": (output_file,),
        "dependencies": dependencies,
    }

    Node.__init__(self, **node_options)
def __init__(self, input_files, output_file, dependencies=()):
    """Initialize the node described as '<Validate FASTA Files: ...>'.

    Arguments:
      input_files   -- Input files; passed to Node and summarized in the
                       description via describe_files.
      output_file   -- Passed unchanged to Node as 'output_files'.
      dependencies  -- Nodes for this node to depend on.

    After initialization, exactly one output file is expected to be
    registered; this is checked using an assertion.
    """
    summary = describe_files(input_files)
    description = "<Validate FASTA Files: %s>" % (summary)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)

    assert len(self.output_files) == 1, self.output_files
def __init__(self, input_files, output_file, dependencies=()):
    """Initialize the node described as '<MergeCoverage: ... -> ...>'.

    Arguments:
      input_files   -- Input files; passed to Node and summarized in the
                       description via describe_files.
      output_file   -- Stored as self._output_file and passed to Node as
                       'output_files'.
      dependencies  -- Nodes for this node to depend on.
    """
    self._output_file = output_file

    summary = describe_files(input_files)
    description = "<MergeCoverage: %s -> '%s'>" % (summary, self._output_file)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs, dependencies=()):
    """Initialize the node described as '<Summary: ...>'.

    Arguments:
      config         -- Configuration object; 'destination' is the folder
                        in which the '<target name>.summary' file is placed.
      makefile       -- Mapping; the 'Prefixes' and 'Statistics' entries
                        are stored on the node.
      target         -- Object with a 'name' and with nested 'prefixes',
                        'samples', 'libraries' and 'lanes' collections.
      cov_for_lanes  -- Mapping whose values are iterables of filenames;
                        stored as self._in_raw_bams.
      cov_for_libs   -- Mapping whose values are iterables of filenames;
                        stored as self._in_lib_bams.
      dependencies   -- Nodes for this node to depend on.

    NOTE(review): the nesting of the lane loop was reconstructed from a
    source in which all indentation had been lost; the placement of the
    'continue' and 'input_files.add(filename)' statements in particular
    should be confirmed against the original file.
    """
    self._target = target.name
    self._output_file = os.path.join(config.destination, self._target + ".summary")
    self._prefixes = makefile["Prefixes"]
    self._makefile = makefile["Statistics"]

    self._in_raw_bams = cov_for_lanes
    self._in_lib_bams = cov_for_libs
    input_files = set()
    # Flatten the per-key collections of filenames into the set of inputs
    input_files.update(sum(map(list, self._in_raw_bams.values()), []))
    input_files.update(sum(map(list, self._in_lib_bams.values()), []))

    # Maps (sample, library, lane) names to a (filetype, filename) tuple
    self._in_raw_read = collections.defaultdict(list)
    for prefix in target.prefixes:
        for sample in prefix.samples:
            for library in sample.libraries:
                for lane in library.lanes:
                    filename = None
                    filetype = None
                    if lane.reads.stats:
                        filetype = 'Raw'
                        filename = lane.reads.stats
                    elif lane.reads.validation:
                        filename = lane.reads.validation
                        fileset = set(lane.reads.files)
                        # '*' marks lanes containing both kinds of reads
                        if fileset & _PE_READS and fileset & _SE_READS:
                            filetype = '*'
                        elif fileset & _PE_READS:
                            filetype = 'PE'
                        elif fileset & _SE_READS:
                            filetype = 'SE'
                        else:
                            assert False, lane.reads.files
                            # Only reached if assertions are disabled
                            continue
                        input_files.add(filename)
                    else:
                        assert False

                    self._in_raw_read[(sample.name, library.name, lane.name)] = (filetype, filename)

    Node.__init__(self,
                  description="<Summary: %s>" % self._output_file,
                  # Falsy entries (e.g. None) are dropped from the inputs
                  input_files=filter(None, input_files),
                  output_files=[self._output_file],
                  dependencies=dependencies)
def __init__(self, infile, outfile, fai_file, amount=0, dependencies=()):
    """Initialize the node described as '<PaddedBed (...): ... -> ...>'.

    Arguments:
      infile        -- Input file; registered as an input of the node.
      outfile       -- The only output file of the node.
      fai_file      -- Second input file of the node.
      amount        -- Converted using int() and stored as self._amount.
      dependencies  -- Nodes for this node to depend on.
    """
    self._amount = int(amount)
    self._infile = infile
    self._outfile = outfile
    self._fai_file = fai_file

    # The coerced value is used for the description; '%i' rejects values
    # such as numeric strings, even though these are accepted by int()
    description = '<PaddedBed (%i): %r -> %r>' \
        % (self._amount, infile, outfile)

    Node.__init__(self,
                  description=description,
                  input_files=(infile, fai_file),
                  output_files=(outfile,),
                  dependencies=dependencies)
def __init__(self, reference, bedfile, outfile, dependencies=()):
    """Initialize the node described as '<ExtractReference: ... -> ...>'.

    Arguments:
      reference     -- First input file; stored as self._reference.
      bedfile       -- Second input file; stored as self._bedfile.
      outfile       -- The only output file; stored as self._outfile.
      dependencies  -- Nodes for this node to depend on.
    """
    self._reference = reference
    self._bedfile = bedfile
    self._outfile = outfile

    Node.__init__(
        self,
        description="<ExtractReference: '%s' -> '%s'>" % (reference, outfile),
        input_files=[reference, bedfile],
        output_files=[outfile],
        dependencies=dependencies,
    )
def __init__(self, infile, outfile, fai_file, amount=0, dependencies=()):
    """Initialize the node described as '<PaddedBed (...): ... -> ...>'.

    Arguments:
      infile        -- Input file; registered as an input of the node.
      outfile       -- The only output file of the node.
      fai_file      -- Second input file of the node.
      amount        -- Converted using int() and stored as self._amount.
      dependencies  -- Nodes for this node to depend on.
    """
    self._amount = int(amount)
    self._infile = infile
    self._outfile = outfile
    self._fai_file = fai_file

    Node.__init__(self,
                  # Format the coerced amount; '%i' fails for values (e.g.
                  # numeric strings) that int() converts without problems
                  description='<PaddedBed (%i): %r -> %r>'
                  % (self._amount, infile, outfile),
                  input_files=(infile, fai_file),
                  output_files=(outfile,),
                  dependencies=dependencies)
def __init__(self, infiles, out_phy, add_flag=False, dependencies=()):
    """Initialize the node described as '<FastaToInterleavedPhy: ...>'.

    Arguments:
      infiles       -- Input files of the node.
      out_phy       -- The only output file; stored as self._out_phy.
      add_flag      -- Stored as self._add_flag; if true, ' (w/ flag)' is
                       appended to the description.
      dependencies  -- Nodes for this node to depend on.
    """
    self._add_flag = add_flag
    self._out_phy = out_phy

    if add_flag:
        flag_suffix = " (w/ flag)"
    else:
        flag_suffix = ""

    template = "<FastaToInterleavedPhy: %i file(s) -> '%s'%s>"
    description = template % (len(infiles), out_phy, flag_suffix)

    Node.__init__(self,
                  description=description,
                  input_files=infiles,
                  output_files=[out_phy],
                  dependencies=dependencies)
def __init__(self, infiles, out_phy, add_flag=False, dependencies=()):
    """Initialize the node described as '<FastaToInterleavedPhy: ...>'.

    Arguments:
      infiles       -- Input files of the node; the number of files is
                       included in the description.
      out_phy       -- The only output file; stored as self._out_phy.
      add_flag      -- Stored as self._add_flag, and noted in the
                       description if true.
      dependencies  -- Nodes for this node to depend on.
    """
    self._add_flag = add_flag
    self._out_phy = out_phy

    parts = (len(infiles), out_phy, (" (w/ flag)" if add_flag else ""))
    description = "<FastaToInterleavedPhy: %i file(s) -> '%s'%s>" % parts

    node_options = {
        "description": description,
        "input_files": infiles,
        "output_files": [out_phy],
        "dependencies": dependencies,
    }

    Node.__init__(self, **node_options)
def __init__(self, input_alignment, input_partition, output_alignment,
             seed=None, dependencies=()):
    """Initialize the node described as '<PHYLIPBootstrap: ... -> ...>'.

    Arguments:
      input_alignment   -- First input file; stored as self._input_phy.
      input_partition   -- Second input file; stored as self._input_part.
      output_alignment  -- The only output file; stored as self._output_phy.
      seed              -- Stored as self._seed for later use.
      dependencies      -- Nodes for this node to depend on.
    """
    self._input_phy = input_alignment
    self._input_part = input_partition
    self._output_phy = output_alignment
    self._seed = seed

    template = "<PHYLIPBootstrap: %r -> %r>"
    description = template % (input_alignment, output_alignment)

    Node.__init__(self,
                  description=description,
                  input_files=(input_alignment, input_partition),
                  output_files=(output_alignment,),
                  dependencies=dependencies)
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Initialize the node described as '<NewickSupport: ...>'.

    Arguments:
      main_tree_files     -- Coerced to a tuple; registered as input files
                             and summarized in the description.
      support_tree_files  -- Coerced to a tuple; registered as input files.
      output_file         -- Stored as self._output_file and passed to Node
                             as 'output_files'.
      dependencies        -- Nodes for this node to depend on.
    """
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    # Main trees come first, followed by the support trees
    all_trees = self._main_tree_files + self._support_tree_files
    summary = describe_files(main_tree_files)

    Node.__init__(self,
                  description="<NewickSupport: %s>" % (summary,),
                  input_files=all_trees,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, description, destination, source_nodes):
    """Initialize the node described as '<Copy ... output to ...>'.

    Arguments:
      description   -- Text inserted into the description of the node.
      destination   -- Passed to reroot_path together with each input file
                       in order to determine the matching output file.
      source_nodes  -- Node(s) whose output files are the input files of
                       this node; these are also used as the dependencies.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = []
    for source_node in source_nodes:
        input_files.extend(source_node.output_files)

    output_files = [reroot_path(destination, fpath) for fpath in input_files]
    # Materialize the pairs; on Python 3 'zip' returns an iterator that is
    # exhausted after the first pass, while Python 2 returned a list
    self._files = list(zip(input_files, output_files))

    Node.__init__(self,
                  description="<Copy %s output to %r>"
                  % (description, destination),
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs, dependencies = ()):
    """Initialize the node described as '<Summary: ...>'.

    Arguments:
      config         -- Configuration object; the summary file is placed in
                        the folder given by 'destination'.
      makefile       -- Mapping from which the 'Prefixes' and 'Statistics'
                        entries are stored.
      target         -- Object providing 'name', and 'prefixes' containing
                        nested 'samples', 'libraries' and 'lanes'.
      cov_for_lanes  -- Mapping whose values are iterables of filenames;
                        stored as self._in_raw_bams.
      cov_for_libs   -- Mapping whose values are iterables of filenames;
                        stored as self._in_lib_bams.
      dependencies   -- Nodes for this node to depend on.

    NOTE(review): indentation was lost in the source from which this block
    was restored; the nesting of the lane loop, and especially the position
    of 'continue' and 'input_files.add(filename)', needs to be confirmed.
    """
    self._target = target.name
    self._output_file = os.path.join(config.destination, self._target + ".summary")
    self._prefixes = makefile["Prefixes"]
    self._makefile = makefile["Statistics"]

    self._in_raw_bams = cov_for_lanes
    self._in_lib_bams = cov_for_libs
    input_files = set()
    # Each mapping value is converted to a list, and the lists are joined
    input_files.update(sum(map(list, self._in_raw_bams.values()), []))
    input_files.update(sum(map(list, self._in_lib_bams.values()), []))

    # Keys are (sample, library, lane) names; values (filetype, filename)
    self._in_raw_read = collections.defaultdict(list)
    for prefix in target.prefixes:
        for sample in prefix.samples:
            for library in sample.libraries:
                for lane in library.lanes:
                    filename = None
                    filetype = None
                    if lane.reads.stats:
                        filetype = 'Raw'
                        filename = lane.reads.stats
                    elif lane.reads.validation:
                        filename = lane.reads.validation
                        fileset = set(lane.reads.files)
                        # '*' is used when both kinds of reads are present
                        if fileset & _PE_READS and fileset & _SE_READS:
                            filetype = '*'
                        elif fileset & _PE_READS:
                            filetype = 'PE'
                        elif fileset & _SE_READS:
                            filetype = 'SE'
                        else:
                            assert False, lane.reads.files
                            # Skips the lane when assertions are disabled
                            continue
                        input_files.add(filename)
                    else:
                        assert False

                    self._in_raw_read[(sample.name, library.name, lane.name)] = (filetype, filename)

    Node.__init__(self,
                  description = "<Summary: %s>" % self._output_file,
                  # Entries that evaluate to false are not passed to Node
                  input_files = filter(None, input_files),
                  output_files = [self._output_file],
                  dependencies = dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Initialize the node described as '<NewickReroot (on ...): ...>'.

    Arguments:
      tree_files    -- Coerced to a tuple and used as the input files.
      output_file   -- Stored as self._output_file and passed to Node as
                       'output_files'.
      taxa          -- Coerced to a tuple and stored; if empty, the
                       description reads 'midpoint' in place of the taxa.
      dependencies  -- Nodes for this node to depend on.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        # Sorted, so that the description is independent of input order
        taxa_names = sorted(self._reroot_on_taxa)
        reroot_on = repr("', '".join(taxa_names))
    else:
        reroot_on = "midpoint"

    summary = describe_files(tree_files)
    description = "<NewickReroot (on %s): %s>" % (reroot_on, summary)

    Node.__init__(self,
                  description=description,
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Initialize the node described as '<Validate FASTQ Files: ...>'.

    Arguments:
      input_files   -- Dictionary mapping read types to filenames; for the
                       read type 'Paired', the filename is a template that
                       is formatted using Pair=1 and Pair=2.
      output_file   -- Passed unchanged to Node as 'output_files'.
      offset        -- Stored as self._offset for later use.
      dependencies  -- Nodes for this node to depend on.
    """
    self._offset = offset
    self._files = set()
    # 'items()' is available on Python 2 and 3, 'iteritems()' only on 2
    for (read_type, filename) in input_files.items():
        if read_type == "Paired":
            # One template describes both the mate 1 and the mate 2 file
            self._files.add((read_type, filename.format(Pair=1)))
            self._files.add((read_type, filename.format(Pair=2)))
        else:
            self._files.add((read_type, filename))

    input_files = [filename for _, filename in self._files]
    Node.__init__(self,
                  description="<Validate FASTQ Files: %s>"
                  % (describe_files(input_files)),
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Initialize the node described as '<NewickReroot (on ...): ...>'.

    Arguments:
      tree_files    -- Input files of the node, coerced to a tuple.
      output_file   -- Stored as self._output_file and passed to Node as
                       'output_files'.
      taxa          -- Coerced to a tuple and stored; when no taxa are
                       given the description reads 'midpoint' instead.
      dependencies  -- Nodes for this node to depend on.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    selected = self._reroot_on_taxa
    # The taxa are listed in sorted order, as a quoted string
    reroot_on = repr("', '".join(sorted(selected))) if selected \
        else "midpoint"

    description = "<NewickReroot (on %s): %s>" \
        % (reroot_on, describe_files(tree_files))

    node_options = {
        "description": description,
        "input_files": self._tree_files,
        "output_files": self._output_file,
        "dependencies": dependencies,
    }

    Node.__init__(self, **node_options)
def __init__(self, input_alignment, input_partition, output_alignment,
             seed=None, dependencies=()):
    """Initialize the node described as '<PHYLIPBootstrap: ... -> ...>'.

    Arguments:
      input_alignment   -- Input file; stored as self._input_phy.
      input_partition   -- Input file; stored as self._input_part.
      output_alignment  -- Output file; stored as self._output_phy.
      seed              -- Stored as self._seed for later use.
      dependencies      -- Nodes for this node to depend on.
    """
    self._input_phy = input_alignment
    self._input_part = input_partition
    self._output_phy = output_alignment
    self._seed = seed

    node_options = {
        "description": "<PHYLIPBootstrap: %r -> %r>"
                       % (input_alignment, output_alignment),
        "input_files": (input_alignment, input_partition),
        "output_files": (output_alignment,),
        "dependencies": dependencies,
    }

    Node.__init__(self, **node_options)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """Initialize the node described as '<FastaToPartitionedPhy ...>'.

    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Arguments:
      out_prefix         -- Prefix of the output files, '<prefix>.phy' and
                            '<prefix>.partitions'.
      exclude_groups     -- Stored as a frozenset in self._excluded.
      reduce             -- Stored as a bool in self._reduce.
      dependencies       -- Nodes for this node to depend on.
      file_dependencies  -- Additional files to register as input files.

    Raises TypeError if 'infiles' is not a dictionary of dictionaries, and
    ValueError on keys not in _VALID_KEYS or if 'filenames' is not a list.
    """
    if not isinstance(infiles, dict) \
            or not all(isinstance(dd, dict) for dd in infiles.values()):
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    collected = []
    for (name, settings) in infiles.items():
        unexpected = set(settings) - _VALID_KEYS
        if unexpected:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(unexpected)))

        filenames = settings["filenames"]
        if not isinstance(filenames, list):
            raise ValueError("filenames must be a list of strings")

        collected.extend(filenames)

    # Optional file dependencies; used to depend on the list of sequences
    collected.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    if reduce:
        mode = " (reducing)"
    else:
        mode = ""

    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" \
        % (mode, len(infiles), out_prefix)

    Node.__init__(self,
                  description=description,
                  input_files=collected,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Initialize the node described as '<NewickSupport: ...>'.

    Arguments:
      main_tree_files     -- Coerced to a tuple and stored; used as input
                             files and summarized in the description.
      support_tree_files  -- Coerced to a tuple and stored; used as input
                             files.
      output_file         -- Stored as self._output_file and passed to Node
                             as 'output_files'.
      dependencies        -- Nodes for this node to depend on.
    """
    self._output_file = output_file

    main_trees = safe_coerce_to_tuple(main_tree_files)
    support_trees = safe_coerce_to_tuple(support_tree_files)
    self._main_tree_files = main_trees
    self._support_tree_files = support_trees

    description = "<NewickSupport: %s>" % (describe_files(main_tree_files),)

    node_options = {
        "description": description,
        # The main trees precede the support trees among the inputs
        "input_files": main_trees + support_trees,
        "output_files": output_file,
        "dependencies": dependencies,
    }

    Node.__init__(self, **node_options)
def __init__(self, description, destination, source_nodes):
    """Initialize the node described as '<Copy ... output to ...>'.

    Arguments:
      description   -- Text inserted into the description of the node.
      destination   -- Passed to reroot_path along with each input file, to
                       obtain the corresponding output file.
      source_nodes  -- Node(s) whose output files are used as the inputs of
                       this node; also used as the dependencies.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = []
    for source_node in source_nodes:
        input_files.extend(source_node.output_files)

    output_files = [
        reroot_path(destination, fpath) for fpath in input_files
    ]

    # A list is stored rather than the result of 'zip' itself; on Python 3
    # the latter is an iterator that can only be traversed a single time
    self._files = list(zip(input_files, output_files))

    Node.__init__(self,
                  description="<Copy %s output to %r>"
                  % (description, destination),
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def __init__(self, input_file, output_file, filter_by, dependencies=()):
    """Initialize the node described as '<FilterSingleton: ... -> ...>'.

    Arguments:
      input_file    -- The only input file; stored as self._input_file.
      output_file   -- The only output file; stored as self._output_file.
      filter_by     -- Mapping (or iterable of pairs) from the taxa to be
                       filtered to the group(s) of taxa used for filtering.
      dependencies  -- Nodes for this node to depend on; defaults to no
                       dependencies, like the other node constructors.

    Raises RuntimeError if a group is empty once the taxa to be filtered
    has been removed from it.
    """
    self._input_file = input_file
    self._output_file = output_file
    self._filter_by = dict(filter_by)
    # Iterate over a snapshot, since the dictionary is updated in the loop
    for (to_filter, groups) in list(self._filter_by.items()):
        # The taxa to be filtered is implied to be part of the group,
        # but is not needed when actually carrying out the filtering
        groups = utilities.safe_coerce_to_frozenset(groups) \
            - utilities.safe_coerce_to_frozenset(to_filter)

        if not groups:
            raise RuntimeError("Singleton filtering must involve at least "
                               "one other taxa")

        self._filter_by[to_filter] = groups

    Node.__init__(self,
                  description="<FilterSingleton: '%s' -> '%s'>"
                  % (input_file, output_file),
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def __init__(self, fasta_files, sequences, destination, dependencies=()):
    """Initialize the node described as '<CollectSequences: ...>'.

    fasta_files -- { taxon_name_1 : filename_1, ... }
    sequences   -- { interval_name_1, ... }
    destination -- Folder in which one '<name>.fasta' output file is
                   expected per entry in 'sequences'.
    dependencies -- Nodes for this node to depend on.

    Each FASTA file is registered as an input along with a file of the
    same name plus a '.fai' suffix.
    """
    self._infiles = copy.deepcopy(fasta_files)
    self._sequences = utilities.safe_coerce_to_frozenset(sequences)
    self._destination = copy.copy(destination)
    self._outfiles = [os.path.join(destination, name + ".fasta")
                      for name in self._sequences]

    # 'values()' is available on Python 2 and 3, 'itervalues()' only on 2
    input_files = list(self._infiles.values())
    for filename in self._infiles.values():
        input_files.append(filename + ".fai")

    desc = "<CollectSequences: %i sequences from %i files -> '%s'>" \
        % (len(self._sequences), len(self._infiles), self._destination)

    Node.__init__(self,
                  description=desc,
                  input_files=input_files,
                  output_files=self._outfiles,
                  dependencies=dependencies)
def __init__(self, input_file, output_file, dependencies=()):
    """Initialize the node described as '<FreqToTreemix -> ...>'.

    Arguments:
      input_file    -- The only input file of this node.
      output_file   -- The only output file of this node.
      dependencies  -- Nodes for this node to depend on.
    """
    # The description is terminated by '>', as for every other node; the
    # closing bracket was previously missing
    description = "<FreqToTreemix -> %r>" % (output_file,)

    Node.__init__(self,
                  description=description,
                  input_files=(input_file,),
                  output_files=(output_file,),
                  dependencies=dependencies)