def _apply_vcf_filter_options(vcffilter, genotyping, sample):
    filter_cfg = genotyping["VCF_Filter"]
    apply_options(vcffilter.commands["filter"], filter_cfg)
    if filter_cfg["MaxReadDepth"][sample]:
        max_depth = filter_cfg["MaxReadDepth"][sample]
        vcffilter.commands["filter"].set_option("--max-read-depth", max_depth)

    return vcffilter.build_node()
def _rescale_quality_scores(self, config, destination, prefix,
                            files_and_nodes):
    # Generate plot / table files in the internal tree, to prevent the user
    # from accidentally modifying them and causing re-runs
    md_directory = "%s.mapDamage" % (destination,)
    output_filename = destination + ".rescaled.bam"

    # Generates basic plots / table files
    plot = self._build_mapdamage_plot_node(config, md_directory,
                                           prefix, files_and_nodes)

    # Builds model of post-mortem DNA damage
    model = MapDamageModelNode.customize(reference=prefix["Reference"],
                                         directory=md_directory,
                                         dependencies=plot)
    apply_options(model.command, self.options["mapDamage"])
    model = model.build_node()

    # Rescales BAM quality scores using model built above
    scale = MapDamageRescaleNode.customize(
        config=config,
        reference=prefix["Reference"],
        input_files=files_and_nodes.keys(),
        output_file=output_filename,
        directory=md_directory,
        dependencies=model)
    apply_options(scale.command, self.options["mapDamage"])
    scale = scale.build_node()

    # Grab indexing and validation nodes
    validate = index_and_validate_bam(config, prefix, scale)

    return {output_filename: validate}, plot, model
def _rescale_quality_scores(self, config, destination, prefix,
                            files_and_nodes):
    # Generate plot / table files in the internal tree, to prevent the user
    # from accidentally modifying them and causing re-runs
    md_directory = "%s.mapDamage" % (destination,)
    output_filename = destination + ".rescaled.bam"

    # Generates basic plots / table files
    plot = self._build_mapdamage_plot_node(config, md_directory,
                                           prefix, files_and_nodes)

    # Builds model of post-mortem DNA damage
    model = MapDamageModelNode.customize(reference=prefix["Reference"],
                                         directory=md_directory,
                                         dependencies=plot)
    apply_options(model.command, self.options["mapDamage"])
    model = model.build_node()

    # Rescales BAM quality scores using model built above
    scale = MapDamageRescaleNode.customize(
        config=config,
        reference=prefix["Reference"],
        input_files=files_and_nodes.keys(),
        output_file=output_filename,
        directory=md_directory,
        dependencies=model)
    apply_options(scale.command, self.options["mapDamage"])
    scale = scale.build_node()

    # Grab indexing and validation nodes
    validate = IndexAndValidateBAMNode(config, prefix, scale).subnodes

    node = MetaNode(description="Rescale Qualities",
                    subnodes=(plot, model, scale) + tuple(validate),
                    dependencies=files_and_nodes.values())

    return {output_filename: node}, plot, model
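# Both variants above forward the user's mapDamage settings verbatim via
# apply_options. A minimal, hypothetical example of such a settings dict;
# "--downsample" is an actual mapDamage 2.0 option, but the keys and values
# used in practice depend entirely on the user's makefile:
mapdamage_options = {
    # Fit the damage model on a random subset of reads to save time
    "--downsample": 100000,
}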
def _bowtie2_build_cl_tag(options):
    # Build summary of parameters used by alignment, only including
    # parameters that affect the output of bowtie2 (as far as possible)
    cli_tag = ParamCollector("bowtie2", "...")
    apply_options(cli_tag, _bowtie2_aln_parameters(options))
    cli_tag = cli_tag.get_result()
    _build_mapper_cl_tag(options["Aligners"]["Bowtie2"], cli_tag)

    return " ".join(map(str, cli_tag)).replace("%", "%%")
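# This function and _bwa_build_cl_tag below escape "%" as "%%", presumably
# because the resulting CL tag is later embedded in a string that undergoes
# %-style formatting. A small standalone illustration of why the escaping
# matters (the tag contents here are made up):
tag = "bowtie2 --very-sensitive -p 100%".replace("%", "%%")
template = "@PG\tID:bowtie2\tCL:" + tag
print(template % {})  # "%%" renders back to a single literal "%"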
def _bowtie2_build_nodes(config, parameters, tags, options):
    params = Bowtie2Node.customize(threads=config.bowtie2_max_threads,
                                   **parameters)

    apply_options(params.commands["aln"], _bowtie2_aln_parameters(options))

    pg_tags = "bowtie2:CL:%s" % (_bowtie2_build_cl_tag(options),)
    _set_rg_tags(params.commands["convert"], tags, pg_tags)

    return params
def _build_mapdamage_plot_node(self, config, destination, prefix,
                               files_and_nodes):
    plot = MapDamagePlotNode.customize(
        config=config,
        reference=prefix["Path"],
        input_files=files_and_nodes.keys(),
        output_directory=destination,
        title="mapDamage plot for library %r" % (self.name,),
        dependencies=files_and_nodes.values())
    apply_options(plot.command, self.options["mapDamage"])

    return plot.build_node()
def test_apply_options__single_option__user_pred__set_when_pred_is_true():
    was_called = []

    def _user_pred(key):
        was_called.append(key)
        return True

    mock = flexmock()
    mock.should_receive('set_option').with_args('FOO_BAR', 17).once()
    apply_options(mock, {"FOO_BAR": 17}, _user_pred)
    assert_equal(was_called, ["FOO_BAR"])
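# The tests in this section pin down the dispatch rules of apply_options
# fairly completely. The following is a minimal sketch consistent with those
# tests -- not the actual PALEOMIX implementation, which may differ in
# details such as the exact default predicate:
def apply_options(command, options, pred=lambda key: key.startswith("-")):
    for key, values in options.items():
        if not pred(key):
            continue  # ignore keys rejected by the predicate

        if isinstance(values, (list, tuple)):
            # Repeatable options are added once per value
            for value in values:
                command.add_option(key, value)
        elif values is False:
            # False removes a previously set flag
            command.pop_option(key)
        elif values is True or values is None:
            # True/None set a value-less flag
            command.set_option(key)
        else:
            command.set_option(key, values)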
def _bwa_build_cl_tag(options):
    # Build summary of parameters used by alignment, only including
    # parameters that affect the output of BWA (as far as possible)
    algorithm = options["Aligners"]["BWA"]["Algorithm"].lower()
    algorithm = "aln" if algorithm == "backtrack" else algorithm

    cli_tag = ParamCollector(("bwa", algorithm), "...")
    apply_options(cli_tag, _bwa_aln_parameters(options))
    cli_tag = cli_tag.get_result()
    _build_mapper_cl_tag(options["Aligners"]["BWA"], cli_tag)

    return " ".join(map(str, cli_tag)).replace("%", "%%")
def build_genotyping_nodes(options, genotyping, taxa, interval, dependencies):
    prefix = "{0}.{Genome}.{Name}".format(taxa["Name"], **interval)
    reference = os.path.join(options.genomes_root,
                             interval["Genome"] + ".fasta")
    fasta = os.path.join(options.destination, "genotypes",
                         prefix + ".fasta")
    calls = os.path.join(options.destination, "genotypes",
                         prefix + ".vcf.bgz")
    pileups = os.path.join(options.destination, "genotypes",
                           prefix + ".vcf.pileup.bgz")
    filtered = os.path.join(options.destination, "genotypes",
                            prefix + ".filtered.vcf.bgz")

    padding = genotyping["Padding"]
    infile = os.path.join(options.samples_root,
                          "%s.%s.bam" % (taxa["Name"], interval["Genome"]))
    slop, node = build_interval_nodes(options, taxa, interval, padding,
                                      dependencies)

    genotype = GenotypeNode.customize(reference=reference,
                                      regions=slop,
                                      infile=infile,
                                      outfile=calls,
                                      dependencies=node)
    apply_options(genotype.commands["pileup"], genotyping.get("MPileup", {}))
    apply_options(genotype.commands["genotype"],
                  genotyping.get("BCFTools", {}))
    genotype = genotype.build_node()

    vcfpileup = VCFPileupNode.customize(reference=reference,
                                        in_bam=infile,
                                        in_vcf=calls,
                                        outfile=pileups,
                                        dependencies=genotype)
    apply_options(vcfpileup.commands["pileup"], genotyping.get("MPileup", {}))
    vcfpileup = vcfpileup.build_node()

    vcffilter = VCFFilterNode.customize(infile=calls,
                                        pileup=pileups,
                                        outfile=filtered,
                                        interval=interval,
                                        dependencies=vcfpileup)
    filter_cfg = genotyping.get("VCF_Filter", {})
    apply_options(vcffilter.commands["filter"], filter_cfg)
    if "MaxReadDepth" in filter_cfg:
        max_depth = filter_cfg["MaxReadDepth"]
        if isinstance(max_depth, dict):
            max_depth = max_depth[taxa["Name"]]
        vcffilter.commands["filter"].set_option("--max-read-depth", max_depth)
    if filter_cfg.get("Mappability"):
        vcffilter.commands["filter"].set_option("--filter-by-mappability",
                                                "%(IN_MAPPABILITY)s")
        vcffilter.commands["filter"].set_kwargs(
            IN_MAPPABILITY=filter_cfg["Mappability"])
    vcffilter = vcffilter.build_node()

    tabix = TabixIndexNode(infile=filtered,
                           preset="vcf",
                           dependencies=vcffilter)
    builder = BuildRegionsNode(options=options,
                               infile=filtered,
                               interval=interval,
                               outfile=fasta,
                               padding=padding,
                               dependencies=tabix)

    return (builder,)
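# A hypothetical genotyping dict with the shape this function expects; the
# keys mirror the lookups performed above, while the values are purely
# illustrative ("-E" is a real samtools mpileup flag, and flag-like keys are
# simply forwarded verbatim via apply_options):
genotyping = {
    "Padding": 50,
    "MPileup": {"-E": True},
    "BCFTools": {},
    "VCF_Filter": {
        "MaxReadDepth": {"ExampleTaxon": 100},   # scalar or per-taxon dict
        "Mappability": "prefix.mappability.bed",
    },
}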
def _init_raw_reads(self, config, record):
    version = VERSION_14
    if record["Options"]["AdapterRemoval"]["Version"] == "v1.5+":
        version = VERSION_15

    output_format = record["Options"]["CompressionFormat"]
    output_prefix = os.path.join(self.folder, "reads")

    files = record["Data"]
    if "SE" in files:
        # This returns a named tuple containing the parameters for the
        # AdapterRemoval node
        command = SE_AdapterRemovalNode.customize(
            input_files=files["SE"],
            output_prefix=output_prefix,
            output_format=output_format,
            version=version)
        self.files["Single"] = output_prefix + ".truncated." + output_format
    else:
        command = PE_AdapterRemovalNode.customize(
            input_files_1=files["PE_1"],
            input_files_2=files["PE_2"],
            output_prefix=output_prefix,
            output_format=output_format,
            version=version)
        self.files["Paired"] = output_prefix + ".pair{Pair}.truncated." \
            + output_format

        if version is VERSION_14:
            self.files["Single"] = output_prefix \
                + ".singleton.unaln.truncated." + output_format
            self.files["Collapsed"] = output_prefix \
                + ".singleton.aln.truncated." + output_format
        else:
            self.files["Single"] = output_prefix \
                + ".singleton.truncated." + output_format
            self.files["Collapsed"] = output_prefix \
                + ".collapsed." + output_format
            self.files["CollapsedTruncated"] = output_prefix \
                + ".collapsed.truncated." + output_format

    self.stats = output_prefix + ".settings"

    quality_offset = self.quality_offset  # record["Options"]["QualityOffset"]
    if quality_offset == "Solexa":
        quality_offset = 64
    command.command.set_option("--qualitybase", quality_offset)

    apply_options(command.command, record["Options"]["AdapterRemoval"])

    self.nodes = (command.build_node(),)
def _bwa_build_nodes(config, parameters, tags, options):
    algorithm = options["Aligners"]["BWA"]["Algorithm"].lower()
    params = BWANode(threads=config.bwa_max_threads,
                     algorithm=algorithm,
                     **parameters)

    parameters = dict(_bwa_aln_parameters(options))
    # "aln" is used by SE backtrack, mem, and sw; _1 and _2 by PE backtrack
    for aln_key in ("aln", "aln_1", "aln_2"):
        if aln_key in params.commands:
            apply_options(params.commands[aln_key], parameters)

    pg_tags = "bwa:CL:%s" % (_bwa_build_cl_tag(options),)
    _set_rg_tags(params.commands["convert"], tags, pg_tags)

    return params
def _init_raw_reads(self, record):
    # Support for older versions of the pipeline, which used ARv1.0 - 1.4
    version = VERSION_14
    if record["Options"]["AdapterRemoval"]["Version"] == "v1.5+":
        version = VERSION_15

    quality_offset = self.quality_offset
    if quality_offset == "Solexa":
        quality_offset = 64

    ar_options = dict(record["Options"]["AdapterRemoval"])
    # Setup of "--collapsed" is handled by the node itself
    collapse_reads = ar_options.pop("--collapse")
    collapse_reads = collapse_reads or collapse_reads is None

    init_args = {"output_prefix": os.path.join(self.folder, "reads"),
                 "output_format": record["Options"]["CompressionFormat"],
                 "quality_offset": quality_offset,
                 "version": version}
    output_tmpl = "{output_prefix}.%s.{output_format}".format(**init_args)

    if "SE" in record["Data"]:
        self.files["Single"] = output_tmpl % ("truncated",)
        init_args["input_files"] = record["Data"]["SE"]
        command = SE_AdapterRemovalNode.customize(**init_args)
    else:
        if version is VERSION_14:
            self._set_adapterrm_v14_files(self.files, output_tmpl)
        else:
            self._set_adapterrm_v15_files(self.files, output_tmpl,
                                          collapse_reads)

        init_args["collapse"] = collapse_reads
        init_args["input_files_1"] = record["Data"]["PE_1"]
        init_args["input_files_2"] = record["Data"]["PE_2"]
        command = PE_AdapterRemovalNode.customize(**init_args)

    apply_options(command.command, ar_options)

    self.stats = os.path.join(self.folder, "reads.settings")
    self.nodes = (command.build_node(),)
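# A hypothetical record with the shape _init_raw_reads expects; the keys
# mirror the lookups above and the values are illustrative only:
record = {
    "Options": {
        "AdapterRemoval": {"Version": "v1.5+", "--collapse": None},
        "CompressionFormat": "bz2",
    },
    "Data": {
        "PE_1": ["reads_R1.fastq.gz"],
        "PE_2": ["reads_R2.fastq.gz"],
    },
}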
def build_msa_nodes(options, settings, regions, filtering, dependencies):
    if settings["Program"].lower() != "mafft":
        raise RuntimeError("Only MAFFT support has been implemented!")

    sequencedir = os.path.join(options.destination, "alignments",
                               regions["Name"])

    # Run on full set of sequences
    sequences = regions["Sequences"][None]
    node = CollectSequencesNode(fasta_files=regions["Genotypes"],
                                destination=sequencedir,
                                sequences=sequences,
                                dependencies=dependencies)
    fasta_files = dict((filename, node) for filename in node.output_files)

    if settings["Enabled"]:
        fasta_files = {}
        algorithm = settings["MAFFT"]["Algorithm"]
        for sequence in sequences:
            input_file = os.path.join(sequencedir, sequence + ".fasta")
            output_file = os.path.join(sequencedir, sequence + ".afa")

            mafft = MAFFTNode.customize(input_file=input_file,
                                        output_file=output_file,
                                        algorithm=algorithm,
                                        dependencies=node)
            apply_options(mafft.command, settings["MAFFT"])

            fasta_files[output_file] = mafft.build_node()

        node = MetaNode(description="MAFFT",
                        subnodes=fasta_files.values(),
                        dependencies=node)

    if any(filtering.itervalues()):
        node = FilterSingletonsMetaNode(input_files=fasta_files,
                                        destination=sequencedir + ".filtered",
                                        filter_by=filtering,
                                        dependencies=node)

    return node
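# A hypothetical settings dict matching the keys build_msa_nodes reads.
# Everything under "MAFFT" is forwarded verbatim to the MAFFT command via
# apply_options, so flag-like keys must be valid MAFFT options
# ("--maxiterate" is one); "Algorithm" does not look like a flag and is
# consumed separately by MAFFTNode:
settings = {
    "Program": "MAFFT",
    "Enabled": True,
    "MAFFT": {
        "Algorithm": "G-INS-i",
        "--maxiterate": 1000,
    },
}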
def test_apply_options__single_option__user_pred__set_when_pred_is_true():
    mock = flexmock()
    mock.should_receive('set_option').with_args('FOO_BAR', 17).once()
    apply_options(mock, {"FOO_BAR": 17}, _user_pred)
def test_apply_options__single_option__user_pred__ignore_when_pred_is_false():
    mock = flexmock()
    apply_options(mock, {"BAR_FOO": 17}, _user_pred)
def test_apply_options__single_option__default_pred__set_when_pred_is_true():
    mock = flexmock()
    mock.should_receive('set_option').with_args('--foo', 17).once()
    apply_options(mock, {"--foo": 17})
def test_apply_options__single_option__default_pred__ignore_when_pred_is_false():
    mock = flexmock()
    apply_options(mock, {"Other": None})
def test_apply_options__single_option__boolean__set_when_value_is_true():
    mock = flexmock()
    mock.should_receive("set_option").with_args("-v")
    apply_options(mock, {"-v": True})
def test_apply_options__multiple_option():
    mock = flexmock()
    mock.should_receive('add_option').with_args('--foo', 3).once()
    mock.should_receive('add_option').with_args('--foo', 17).once()
    apply_options(mock, {"--foo": [3, 17]})
def _apply_aln_user_parameters(mkfile_params, params, aligners):
    for aligner_key in aligners:
        apply_options(params.commands[aligner_key], mkfile_params)
def test_apply_options__single_option__boolean__pop_when_value_is_false():
    mock = flexmock()
    mock.should_receive('pop_option').with_args('-v')
    apply_options(mock, {"-v": False})
def test_apply_options__single_option__boolean__set_when_value_is_none():
    mock = flexmock()
    mock.should_receive('set_option').with_args('-v')
    apply_options(mock, {"-v": None})
def test_apply_options__single_option__user_pred__ignore_when_pred_is_false():
    def _user_pred(key):
        return key.startswith("FOO")

    mock = flexmock()
    # The predicate must be passed for this test to exercise user predicates
    apply_options(mock, {"BAR_FOO": 17}, _user_pred)