def create_or_get_subtable(table, subtable_key, size):
    """Return the subtable stored under 'subtable_key' in 'table'.

    If no subtable exists yet, a new ReadGroup with the given Size is
    created, registered in 'table', and returned.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    fresh = ReadGroup()
    fresh.Size = size
    set_in(table, subtable_key, fresh)
    return fresh
def _read_raw_bam_stats(self, table):
    """Record raw (pre duplicate-filtering) hit counts in 'table'.

    For each (genome, target, sample, library) group in self._in_raw_bams,
    the total hit count is read from the associated coverage tables and
    stored under a per-genome "hits_raw(...)" key together with a
    human-readable description string.
    """
    # .items() instead of Py2-only .iteritems(); other versions of this
    # method in the file (and Python 3) already use .items().
    for ((genome, target, sample, library), filenames) in self._in_raw_bams.items():
        key = (target, sample, library)
        hits, _ = self._read_coverage_tables(key, filenames)

        value = (hits, "# Total number of hits (prior to PCR duplicate filtering)")
        set_in(table, (target, sample, library, genome, "hits_raw(%s)" % genome), value)
def create_or_get_subtable(table, subtable_key, size):
    """Fetch the subtable at 'subtable_key', lazily creating it.

    On first access a fresh copy of READGROUP_TEMPLATE is made, its "Size"
    field set to 'size', and the copy stored in 'table' before returning.
    """
    cached = get_in(table, subtable_key)
    if cached is not None:
        return cached

    fresh = dict(READGROUP_TEMPLATE)
    fresh["Size"] = size
    set_in(table, subtable_key, fresh)
    return fresh
def _read_tables(self, prefixes, genomes):
    """Build the full statistics table and add merged summary rows.

    Collects read settings plus raw/unique BAM statistics into a nested
    {target: {sample: {library: subtables}}} dict, then adds "*" rows
    merging all libraries per sample and all samples per target.
    Returns the completed table.
    """
    table = {}
    self._read_reads_settings(table)
    self._read_raw_bam_stats(table)
    self._read_lib_bam_stats(table)

    for (target, samples) in table.items():
        merged_samples = {}
        for (sample, libraries) in samples.items():
            merged_libraries = {}
            for (library, subtables) in libraries.items():
                # Fold each named subtable into both the per-sample and
                # per-target accumulators before annotating it.
                for (tblname, subtable) in subtables.items():
                    merged_libraries[tblname] = self._merge_tables(
                        (merged_libraries.get(tblname, {}), subtable)
                    )
                    merged_samples[tblname] = self._merge_tables(
                        (merged_samples.get(tblname, {}), subtable)
                    )
                libraries[library] = self._annotate_subtables(subtables, genomes)
            # "*" library row: all libraries of this sample merged.
            set_in(
                table,
                (target, sample, "*"),
                self._annotate_subtables(merged_libraries, genomes),
            )
        # "*"/"*" row: all samples of this target merged.
        set_in(
            table,
            (target, "*", "*"),
            self._annotate_subtables(merged_samples, genomes),
        )
    return table
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Insert one GTF feature into 'table'.

    The record is keyed by (gene type, gene id, transcript id, exon number,
    feature); scaffold-placed contigs are remapped to chromosome
    coordinates. An AssertionError is raised on duplicate keys.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()

    gene_type = properties.get("gene_biotype")
    if gene_type is None:
        gene_type = properties.get("gene_type", "unknown_genetype")

    keys = (
        gene_type,
        properties["gene_id"],
        properties["transcript_id"],
        int(properties["exon_number"]),
        gtf.feature,
    )

    record = {
        "contig": contig_prefix + gtf.contig,
        "start": gtf.start,
        # In pysam, 'end' equals the past-the-end position
        "end": gtf.end - 1,
        "strand": gtf.strand,
        "feature": gtf.feature,
        "transcript": properties["transcript_id"],
    }

    if record["contig"] in scaffolds:
        scaffold = scaffolds[record["contig"]]
        offset = int(scaffold["chromStart"])
        record["contig"] = scaffold["chrom"]
        record["start"] += offset
        record["end"] += offset

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Add a record for one GTF feature to 'table'.

    Records are keyed (gene_type, gene_id, transcript_id, exon_number,
    feature); duplicates trigger an AssertionError. Features located on
    known scaffolds are translated into chromosome coordinates.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    properties = gtf.asDict()
    gene_type = properties.get("gene_biotype")
    if gene_type is None:
        # Fall back to the alternative attribute name, then to a placeholder.
        gene_type = properties.get("gene_type", "unknown_genetype")
    keys = (gene_type,
            properties["gene_id"],
            properties["transcript_id"],
            int(properties["exon_number"]),
            gtf.feature)

    record = {
        "contig": contig_prefix + gtf.contig,
        "start": gtf.start,
        # In pysam, 'end' equals the past-the-end position
        "end": gtf.end - 1,
        "strand": gtf.strand,
        "feature": gtf.feature,
        "transcript": properties["transcript_id"]
    }

    if record["contig"] in scaffolds:
        # Remap scaffold coordinates onto the placed chromosome.
        # NOTE(review): assumes UCSC-style 'chrom'/'chromStart' keys — confirm.
        contig = scaffolds[record["contig"]]
        record["contig"] = contig["chrom"]
        record["start"] += int(contig["chromStart"])
        record["end"] += int(contig["chromStart"])

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
def test_set_in__three_kws_in_partial_dictionary():
    """set_in must merge new keys into partially-populated nested dicts."""
    partial = {"Foo": {12: 0}}
    utils.set_in(partial, ["Foo", 13, (1, 2)], 17)
    expected = {"Foo": {12: 0, 13: {(1, 2): 17}}}
    assert_equal(partial, expected)

    partial = {"Foo": {13: {"Bar": None}}}
    utils.set_in(partial, ["Foo", 13, (1, 2)], 17)
    expected = {"Foo": {13: {(1, 2): 17, "Bar": None}}}
    assert_equal(partial, expected)
def _read_lib_bam_stats(self, table):
    """Record duplicate-filtered hit and nucleotide counts in 'table'.

    For each (genome, target, sample, library) group in self._in_lib_bams,
    reads the coverage tables and stores "hits_unique(...)" (with a
    description) and "hits_unique_nts(...)" (with None) per genome.
    """
    # .items() instead of Py2-only .iteritems(); matches the Python 3
    # variants of this method elsewhere in the file.
    for ((genome, target, sample, library), filenames) in self._in_lib_bams.items():
        key = (target, sample, library)
        hits, nts = self._read_coverage_tables(key, filenames)

        value = (hits, "# Total number of hits (excluding any PCR duplicates)")
        set_in(table, (target, sample, library, genome, "hits_unique(%s)" % genome), value)
        set_in(table, (target, sample, library, genome, "hits_unique_nts(%s)" % genome), (nts, None))
def _read_raw_bam_stats(self, table):
    """Store per-genome raw hit counts collected from the input BAM files."""
    for (group, filenames) in self._in_raw_bams.items():
        (genome, target, sample, library) = group
        hits, _ = self._read_coverage_tables((target, sample, library), filenames)

        label = "hits_raw(%s)" % genome
        set_in(table, (target, sample, library, genome, label), hits)
def add_connection(self, node_id_a, node_id_b, blength=None):
    """Register a bidirectional edge between two nodes.

    'blength' is an optional branch length; it must be non-negative, and
    a single tree may not mix branches with and without lengths.
    Raises GraphError on violation of either rule.
    """
    has_length = blength is not None
    if has_length and float(blength) < 0:
        raise GraphError("Branch-lengths must be non-negative")
    elif has_length != self.has_branch_lengths:
        # First branch decides whether this tree carries lengths at all.
        if self.has_branch_lengths is not None:
            raise GraphError(
                "Tree contains branches with and without lengths")
        self.has_branch_lengths = has_length

    # Store the edge in both directions for symmetric lookup.
    set_in(self.connections, (node_id_a, node_id_b), blength)
    set_in(self.connections, (node_id_b, node_id_a), blength)
def _read_reads_settings(self, table):
    """Collect per-barcode read statistics and merge them per library.

    Stores per-barcode stats under (target, sample, library, "reads",
    barcode), then collapses each library's "reads" subtables into one
    merged table. Returns 'table'.
    """
    # .items()/.values() instead of Py2-only .iteritems(); list() keeps the
    # Py2 behaviour of passing a concrete sequence to _merge_tables.
    for ((sample, library, barcode), (filetype, filename)) in self._in_raw_read.items():
        key = (self._target, sample, library, "reads", barcode)
        set_in(table, key, self._stat_read_settings(filetype, filename))

    for (_, samples) in table.items():
        for (sample, libraries) in samples.items():
            for (library, prefixes) in libraries.items():
                prefixes["reads"] = self._merge_tables(list(prefixes["reads"].values()))

    return table
def _collect_clade_from(self, cache, p_node, c_node):
    """Return the set of leaves reachable from 'c_node' entering via 'p_node'.

    Results are memoized in 'cache' keyed on the directed edge
    (p_node, c_node); cached values are stored as frozensets.
    """
    c_clade = get_in(cache, (p_node, c_node), set())
    if not c_clade:
        # Not cached yet (cached clades are non-empty frozensets).
        if self.is_leaf(c_node):
            c_clade.add(c_node)

        # Recurse into every neighbour except the node we arrived from.
        for n_node in self.connections[c_node]:
            if n_node != p_node:
                c_clade.update(
                    self._collect_clade_from(cache, c_node, n_node))
        # Cache an immutable copy for future lookups of this edge.
        set_in(cache, (p_node, c_node), frozenset(c_clade))
    return c_clade
def _read_raw_bam_stats(self, table):
    """Store raw (pre duplicate-filtering) hit counts per genome in 'table'.

    Each entry is a (count, description) tuple under a
    "hits_raw(<genome>)" key.
    """
    # Py2-only .iteritems() replaced by .items(), matching the Python 3
    # variants of this method in the file.
    for ((genome, target, sample, library), filenames) in self._in_raw_bams.items():
        key = (target, sample, library)
        hits, _ = self._read_coverage_tables(key, filenames)

        value = (
            hits, "# Total number of hits (prior to PCR duplicate filtering)")
        set_in(table, (target, sample, library, genome, "hits_raw(%s)" % genome), value)
def _read_reads_settings(self, table):
    """Gather per-barcode read settings and merge them per library.

    Per-barcode statistics go under (target, sample, library, "reads",
    barcode); afterwards each library's "reads" entries are merged into a
    single table. Returns 'table'.
    """
    # .items()/.values() replace the Py2-only .iteritems(); list() preserves
    # the concrete-sequence argument _merge_tables received under Python 2.
    for ((sample, library, barcode), (filetype, filename)) in self._in_raw_read.items():
        key = (self._target, sample, library, "reads", barcode)
        set_in(table, key, self._stat_read_settings(filetype, filename))

    for (_, samples) in table.items():
        for (sample, libraries) in samples.items():
            for (library, prefixes) in libraries.items():
                prefixes["reads"] = self._merge_tables(
                    list(prefixes["reads"].values()))

    return table
def _read_lib_bam_stats(self, table):
    """Store unique-hit and nucleotide counts per genome in 'table'.

    For each library BAM group, stores "hits_unique(<genome>)" with a
    description string and "hits_unique_nts(<genome>)" with None.
    """
    # Py2-only .iteritems() replaced by .items() for Python 3 compatibility.
    for ((genome, target, sample, library), filenames) in self._in_lib_bams.items():
        key = (target, sample, library)
        hits, nts = self._read_coverage_tables(key, filenames)

        value = (hits, "# Total number of hits (excluding any PCR duplicates)")
        set_in(
            table,
            (target, sample, library, genome, "hits_unique(%s)" % genome),
            value)
        set_in(table, (target, sample, library, genome, "hits_unique_nts(%s)" % genome),
               (nts, None))
def _read_lib_bam_stats(self, table):
    """Store per-genome unique-hit and nucleotide counts from library BAMs."""
    for (group, filenames) in self._in_lib_bams.items():
        (genome, target, sample, library) = group
        hits, nts = self._read_coverage_tables((target, sample, library), filenames)

        prefix = (target, sample, library, genome)
        set_in(table, prefix + ("hits_unique(%s)" % genome,), hits)
        set_in(table, prefix + ("hits_unique_nts(%s)" % genome,), nts)
def read_table(table, filename):
    """Merge one padded statistics file into the nested 'table'.

    Rows are keyed (Name, Sample, Library, Contig); rows containing "*"
    are summary rows and skipped. Counts are accumulated into a
    READGROUP_TEMPLATE-shaped dict per key; Size must agree across files.
    """
    with open(filename) as handle:
        for record in parse_padded_table(handle):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])
            # 'field' (not 'key') avoids shadowing the row key above.
            for field in READGROUP_TEMPLATE:
                if field != "Size":
                    subtable[field] += int(record.get(field, 0))
def read_table(table, filename):
    """Accumulate one padded statistics file into 'table'.

    Rows are keyed (Name, Sample, Library, Contig), with "*" summary rows
    skipped; per-key counters live in ReadGroup instances whose Size must
    be consistent across input files.
    """
    with open(filename) as handle:
        for record in parse_padded_table(handle):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = ReadGroup()
                subtable.Size = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable.Size) == int(record["Size"])
            # 'field' (not 'key') avoids shadowing the row key above.
            for field in ReadGroup.__slots__:
                if field != "Size":
                    subtable[field] += int(record.get(field, 0))
def write_records(self, records):
    """Route records to PE or SE sinks, pairing mates by query name.

    Records are grouped by (qname, read number); complete 1/2 pairs go to
    the paired-end sinks, anything left over to the single-end sink.
    """
    record_cache = {}
    for record in records:
        # 0 = neither read1 nor read2 flag set.
        num = 0
        if record.is_read1:
            num = 1
        elif record.is_read2:
            num = 2

        set_in(record_cache, (record.qname, num), record)

    # .values() replaces the Py2-only .itervalues() (twice below).
    for pair in record_cache.values():
        # Only write complete pairs
        if (1 in pair) and (2 in pair):
            self._sink_pe_1.write_records([pair.pop(1)])
            self._sink_pe_2.write_records([pair.pop(2)])

        # Any orphan files are written to the SE sink
        for record in list(pair.values()):
            self._sink_se.write_records([record])
def _read_tables(self, prefixes, genomes):
    """Assemble the statistics table and append merged "*" summary rows.

    Gathers read settings plus raw/unique BAM stats into
    {target: {sample: {library: subtables}}}, then adds a per-sample
    "*" row (libraries merged) and a per-target "*"/"*" row (samples
    merged). Returns the table.
    """
    table = {}
    self._read_reads_settings(table)
    self._read_raw_bam_stats(table)
    self._read_lib_bam_stats(table)

    for (target, samples) in table.items():
        merged_samples = {}
        for (sample, libraries) in samples.items():
            merged_libraries = {}
            for (library, subtables) in libraries.items():
                # Merge each named subtable into both accumulators.
                for (tblname, subtable) in subtables.items():
                    merged_libraries[tblname] = self._merge_tables((merged_libraries.get(tblname, {}), subtable))
                    merged_samples[tblname] = self._merge_tables((merged_samples.get(tblname, {}), subtable))
                libraries[library] = self._annotate_subtables(subtables, genomes)
            # Per-sample summary across all of its libraries.
            set_in(table, (target, sample, "*"), self._annotate_subtables(merged_libraries, genomes))
        # Per-target summary across all of its samples.
        set_in(table, (target, "*", "*"), self._annotate_subtables(merged_samples, genomes))
    return table
def read_table(table, filename):
    """Read a padded statistics file and accumulate counts into 'table'.

    Each row is keyed (Name, Sample, Library, Contig); rows containing
    "*" are pre-computed summaries and are skipped. Counters accumulate
    into a ReadGroup per key; the Size column must agree across files.
    """
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (
                record["Name"],
                record["Sample"],
                record["Library"],
                record["Contig"],
            )
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                subtable = ReadGroup()
                subtable.Size = int(record["Size"])
                set_in(table, key, subtable)

            # Contig sizes must be identical in every merged file.
            assert int(subtable.Size) == int(record["Size"])
            # NOTE: this loop variable shadows the row key above.
            for key in ReadGroup.__slots__:
                if key != "Size":
                    # Missing columns count as zero.
                    subtable[key] += int(record.get(key, 0))
def convert_reads(config, destination, record, sink_cache):
    """Convert one makefile record's BAM files into read sinks.

    Reads are grouped by the source lane name (PU_src tag) and routed to
    paired-end or single-end sinks, filtered by the configured minimum
    quality and length.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        # Ensure a single-end sink exists in the cache; return its key.
        key = (name, reads_type)
        if not get_in(sink_cache, key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, key, ReadSink.open(config.destination, filename))
        return key

    # Hoisted out of the per-file loop: thresholds are loop-invariant, and
    # the original redefined this predicate for every BAM file.
    min_quality = config.min_quality
    min_length = config.min_length

    def _keep_record(rec):
        # 'rec' (not 'record') avoids shadowing the outer argument.
        return (rec.qual >= min_quality) and (len(rec.seq) >= min_length)

    # .items() replaces the Py2-only .iteritems().
    for (reads_type, bam_files) in record.bams.items():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key, PEReadSink.open(config.destination, destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                sink.write_records(rec for rec in handle if _keep_record(rec))
def calculate_totals(table):
    """Append totals rows to 'table' and return it.

    First validates that every contig has a consistent Size across all
    groups (raising BAMStatsError otherwise), then adds per-library,
    per-sample ("*") and per-target ("*"/"*") totals via
    _calculate_totals_in.
    """
    # Pass 1: collect contig lengths, checking for conflicts.
    lengths = {}
    for samples in table.values():
        for libraries in samples.values():
            for contigs in libraries.values():
                for (name, contig) in contigs.items():
                    size = lengths.get(name)
                    if (size is not None) and (size != contig.Size):
                        raise BAMStatsError(name)
                    lengths[name] = contig.Size

    # Pass 2: add summary rows. Iterating sorted snapshots means the
    # "*" rows added below do not feed back into these loops.
    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                totals = _calculate_totals_in(contigs, lengths)
                set_in(table, (name, sample, library), totals)

            totals = _calculate_totals_in(libraries, lengths)
            set_in(table, (name, sample, "*"), totals)

        # NOTE(review): grand total is computed over the whole 'table',
        # including rows added above — confirm this is intended.
        set_in(table, (name, "*", "*"), _calculate_totals_in(table, lengths))
    return table
def calculate_totals(table):
    """Append totals rows to 'table' and return it.

    Verifies that every contig's "Size" agrees across all groups (raising
    BAMStatsError on conflict), then adds per-library, per-sample ("*")
    and per-target ("*"/"*") totals via _calculate_totals_in.
    """
    # Py2-only .itervalues()/.iteritems() replaced with .values()/.items(),
    # matching the other version of this function in the file.
    lengths = {}
    for samples in table.values():
        for libraries in samples.values():
            for contigs in libraries.values():
                for (name, contig) in contigs.items():
                    size = lengths.get(name)
                    if (size is not None) and (size != contig["Size"]):
                        raise BAMStatsError(name)
                    lengths[name] = contig["Size"]

    # Iterate sorted snapshots so the "*" rows added below do not feed
    # back into the loops.
    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                totals = _calculate_totals_in(contigs, lengths)
                set_in(table, (name, sample, library), totals)

            totals = _calculate_totals_in(libraries, lengths)
            set_in(table, (name, sample, "*"), totals)

        set_in(table, (name, "*", "*"), _calculate_totals_in(table, lengths))
    return table
def test_set_in__fail_on_invalid_sub_dictionary_third_level():
    """set_in must raise TypeError when a third-level value is not a dict.

    Wrapped in pytest.raises for consistency with the other failure tests
    in this file; without it the expected exception would fail the test.
    """
    with pytest.raises(TypeError):
        utils.set_in({1: {2: None}}, [1, 2, 3], 17)
def test_set_in__fail_on_invalid_sub_dictionary_second_level():
    """set_in must raise TypeError when a second-level value is not a dict.

    Uses pytest.raises like the sibling failure tests; a bare call would
    let the expected exception escape and fail the test.
    """
    with pytest.raises(TypeError):
        utils.set_in({1: None}, [1, 2], 17)
def test_set_in__fail_on_invalid_sub_dictionary_first_level():
    """set_in must raise TypeError when the root container is not a dict.

    Uses pytest.raises like the sibling failure tests; a bare call would
    let the expected exception escape and fail the test.
    """
    with pytest.raises(TypeError):
        utils.set_in(None, [1], 17)
def test_set_in__fail_on_no_kws():
    """set_in must raise ValueError when given an empty key sequence.

    Uses pytest.raises like the sibling failure tests; a bare call would
    let the expected exception escape and fail the test.
    """
    with pytest.raises(ValueError):
        utils.set_in({}, [], 17)
def test_set_in__update_value_two_kw():
    """set_in overwrites an existing value reached through two keys."""
    container = {1: {2: 3}}
    utils.set_in(container, [1, 2], 365)
    expected = {1: {2: 365}}
    assert_equal(container, expected)
def _open_se_sink(reads_type):
    """Ensure a single-end ReadSink for 'reads_type' exists in the cache.

    Returns the cache key; a sink is created and registered on first use.
    (Relies on name/destination/sink_cache/config from the enclosing scope.)
    """
    key = (name, reads_type)
    if get_in(sink_cache, key):
        return key

    filename = ReadSink.get_filename(destination, reads_type.lower())
    set_in(sink_cache, key, ReadSink.open(config.destination, filename))
    return key
def test_set_in__fail_on_invalid_sub_dictionary_first_level():
    """set_in cannot descend into a non-dict root; expect TypeError."""
    with pytest.raises(TypeError):
        utils.set_in(None, [1], 17)
def test_set_in__three_kws_in_empty_dictionary():
    """Three keys create two levels of nested dictionaries from scratch."""
    result = {}
    utils.set_in(result, ["Foo", 13, (1, 2)], 17)
    expected = {"Foo": {13: {(1, 2): 17}}}
    assert_equal(result, expected)
def test_set_in__two_kws_in_empty_dictionary():
    """Two keys create one level of nesting from scratch."""
    result = {}
    utils.set_in(result, ["Foo", 13], 17)
    expected = {"Foo": {13: 17}}
    assert_equal(result, expected)
def test_set_in__fail_on_no_kws():
    """An empty key sequence is invalid; expect ValueError."""
    with pytest.raises(ValueError):
        utils.set_in({}, [], 17)
def test_set_in__iteratable_keywords():
    """Keys may be supplied as any iterable, not only a list."""
    result = {}
    keys = iter(["Foo", 13, (1, 2)])
    utils.set_in(result, keys, 17)
    expected = {"Foo": {13: {(1, 2): 17}}}
    assert_equal(result, expected)
def test_set_in__fail_on_invalid_sub_dictionary_second_level():
    """A non-dict value at the second level cannot be descended; TypeError."""
    with pytest.raises(TypeError):
        utils.set_in({1: None}, [1, 2], 17)
def test_set_in__fail_on_invalid_sub_dictionary_third_level():
    """A non-dict value at the third level cannot be descended; TypeError."""
    with pytest.raises(TypeError):
        utils.set_in({1: {2: None}}, [1, 2, 3], 17)
def test_set_in__single_kw_in_empty_dictionary():
    """A single key behaves like plain dictionary assignment."""
    result = {}
    utils.set_in(result, ["Foo"], 17)
    expected = {"Foo": 17}
    assert_equal(result, expected)
def test_set_in__update_value_one_kw():
    """An existing top-level value is replaced in place."""
    result = {1: None}
    utils.set_in(result, [1], 3.14)
    expected = {1: 3.14}
    assert_equal(result, expected)