def _initialize_tables(cls, target_name, intervals, readgroups):
    """Build zeroed coverage tables and a read-group lookup.

    A template with one entry per key of `intervals` is created; each entry
    holds zeroed counters plus "Size", the sum of (end - start) over the
    intervals listed for that key. Every distinct (target_name, SM, LB)
    combination found in `readgroups` receives its own deep copy of the
    template; read groups sharing sample and library share one copy.

    Returns a tuple (tables, mapping), where `tables` is the nested
    dictionary and `mapping` maps each read-group "ID" to its subtable.
    """
    template = {}
    for (name, regions) in intervals.iteritems():
        template[name] = {
            "SE": 0,
            "PE_1": 0,
            "PE_2": 0,
            "Collapsed": 0,
            "Hits": 0,
            "M": 0,
            "I": 0,
            "D": 0,
            "Size": sum((end - start) for (_, start, end) in regions),
        }

    tables = {}
    mapping = {}
    for readgroup in readgroups.itervalues():
        path = (target_name, readgroup["SM"], readgroup["LB"])
        subtable = get_in(tables, path, None)
        if not subtable:
            # First read group seen for this sample / library
            subtable = copy.deepcopy(template)

        set_in(tables, path, subtable)
        mapping[readgroup["ID"]] = subtable

    return tables, mapping
def _open_se_sink(reads_type):
    """Ensure a ReadSink is cached for `reads_type` and return its key.

    Uses `name`, `destination`, `config` and `sink_cache` from the
    surrounding scope; the cache key is (name, reads_type).
    """
    sink_key = (name, reads_type)
    if get_in(sink_cache, sink_key):
        return sink_key

    filename = ReadSink.get_filename(destination, reads_type.lower())
    sink = ReadSink.open(config.destination, filename)
    set_in(sink_cache, sink_key, sink)
    return sink_key
def _read_raw_bam_stats(self, table):
    """Record the raw hit count of every entry in self._in_raw_bams.

    For each (genome, target, sample, library) key, the first value
    returned by self._read_coverage_tables is stored, together with a
    descriptive comment, in `table` under
    (target, sample, library, genome, "hits_raw(<genome>)").
    """
    for (bam_key, filenames) in self._in_raw_bams.iteritems():
        genome, target, sample, library = bam_key

        # Only the hit count is used here; the second value is ignored
        hits, _ = self._read_coverage_tables((target, sample, library),
                                             filenames)

        value = (hits, "# Total number of hits (prior to PCR duplicate filtering)")
        path = (target, sample, library, genome, "hits_raw(%s)" % genome)
        set_in(table, path, value)
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Store a single GTF entry in the nested `table`.

    The entry is keyed by (gene_biotype, gene_id, transcript_id,
    exon_number, feature). If the prefixed contig name is found in
    `scaffolds`, the contig is replaced by the scaffold's "chrom" and both
    coordinates are shifted by its "chromStart". An assertion fails if an
    entry is already stored under the same key.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    attributes = gtf.asDict()

    table_key = (attributes["gene_biotype"],
                 attributes["gene_id"],
                 attributes["transcript_id"],
                 int(attributes["exon_number"]),
                 gtf.feature)

    entry = dict(contig=contig_prefix + gtf.contig,
                 start=gtf.start,
                 # In pysam, 'end' equals the past-the-end position
                 end=gtf.end - 1,
                 strand=gtf.strand,
                 feature=gtf.feature,
                 transcript=attributes["transcript_id"])

    if entry["contig"] in scaffolds:
        # Translate scaffold-relative coordinates
        scaffold = scaffolds[entry["contig"]]
        entry["contig"] = scaffold["chrom"]
        offset = int(scaffold["chromStart"])
        entry["start"] += offset
        entry["end"] += offset

    assert not get_in(table, table_key), table_key
    set_in(table, table_key, entry)
def create_or_get_subtable(table, subtable_key, size):
    """Return the subtable stored at `subtable_key`, creating it if absent.

    A new subtable is a shallow copy of READGROUP_TEMPLATE with "Size" set
    to `size`; it is inserted into `table` before being returned. An
    existing subtable is returned unchanged (`size` is then not used).
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    created = dict(READGROUP_TEMPLATE)
    created["Size"] = size
    set_in(table, subtable_key, created)
    return created
def _read_lib_bam_stats(self, table):
    """Record hit and nucleotide counts for entries in self._in_lib_bams.

    For each (genome, target, sample, library) key, the two values
    returned by self._read_coverage_tables are stored in `table` under
    (target, sample, library, genome, ...) as "hits_unique(<genome>)"
    (with a descriptive comment) and "hits_unique_nts(<genome>)" (with
    None in place of a comment).
    """
    for (bam_key, filenames) in self._in_lib_bams.iteritems():
        genome, target, sample, library = bam_key
        prefix = (target, sample, library, genome)

        hits, nts = self._read_coverage_tables((target, sample, library),
                                               filenames)

        value = (hits, "# Total number of hits (excluding any PCR duplicates)")
        set_in(table, prefix + ("hits_unique(%s)" % genome,), value)
        set_in(table, prefix + ("hits_unique_nts(%s)" % genome,), (nts, None))
def test_set_in__three_kws_in_partial_dictionary():
    """Check utils.set_in with three keywords on partially filled dicts."""
    # The second-level key (13) does not exist yet
    missing_second_level = {"Foo": {12: 0}}
    utils.set_in(missing_second_level, ["Foo", 13, (1, 2)], 17)
    assert_equal(missing_second_level,
                 {"Foo": {12: 0, 13: {(1, 2): 17}}})

    # The second-level key exists; only the final key is missing
    missing_third_level = {"Foo": {13: {"Bar": None}}}
    utils.set_in(missing_third_level, ["Foo", 13, (1, 2)], 17)
    assert_equal(missing_third_level,
                 {"Foo": {13: {(1, 2): 17, "Bar": None}}})
def test_set_in__three_kws_in_partial_dictionary():
    """Three-keyword utils.set_in calls must extend existing sub-dicts."""
    expected_new_branch = {"Foo": {12: 0, 13: {(1, 2): 17}}}
    expected_extended_leaf = {"Foo": {13: {(1, 2): 17, "Bar": None}}}

    # Case 1: the intermediate dictionary at key 13 has to be created
    first = {"Foo": {12: 0}}
    utils.set_in(first, ["Foo", 13, (1, 2)], 17)
    assert_equal(first, expected_new_branch)

    # Case 2: the intermediate dictionary exists and gains a new key
    second = {"Foo": {13: {"Bar": None}}}
    utils.set_in(second, ["Foo", 13, (1, 2)], 17)
    assert_equal(second, expected_extended_leaf)
def add_connection(self, node_id_a, node_id_b, blength=None):
    """Register a connection between two nodes in both directions.

    `blength` is stored in self.connections under (node_id_a, node_id_b)
    and (node_id_b, node_id_a). self.has_branch_lengths records whether
    branches carry lengths; it is assigned when it is still None.

    Raises GraphError if `blength` is negative, or if the presence of a
    length disagrees with an already assigned self.has_branch_lengths.
    """
    has_length = blength is not None
    if has_length and float(blength) < 0:
        raise GraphError("Branch-lengths must be non-negative")

    if has_length != self.has_branch_lengths:
        if self.has_branch_lengths is not None:
            raise GraphError("Tree contains branches with and without lengths")
        self.has_branch_lengths = has_length

    for edge in ((node_id_a, node_id_b), (node_id_b, node_id_a)):
        set_in(self.connections, edge, blength)
def _read_reads_settings(self, table):
    """Load per-barcode read settings and merge them per library.

    Each file in self._in_raw_read is parsed with
    self._stat_read_settings and stored in `table` under
    (self._target, sample, library, "reads", barcode). Afterwards the
    "reads" entry at the third nesting level of `table` is replaced by the
    result of self._merge_tables over its values. Returns `table`.
    """
    for (read_key, filename) in self._in_raw_read.iteritems():
        sample, library, barcode = read_key
        path = (self._target, sample, library, "reads", barcode)
        set_in(table, path, self._stat_read_settings(filename))

    # Collapse the per-barcode tables into a single table per library
    for samples in table.itervalues():
        for libraries in samples.itervalues():
            for prefixes in libraries.itervalues():
                per_barcode = prefixes["reads"].values()
                prefixes["reads"] = self._merge_tables(per_barcode)

    return table
def add_connection(self, node_id_a, node_id_b, blength=None):
    """Connect `node_id_a` and `node_id_b`, optionally with a length.

    The connection is written to self.connections for both orderings of
    the node pair. The first connection added while
    self.has_branch_lengths is None decides whether branches carry
    lengths.

    Raises GraphError for a negative `blength`, and when a call mixes
    branches with and without lengths.
    """
    with_length = blength is not None

    if with_length and float(blength) < 0:
        raise GraphError("Branch-lengths must be non-negative")

    if with_length != self.has_branch_lengths:
        # Either nothing has been decided yet (None), or this is a mismatch
        if self.has_branch_lengths is not None:
            raise GraphError(
                "Tree contains branches with and without lengths")
        self.has_branch_lengths = with_length

    forward = (node_id_a, node_id_b)
    backward = (node_id_b, node_id_a)
    set_in(self.connections, forward, blength)
    set_in(self.connections, backward, blength)
def _collect_clade_from(self, cache, p_node, c_node):
    """Collect the leaves reachable from `c_node` without passing `p_node`.

    Results are memoized in `cache` under (p_node, c_node) as a
    frozenset. A cached (non-empty) entry is returned as stored; a freshly
    computed clade is returned as the mutable set it was built in.
    """
    edge = (p_node, c_node)
    clade = get_in(cache, edge, set())
    if clade:
        return clade

    if self.is_leaf(c_node):
        clade.add(c_node)

    for neighbour in self.connections[c_node]:
        # Never walk back along the edge we arrived by
        if neighbour != p_node:
            clade.update(self._collect_clade_from(cache, c_node, neighbour))

    set_in(cache, edge, frozenset(clade))
    return clade
def _collect_clade_from(self, cache, p_node, c_node):
    """Return the leaves found when walking from `p_node` into `c_node`.

    The walk recurses into every node connected to `c_node` other than
    `p_node`. The result is stored in `cache` as a frozenset keyed by
    (p_node, c_node); later calls return that cached value, whereas the
    computing call returns the mutable set it built.
    """
    cache_key = (p_node, c_node)
    leaves = get_in(cache, cache_key, set())
    if leaves:
        return leaves

    if self.is_leaf(c_node):
        leaves.add(c_node)

    for next_node in self.connections[c_node]:
        if next_node != p_node:
            sub_clade = self._collect_clade_from(cache, c_node, next_node)
            leaves.update(sub_clade)

    set_in(cache, cache_key, frozenset(leaves))
    return leaves
def _calculate_totals(table):
    """Fill `table` in place with totals at library, sample and name level.

    The table is walked as name -> sample -> library -> contigs. Each
    level is replaced by, or gains a "*" entry holding, the result of
    _calculate_totals_in, and a "Size" value is copied upwards into the
    "*" entries.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-mangled source; confirm against the original layout.
    """
    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                # Replace the per-library contigs by their totals table
                set_in(table, (name, sample, library),
                       _calculate_totals_in(contigs))
            set_in(table, (name, sample, "*"),
                   _calculate_totals_in(libraries))
            # 'library' is the last value left over from the loop above;
            # presumably every library shares the same total size.
            set_in(table, (name, sample, "*", "*", "Size"),
                   get_in(table, (name, sample, library, "*", "Size")))
        set_in(table, (name, "*", "*"), _calculate_totals_in(table))
        # 'sample' is likewise the last value from the sample loop
        set_in(table, (name, "*", "*", "*", "Size"),
               get_in(table, (name, sample, "*", "*", "Size")))
def _read_raw_bam_stats(self, table):
    """Sum the "Hits" of every contig for each entry in self._in_raw_bams.

    The coverage tables of all files belonging to a
    (genome, target, sample, library) key are read into a scratch table
    with read_coverage_table; the summed hits are stored in `table` under
    (target, sample, library, genome, "hits_raw(<genome>)") along with a
    descriptive comment.
    """
    for (bam_key, filenames) in self._in_raw_bams.iteritems():
        genome, target, sample, library = bam_key

        coverage = {}
        for filename in filenames:
            read_coverage_table(coverage, filename)

        contigs = get_in(coverage, (target, sample, library))
        hits = sum(contig["Hits"] for contig in contigs.itervalues())

        value = (hits, "# Total number of hits (prior to PCR duplicate filtering)")
        path = (target, sample, library, genome, "hits_raw(%s)" % genome)
        set_in(table, path, value)
def _read_lib_bam_stats(self, table):
    """Sum "Hits" and "M" over all contigs for entries in self._in_lib_bams.

    The coverage tables of all files belonging to a
    (genome, target, sample, library) key are read into a scratch table
    with read_coverage_table. The totals are stored in `table` under
    (target, sample, library, genome, ...) as "hits_unique(<genome>)"
    (with a comment) and "hits_unique_nts(<genome>)" (without one).
    """
    for (bam_key, filenames) in self._in_lib_bams.iteritems():
        genome, target, sample, library = bam_key

        coverage = {}
        for filename in filenames:
            read_coverage_table(coverage, filename)

        contigs = list(get_in(coverage, (target, sample, library)).itervalues())
        hits = sum(contig["Hits"] for contig in contigs)
        nts = sum(contig["M"] for contig in contigs)

        prefix = (target, sample, library, genome)
        value = (hits, "# Total number of hits (excluding any PCR duplicates)")
        set_in(table, prefix + ("hits_unique(%s)" % genome,), value)
        set_in(table, prefix + ("hits_unique_nts(%s)" % genome,), (nts, None))
def read_table(table, filename):
    """Accumulate the counters of a padded table file into `table`.

    Rows are keyed by (Name, Sample, Library, Contig); rows whose key
    contains "*" are skipped. Counters missing from a row count as 0 and
    are added to any values already stored for that key. An assertion
    fails if a row's "Size" differs from the stored size.
    """
    counters = ("Hits", "SE", "PE_1", "PE_2", "Collapsed", "M", "I", "D")

    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            size = int(record["Size"])
            subtable = get_in(table, key, {"Size": size})
            assert int(subtable["Size"]) == size

            for field in counters:
                previous = subtable.get(field, 0)
                subtable[field] = previous + int(record.get(field, 0))

            set_in(table, key, subtable)
def read_table(table, filename):
    """Accumulate the counters of a padded table file into `table`.

    Rows are keyed by (Name, Sample, Library, Contig); rows whose key
    contains "*" are skipped. A subtable is created from
    READGROUP_TEMPLATE the first time a key is seen. Every template field
    except "Size" is incremented by the row's value (0 if absent). An
    assertion fails if a row's "Size" differs from the stored size.
    """
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            table_key = (record["Name"], record["Sample"],
                         record["Library"], record["Contig"])
            if "*" in table_key:
                continue

            subtable = get_in(table, table_key)
            if subtable is None:
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, table_key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])

            for field in READGROUP_TEMPLATE:
                if field != "Size":
                    subtable[field] += int(record.get(field, 0))
def _read_raw_bam_stats(self, table):
    """Store the total number of raw hits per genome / library.

    For every (genome, target, sample, library) key in
    self._in_raw_bams, the listed coverage files are loaded with
    read_coverage_table and the "Hits" of all contigs are added up. The
    total and a descriptive comment are written to `table` under
    (target, sample, library, genome, "hits_raw(<genome>)").
    """
    description = "# Total number of hits (prior to PCR duplicate filtering)"

    for (bam_key, filenames) in self._in_raw_bams.iteritems():
        genome, target, sample, library = bam_key

        scratch = {}
        for filename in filenames:
            read_coverage_table(scratch, filename)

        per_contig = get_in(scratch, (target, sample, library))
        total_hits = sum(entry["Hits"] for entry in per_contig.itervalues())

        set_in(table,
               (target, sample, library, genome, "hits_raw(%s)" % genome),
               (total_hits, description))
def write_records(self, records):
    """Distribute `records` between the paired-end and single-end sinks.

    Records are grouped by query name and mate number (1 for read1, 2 for
    read2, otherwise 0). When both mates of a name are present they are
    written to self._sink_pe_1 and self._sink_pe_2; everything left over
    is written to self._sink_se.
    """
    by_name = {}
    for record in records:
        if record.is_read1:
            mate = 1
        elif record.is_read2:
            mate = 2
        else:
            mate = 0

        set_in(by_name, (record.qname, mate), record)

    for mates in by_name.itervalues():
        # Only write complete pairs
        if (1 in mates) and (2 in mates):
            self._sink_pe_1.write_records([mates.pop(1)])
            self._sink_pe_2.write_records([mates.pop(2)])

        # Any orphan files are written to the SE sink
        for orphan in mates.itervalues():
            self._sink_se.write_records([orphan])
def _read_tables(self, prefixes, genomes):
    """Read all statistics and add merged "*" rows per sample and target.

    The table is filled by the three _read_* helpers, after which the
    subtables of each library are merged (via self._merge_tables) into
    per-sample and per-target totals. Every set of subtables is passed
    through self._annotate_subtables before being stored. `prefixes` is
    not used by this method.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-mangled source; confirm against the original layout.
    """
    table = {}
    self._read_reads_settings(table)
    self._read_raw_bam_stats(table)
    self._read_lib_bam_stats(table)

    # NOTE(review): the dictionaries are modified while being looped over;
    # this relies on .items() returning a snapshot (presumably Python 2).
    for (target, samples) in table.items():
        merged_samples = {}
        for (sample, libraries) in samples.items():
            merged_libraries = {}
            for (library, subtables) in libraries.items():
                for (tblname, subtable) in subtables.items():
                    # Accumulate into both the per-sample and the
                    # per-target totals
                    merged_libraries[tblname] = self._merge_tables(
                        (merged_libraries.get(tblname, {}), subtable))
                    merged_samples[tblname] = self._merge_tables(
                        (merged_samples.get(tblname, {}), subtable))
                libraries[library] = self._annotate_subtables(subtables, genomes)
            # Totals across all libraries of this sample
            set_in(table, (target, sample, "*"),
                   self._annotate_subtables(merged_libraries, genomes))
        # Totals across all samples of this target
        set_in(table, (target, "*", "*"),
               self._annotate_subtables(merged_samples, genomes))
    return table
def _read_lib_bam_stats(self, table):
    """Store per-library hit and nucleotide totals.

    For every (genome, target, sample, library) key in
    self._in_lib_bams, the listed coverage files are loaded with
    read_coverage_table, and the "Hits" and "M" values of all contigs are
    added up. The totals are written to `table` under
    (target, sample, library, genome, ...) as "hits_unique(<genome>)"
    (with a descriptive comment) and "hits_unique_nts(<genome>)" (with
    None in place of a comment).
    """
    description = "# Total number of hits (excluding any PCR duplicates)"

    for (bam_key, filenames) in self._in_lib_bams.iteritems():
        genome, target, sample, library = bam_key

        scratch = {}
        for filename in filenames:
            read_coverage_table(scratch, filename)

        total_hits = 0
        total_nts = 0
        per_contig = get_in(scratch, (target, sample, library))
        for entry in per_contig.itervalues():
            total_hits += entry["Hits"]
            total_nts += entry["M"]

        set_in(table,
               (target, sample, library, genome, "hits_unique(%s)" % genome),
               (total_hits, description))
        set_in(table,
               (target, sample, library, genome,
                "hits_unique_nts(%s)" % genome),
               (total_nts, None))
def _read_table(self, key, table, filename):
    """Add the depth histogram stored in `filename` to `table`.

    Each tab-separated line updates a list of _MAX_DEPTH + 1 counters
    stored under `key` + (name,): the counter selected by the
    fourth-from-last column (capped at _MAX_DEPTH) is incremented by the
    third-from-last column. The catch-all row is stored under the name
    None; other rows are skipped while self._max_contigs_reached is set.
    """
    catchall = "all" if self._intervals else "genome"

    with open(filename) as handle:
        for line in handle:
            columns = line.split("\t")
            n_columns = len(columns)

            # 'all' is generated by coverageBed, as a catchall group
            if (columns[0] == catchall) and (n_columns == 5):
                name = None
            elif self._max_contigs_reached:
                continue
            elif n_columns > (3 + 4):
                # Probably a BED6 file, 4 columns are from bedTools
                name = columns[3]
            else:
                name = columns[0]

            histogram_key = key + (name,)
            if not get_in(table, histogram_key, None):
                set_in(table, histogram_key, [0] * (_MAX_DEPTH + 1))

            depth = min(_MAX_DEPTH, int(columns[-4]))
            histogram = get_in(table, histogram_key)
            histogram[depth] += int(columns[-3])
def _read_table(self, key, table, filename):
    """Accumulate per-depth counts from a tab-separated coverage file.

    For every line a name is chosen: None for the catch-all row, the
    fourth column when the line has more than 7 columns, and the first
    column otherwise. Rows other than the catch-all are ignored while
    self._max_contigs_reached is set. The counters live in `table` under
    `key` + (name,), as a list with _MAX_DEPTH + 1 slots; depths above
    _MAX_DEPTH are counted in the last slot.
    """
    all_key = "genome"
    if self._intervals:
        all_key = "all"

    with open(filename) as handle:
        for line in handle:
            fields = line.split("\t")

            # 'all' is generated by coverageBed, as a catchall group
            is_catchall = (fields[0] == all_key) and (len(fields) == 5)
            if is_catchall:
                name = None
            elif self._max_contigs_reached:
                continue
            elif len(fields) > (3 + 4):
                # Probably a BED6 file, 4 columns are from bedTools
                name = fields[3]
            else:
                name = fields[0]

            ckey = key + (name,)
            if not get_in(table, ckey, None):
                empty_counts = [0] * (_MAX_DEPTH + 1)
                set_in(table, ckey, empty_counts)

            depth = min(_MAX_DEPTH, int(fields[-4]))
            counts = get_in(table, ckey)
            counts[depth] += int(fields[-3])
def convert_reads(config, destination, record, sink_cache):
    """Write the filtered reads of `record`'s BAM files to read sinks.

    Output goes to a sub-folder of `destination` named after the
    record's "PU_src" tag. Sinks are opened on demand and cached in
    `sink_cache` under (name, reads_type). Reads are kept only if they
    pass the config.min_quality and config.min_length checks.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        sink_key = (name, reads_type)
        if not get_in(sink_cache, sink_key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, sink_key,
                   ReadSink.open(config.destination, filename))
        return sink_key

    def _keep_record(read):
        return (read.qual >= config.min_quality) and \
            (len(read.seq) >= config.min_length)

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Single")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key,
                       PEReadSink.open(config.destination, destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                sink.write_records(read for read in handle
                                   if _keep_record(read))
def calculate_totals(table):
    """Validate contig sizes, then fill in totals at every level.

    A first pass records the "Size" of every contig and raises
    BAMStatsError (with the contig name) if one name appears with
    different sizes. A second pass stores the result of
    _calculate_totals_in for each library, for each sample (under "*")
    and for each name (under "*", "*"). Returns `table`, modified in
    place.
    """
    lengths = {}
    for samples in table.itervalues():
        for libraries in samples.values():
            for contigs in libraries.values():
                for (contig_name, contig) in contigs.iteritems():
                    known_size = lengths.get(contig_name)
                    if (known_size is not None) \
                            and (known_size != contig["Size"]):
                        raise BAMStatsError(contig_name)

                    lengths[contig_name] = contig["Size"]

    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                library_totals = _calculate_totals_in(contigs, lengths)
                set_in(table, (name, sample, library), library_totals)

            sample_totals = _calculate_totals_in(libraries, lengths)
            set_in(table, (name, sample, "*"), sample_totals)

        set_in(table, (name, "*", "*"), _calculate_totals_in(table, lengths))

    return table
def test_set_in__fail_on_no_kws():
    """Call utils.set_in with an empty keyword list.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    no_keywords = []
    utils.set_in({}, no_keywords, 17)
def test_set_in__update_value_two_kw():
    """utils.set_in must overwrite an existing value two levels deep."""
    nested = {1: {2: 3}}
    utils.set_in(nested, [1, 2], 365)

    expected = {1: {2: 365}}
    assert_equal(nested, expected)
def test_set_in__update_value_one_kw():
    """utils.set_in must overwrite an existing top-level value."""
    mapping = {1: None}
    utils.set_in(mapping, [1], 3.14)

    expected = {1: 3.14}
    assert_equal(mapping, expected)
def test_set_in__fail_on_invalid_sub_dictionary_first_level():
    """Call utils.set_in with None in place of the top-level dictionary.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    not_a_dictionary = None
    utils.set_in(not_a_dictionary, [1], 17)
def test_set_in__fail_on_invalid_sub_dictionary_third_level():
    """Call utils.set_in where the second-level value is None.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    broken = {1: {2: None}}
    utils.set_in(broken, [1, 2, 3], 17)
def test_set_in__fail_on_invalid_sub_dictionary_third_level():
    """Three keywords, but the value reached after two of them is None.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    keywords = [1, 2, 3]
    utils.set_in({1: {2: None}}, keywords, 17)
def test_set_in__two_kws_in_empty_dictionary():
    """utils.set_in must create the intermediate dictionary for two keys."""
    target = {}
    utils.set_in(target, ["Foo", 13], 17)

    expected = {"Foo": {13: 17}}
    assert_equal(target, expected)
def test_set_in__fail_on_invalid_sub_dictionary_second_level():
    """Call utils.set_in where the first-level value is None.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    broken = {1: None}
    utils.set_in(broken, [1, 2], 17)
def test_set_in__three_kws_in_empty_dictionary():
    """utils.set_in must create both intermediate levels for three keys."""
    target = {}
    utils.set_in(target, ["Foo", 13, (1, 2)], 17)

    expected = {"Foo": {13: {(1, 2): 17}}}
    assert_equal(target, expected)
def test_set_in__update_value_one_kw():
    """A single keyword replaces the value already stored under it."""
    expected = {1: 3.14}

    container = {1: None}
    utils.set_in(container, [1], 3.14)
    assert_equal(container, expected)
def test_set_in__update_value_two_kw():
    """Two keywords replace the value already stored in the sub-dict."""
    expected = {1: {2: 365}}

    container = {1: {2: 3}}
    utils.set_in(container, [1, 2], 365)
    assert_equal(container, expected)
def test_set_in__fail_on_invalid_sub_dictionary_second_level():
    """Two keywords, but the value reached after the first one is None.

    NOTE(review): presumably expected to raise; the expectation itself
    (e.g. a decorator) is not visible in this block.
    """
    keywords = [1, 2]
    utils.set_in({1: None}, keywords, 17)
def test_set_in__single_kw_in_empty_dictionary():
    """utils.set_in with one keyword must behave like a plain assignment."""
    target = {}
    utils.set_in(target, ["Foo"], 17)

    expected = {"Foo": 17}
    assert_equal(target, expected)
def test_set_in__iteratable_keywords():
    """utils.set_in must accept a one-shot iterator of keywords."""
    target = {}
    keywords = iter(["Foo", 13, (1, 2)])
    utils.set_in(target, keywords, 17)

    expected = {"Foo": {13: {(1, 2): 17}}}
    assert_equal(target, expected)
def test_set_in__two_kws_in_empty_dictionary():
    """Two keywords on an empty dict yield one nested dictionary."""
    expected = {"Foo": {13: 17}}

    container = {}
    utils.set_in(container, ["Foo", 13], 17)
    assert_equal(container, expected)
def test_set_in__iteratable_keywords():
    """Keywords supplied as an iterator rather than a list are supported."""
    expected = {"Foo": {13: {(1, 2): 17}}}

    container = {}
    utils.set_in(container, iter(["Foo", 13, (1, 2)]), 17)
    assert_equal(container, expected)
def test_set_in__three_kws_in_empty_dictionary():
    """Three keywords on an empty dict yield two nested dictionaries."""
    expected = {"Foo": {13: {(1, 2): 17}}}

    container = {}
    utils.set_in(container, ["Foo", 13, (1, 2)], 17)
    assert_equal(container, expected)
def test_set_in__single_kw_in_empty_dictionary():
    """A single keyword on an empty dict stores the value directly."""
    expected = {"Foo": 17}

    container = {}
    utils.set_in(container, ["Foo"], 17)
    assert_equal(container, expected)