def construct_hg38_map(n2nl_aln, hg38_bam,
                       negative_strand_names=('NOTCH2', 'NOTCH2NL-A', 'NOTCH2NL-B')):
    """Construct a map of hg38 position -> MSA (alignment column) position.

    The map is built in two hops: hg38 position -> position in the ungapped
    sequence (from the BAM alignments), then ungapped sequence position ->
    MSA column (from the gapped alignment FASTA).

    :param n2nl_aln: path to the multiple sequence alignment FASTA; gaps are
        the '-' character.
    :param hg38_bam: path to a BAM of the same sequences aligned to hg38.
        Record query names are looked up in the MSA by name.
    :param negative_strand_names: query names whose sequence positions are
        reversed (last residue index minus query position) before use.
        Defaults to the names that were previously hard-coded.
    :return: dict of hg38 reference position -> MSA column. Keys are bare
        reference coordinates with no contig name attached.
    :raises AssertionError: if two records map onto the same hg38 position.
    :raises KeyError: if a record's name or sequence position is not present
        in the MSA.
    """
    # Ungapped sequence position -> MSA column. Only residue columns get an
    # entry; recording gap columns too would leave a spurious one-past-the-end
    # key for sequences that end in gaps and shift the reversal below by one.
    aln_f = Fasta(n2nl_aln)
    seq_aln_map = {}
    max_pos = {}
    # NOTE: iteritems() is the iteration API the original used on the Fasta
    # object; it is kept because the Fasta class is defined outside this block.
    for name, seq in aln_f.iteritems():
        residue_columns = [aln_pos for aln_pos, x in enumerate(str(seq)) if x != '-']
        seq_aln_map[name] = dict(enumerate(residue_columns))
        # index of the last residue, used to reverse negative strand positions
        max_pos[name] = len(residue_columns) - 1

    # hg38 position -> sequence position, per record name. A later record
    # with the same name replaces an earlier one.
    hg38_map = {}
    bam = pysam.Samfile(hg38_bam)
    try:
        for rec in bam:
            # aligned_pairs yields (query_pos, ref_pos); either side is None
            # for an unaligned base (indel / clip), which cannot be mapped.
            pairs = [(ref_pos, query_pos)
                     for query_pos, ref_pos in rec.aligned_pairs
                     if query_pos is not None and ref_pos is not None]
            if rec.qname in negative_strand_names:
                # invert positions for negative strand genes
                last_index = max_pos[rec.qname]
                pairs = [(ref_pos, last_index - query_pos)
                         for ref_pos, query_pos in pairs]
            hg38_map[rec.qname] = dict(pairs)
    finally:
        bam.close()

    # Compose the two maps into hg38 position -> MSA column.
    final_map = {}
    for name, pos_map in hg38_map.items():
        column_of = seq_aln_map[name]
        for hg38_pos, seq_pos in pos_map.items():
            aln_pos = column_of[seq_pos]
            # Raised explicitly (rather than via `assert`) so the check
            # survives `python -O`; the exception type is unchanged.
            if hg38_pos in final_map:
                raise AssertionError(
                    'hg38 position {} is mapped by more than one sequence'.format(hg38_pos))
            final_map[hg38_pos] = aln_pos
    return final_map
from collections import *
from tools.intervals import *
from tools.misc import *
from tools.procOps import *
from tools.fileOps import *
from tools.bio import *
from tools.psl import *
from itertools import *
import bisect


# In[5]:

# Step 1: for every sequence in the alignment, map each position in the
# ungapped sequence to a column of the alignment.
aln_f = Fasta('notch2nl_alignment.fa')
seq_aln_map = defaultdict(dict)
for name, seq in aln_f.iteritems():
    seq_pos = 0
    for aln_pos, x in enumerate(str(seq)):
        # The column is recorded before checking for a gap, so gap columns
        # are overwritten by the next residue column at the same seq_pos.
        # NOTE(review): gap columns after the last residue are never
        # overwritten, so such sequences keep one extra key equal to their
        # ungapped length.
        seq_aln_map[name][seq_pos] = aln_pos
        if x == '-':
            continue
        seq_pos += 1


# In[182]:

# Step 2: largest recorded sequence position per sequence, used to reverse
# coordinates for negative strand sequences.
max_pos = dict((seq_name, max(col_map)) for seq_name, col_map in seq_aln_map.iteritems())


# In[193]:

# Step 3: container for the hg38 position -> sequence position maps built
# from the alignment.
hg38_map = dict()