class AslrOracle:
    """Range-probing oracle that remembers the outcome of earlier probes.

    Two interval trees are kept: ``good_regions`` for ranges that were
    reported valid and ``bad_regions`` for ranges that were reported
    invalid.  ``queries`` and ``cached_queries`` are plain counters.

    NOTE(review): ``CheckRange`` is called by ``CheckAddress`` but is not
    defined in this class as shown; presumably a subclass supplies it.
    """

    def __init__(self):
        self.queries = 0
        self.InitCache()

    def CheckAddress(self, address):
        """Return ``self.CheckRange(address, 0x1000)``."""
        return self.CheckRange(address, 0x1000)

    def InitCache(self):
        """Reset the cache-hit counter and start with two empty trees."""
        self.good_regions = IntervalTree()
        self.bad_regions = IntervalTree()
        self.cached_queries = 0

    def InsertToCache(self, start, end, valid):
        """Record the outcome of probing ``start``..``end``.

        Valid ranges are stored as ``Interval(start, end + 1)`` and merged
        with any overlapping valid ranges; invalid ranges are stored as
        ``Interval(start, end)`` without merging.
        """
        if not valid:
            self.bad_regions.add(Interval(start, end))
            return
        self.good_regions.add(Interval(start, end + 1))
        self.good_regions.merge_overlaps()

    def CheckCache(self, start, end):
        """Look up a range in the cache.

        Returns ``True`` when a single known-good interval covers the whole
        range, ``False`` when a known-bad interval lies entirely inside it,
        and ``None`` when the cache cannot answer.  A hit of either kind
        increments ``cached_queries``.
        """
        covered = any(
            region[0] <= start and region[1] >= end
            for region in self.good_regions.overlap(start, end)
        )
        if covered:
            self.cached_queries += 1
            return True

        if self.bad_regions.envelop(start, end):
            self.cached_queries += 1
            return False

        return None
class SimpleDnMedium(DnMedium):
    """Downlink medium that keeps pending messages in an interval tree.

    Each message is stored under the tick interval that starts at the
    message's ``xbeg`` and lasts for its preamble time.
    """

    def __init__(self) -> None:
        self.msgs = IntervalTree()

    def add_dn(self, msg: LoraMsg) -> None:
        """Store ``msg`` under its preamble interval (in ticks)."""
        begin = Simulation.time2ticks(msg.xbeg)
        duration = Simulation.time2ticks(msg.tpreamble())
        self.msgs[begin:begin + duration] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        """Return the length shared by two intervals (negative if disjoint)."""
        earliest_end = min(i1.end, i2.end)
        latest_begin = max(i1.begin, i2.begin)
        return earliest_end - latest_begin  # type: ignore

    def get_dn(self, rxon: int, rxtout: int, freq: int, rps: int, nsym: int = 4) -> Optional[LoraMsg]:
        """Take one message receivable in the window ``rxon``..``rxon + rxtout``.

        A stored message qualifies when ``match(freq, rps)`` is true and its
        interval shares at least ``nsym`` symbol times (converted to ticks)
        with the receive window.  The first qualifying message is removed
        from the tree and returned; ``None`` is returned if there is none.
        """
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for entry in self.msgs.overlap(rxw[0], rxw[1]):
            msg: LoraMsg = entry.data
            if msg.match(freq, rps) and SimpleDnMedium.overlap(entry, rxw) >= tpn:
                self.msgs.remove(entry)
                return msg
        return None

    def prune(self, ticks: int) -> None:
        """Drop every stored interval enveloped by ``0``..``ticks``.

        NOTE(review): despite the ``None`` annotation, the result of the
        ``envelop`` query (the intervals that were dropped) is returned.
        """
        expired = self.msgs.envelop(0, ticks)
        if expired:
            self.msgs.remove_envelop(0, ticks)
        return expired
def find_candidate(Interval_list, window=10, min_primary=0, min_support=0,
                   secondary_thres=0.0, primary_thres=1.0):
    '''
    Group boundary pairs around the best supported ones.

    The intervals are copied into a private IntervalTree. Repeatedly, the
    interval with the largest ``data`` (its support) is taken out as the
    primary boundary and the remaining intervals lying within ``window`` of
    it on both sides are examined as neighbours.

    Parameter:
        Interval_list:   iterable of intervals with ``begin``, ``end`` and
                         ``data`` (support count) attributes
        window:          a neighbour must be enveloped by
                         [begin - window, end + window] and must itself
                         reach begin + window / end - window of the primary
        min_primary:     the search stops as soon as the best remaining
                         support is below this value
        min_support:     minimum summed support (neighbours + primary) for
                         a group to be reported
        secondary_thres: a neighbour is accepted (and removed from the
                         tree) only if its support is greater than
                         secondary_thres * support of the primary
        primary_thres:   a group is reported only if
                         primary support / summed support <= primary_thres

    Return:
        list of (primary, member) tuples; every reported group contains
        one tuple per accepted neighbour followed by (primary, primary).
        A primary without any accepted neighbour is not reported.
    '''
    tree = IntervalTree()
    for item in Interval_list:
        tree.addi(item.begin, item.end, item.data)

    candidate_boundaries = []
    while tree:
        primary = max(tree, key=lambda node: node.data)
        best_support = primary.data
        if best_support < min_primary:
            # everything left is below the lower bound of the support
            break
        tree.remove(primary)

        # collect surrounding boundaries
        neighbour_found = []
        for other in tree.envelop(primary.begin - window, primary.end + window):
            within_window = (other.begin <= primary.begin + window
                             and other.end >= primary.end - window)
            if not within_window:
                continue
            if other.data > secondary_thres * best_support:
                neighbour_found.append((primary, other))
                tree.remove(other)

        if not neighbour_found:
            continue

        neighbour_found.append((primary, primary))
        count = sum(member.data for _, member in neighbour_found)
        if count >= min_support and best_support / count <= primary_thres:
            candidate_boundaries.extend(neighbour_found)

    return candidate_boundaries
def original_print():
    """Build a three-interval tree and print two kinds of lookups for it."""
    it = IntervalTree()
    for begin, end, label in ((1, 3, "dude"), (2, 4, "sweet"), (6, 9, "rad")):
        it.addi(begin, end, label)

    # Lookup by (begin, end) tuple; per the original note this yields set()
    # and a begin:end slice should be used instead.
    for iobj in it:
        print(it[iobj.begin, iobj.end])

    # Lookup of everything enveloped by each interval's own span.
    for iobj in it:
        print(it.envelop(iobj.begin, iobj.end))
class SimpleMedium(Medium):
    """Medium that keeps pending downlink messages in an interval tree.

    Each message is stored under the tick interval that starts at the
    message's ``xbeg`` and lasts for its preamble time.  The ``put_up``
    callable is only stored here; nothing in this class calls it.
    """

    def __init__(self, put_up: Optional[Callable[[LoraMsg], None]]) -> None:
        self.msgs = IntervalTree()
        self._put_up = put_up

    def reset_medium(self) -> None:
        """Forget every stored message."""
        self.msgs.clear()

    def add_dn(self, msg: LoraMsg) -> None:
        """Store ``msg`` under its preamble interval (in ticks)."""
        begin = Simulation.time2ticks(msg.xbeg)
        duration = Simulation.time2ticks(msg.tpreamble())
        self.msgs[begin:begin + duration] = msg

    @staticmethod
    def overlap(i1: Interval, i2: Interval) -> int:
        """Return the length shared by two intervals (negative if disjoint)."""
        earliest_end = min(i1.end, i2.end)
        latest_begin = max(i1.begin, i2.begin)
        return earliest_end - latest_begin  # type: ignore

    def get_dn(self, rxon: int, rxtout: int, freq: int, rps: int, nsym: int = 4, peek=False) -> Optional[LoraMsg]:
        """Find a message receivable in the window ``rxon``..``rxon + rxtout``.

        A stored message qualifies when ``match(freq, rps)`` is true and,
        unless ``peek`` is set, its interval shares at least ``nsym`` symbol
        times (converted to ticks) with the receive window.  With ``peek``
        the message is left in the tree; otherwise it is removed.  Returns
        ``None`` when nothing qualifies.
        """
        rxw = Interval(rxon, rxon + rxtout)
        tpn = Simulation.time2ticks(LoraMsg.symtime(rps, nsym))
        for entry in self.msgs.overlap(rxw[0], rxw[1]):
            msg: LoraMsg = entry.data
            if not msg.match(freq, rps):
                continue
            if peek or SimpleMedium.overlap(entry, rxw) >= tpn:
                if not peek:
                    self.msgs.remove(entry)
                return msg
        return None

    def prune(self, ticks: int) -> List[LoraMsg]:
        """Drop intervals enveloped by ``0``..``ticks`` and return their data."""
        expired = cast(List[Interval], self.msgs.envelop(0, ticks))
        if expired:
            self.msgs.remove_envelop(0, ticks)
        return [entry[2] for entry in expired]
def test_empty_queries():
    """Every query against a freshly created tree yields an empty result."""
    tree = IntervalTree()
    nothing = set()

    # size
    assert len(tree) == 0
    assert tree.is_empty()

    # point and slice lookups
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # bounds of the empty tree, and lookups over those bounds
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing
    assert tree.overlap(tree.begin(), tree.end()) == nothing
    assert tree.envelop(tree.begin(), tree.end()) == nothing

    # contents
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # overall range
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
def test_empty_queries():
    """A new IntervalTree is empty and answers all queries with nothing."""
    empty_tree = IntervalTree()
    no_hits = set()

    # the tree reports itself as empty
    assert len(empty_tree) == 0
    assert empty_tree.is_empty()

    # lookups by point and by slice
    assert empty_tree[3] == no_hits
    assert empty_tree[4:6] == no_hits

    # begin/end default to 0 and queries over them find nothing
    assert empty_tree.begin() == 0
    assert empty_tree.end() == 0
    assert empty_tree[empty_tree.begin():empty_tree.end()] == no_hits
    assert empty_tree.overlap(empty_tree.begin(), empty_tree.end()) == no_hits
    assert empty_tree.envelop(empty_tree.begin(), empty_tree.end()) == no_hits

    # iteration, copying and nesting
    assert empty_tree.items() == no_hits
    assert set(empty_tree) == no_hits
    assert set(empty_tree.copy()) == no_hits
    assert empty_tree.find_nested() == {}

    # the spanning range is a null interval
    assert empty_tree.range().is_null()
    assert empty_tree.range().length() == 0

    empty_tree.verify()
class AmpliconSet:
    """A named set of amplicons with lookup by primer sequence and interval.

    Attributes set by ``__init__``:
        shortname          caller-supplied short name, or a single character
                           derived from ``name``
        tree               IntervalTree mapping each amplicon's span, widened
                           by ``tolerance`` on both sides, to the amplicon
        seqs               primer-sequence prefix -> amplicon
        amplicon_ids       amplicon.shortname -> amplicon
        min_primer_length  length of the shortest primer in the set
    """

    def __init__(
        self,
        name,
        amplicons,
        tolerance=5,
        shortname=None,
    ):
        """AmpliconSet supports various membership operations

        ``amplicons`` maps amplicon names to amplicon objects exposing
        ``shortname``, ``left``, ``right``, ``start`` and ``end``.
        """
        if not shortname:
            # base-54 hash
            shortname = chr(((sum(map(ord, name)) - ord("A")) % 54) + 65)
        # Always assign: previously a caller-supplied shortname was dropped
        # and the attribute was left undefined.
        self.shortname = shortname

        self.tree = IntervalTree()
        self.name = name
        self.seqs = {}
        self.amplicons = amplicons
        self.amplicon_ids = {}

        primer_lengths = set()
        sequences = {}
        for amplicon in amplicons.values():
            self.amplicon_ids[amplicon.shortname] = amplicon

            for primer in amplicon.left:
                sequences[primer.seq] = amplicon
                primer_lengths.add(len(primer.seq))

            for primer in amplicon.right:
                # note: you may want to reverse complement this
                sequences[primer.seq] = amplicon
                primer_lengths.add(len(primer.seq))

            # interval containment tolerance
            start = amplicon.start - tolerance
            end = amplicon.end + tolerance
            self.tree[start:end] = amplicon

        self.min_primer_length = min(primer_lengths)

        # the internal sequences table allows lookup by primer sequence;
        # keys are truncated to the shortest primer length
        for seq, amplicon in sequences.items():
            self.seqs[seq[:self.min_primer_length]] = amplicon

    def __eq__(self, other):
        return type(other) is type(self) and self.__dict__ == other.__dict__

    @classmethod
    def from_json(cls, fn, tolerance=5):
        raise NotImplementedError

    @classmethod
    def from_tsv(cls, fn, name=None, **kwargs):
        """Build an AmpliconSet from a tab-separated primer table.

        The file must have the columns ``Amplicon_name``, ``Primer_name``,
        ``Left_or_right``, ``Sequence`` and ``Position``.  One ``Amplicon``
        is created per distinct ``Amplicon_name`` (numbered in order of first
        appearance) and one ``Primer`` per row.  ``name`` defaults to ``fn``;
        extra keyword arguments are passed to the constructor.

        Raises ``Exception`` if a required column is missing, including when
        the file is empty and therefore has no header at all.
        """
        amplicons = {}
        required_cols = {
            "Amplicon_name",
            "Primer_name",
            "Left_or_right",
            "Sequence",
            "Position",
        }
        n = 0
        # newline="" is what the csv module asks for when handed a file
        with open(fn, newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            # fieldnames is None for an empty file; treat it as "no columns"
            # so the caller gets the missing-columns error, not a TypeError
            missing_cols = required_cols.difference(reader.fieldnames or ())
            if missing_cols:
                missing_cols = ",".join(sorted(missing_cols))
                raise Exception(
                    f"Amplicon scheme TSV missing these columns: {missing_cols}. Got these columns: {reader.fieldnames}"
                )

            for d in reader:
                amplicon_name = d["Amplicon_name"]
                if amplicon_name not in amplicons:
                    amplicons[amplicon_name] = Amplicon(amplicon_name, shortname=n)
                    n += 1

                left = d["Left_or_right"].lower() == "left"
                # We assume that primer is always left+forward, or right+reverse
                forward = left
                pos = int(d["Position"])
                primer = Primer(d["Primer_name"], d["Sequence"], left, forward, pos)
                amplicons[amplicon_name].add(primer)

        name = fn if not name else name
        return cls(name, amplicons, **kwargs)

    def match(self, start, end):
        """Identify a template's mapped interval based on the start and end
        positions

        Returns a list with the one or two amplicons whose (tolerance-widened)
        interval contains both ``start`` and ``end``.  Returns ``None`` when
        no amplicon contains both positions, or when some amplicon interval
        lies entirely inside ``start``..``end``.  Raises ``Exception`` when
        more than two amplicons match.
        """
        # amplicons which contain the start and end
        hits = self.tree[start].intersection(self.tree[end])

        # amplicons contained by the start and end
        # this should never happen in tiled amplicons
        if self.tree.envelop(start, end):
            return None

        if not hits:
            return None
        if len(hits) > 2:
            # there should not be any more than 2 ambiguous matches under any
            # known primer set. The interval tree can confirm this at the time
            # the primer set is parsed
            raise Exception(
                f"More than two amplicons match the interval {start}-{end}"
            )
        return [hit.data for hit in hits]

    def get_tags(self, read):
        pass

    def set_tags(self, read):
        pass
class Protein:
    """
    We will represent a protein as its domains 3 ways:
    1) protein can have overlapping domains
    2) protein has only no-overlapping domains
    3) protein has known length so gap domain can be added
    """

    def __init__(self,
                 with_overlap,
                 with_redundant,
                 with_gap,
                 hit_line="",
                 proteins_id_len="",
                 interpro_local_format=False):
        """
        Protein Class init

        Parameters
        ----------
        with_overlap : bool
            output overlapping domain annotation (True), otherwise not
            overlapping domain annotation will be created (False)
        with_redundant : bool
            selects between the two non overlapping outputs, see to_tabs
        with_gap : bool
            add GAP domain for each sufficiently long protein subsequence
            without domain hit (True), otherwise don't add GAP domain (False)
        hit_line : str
            domain hits line; when empty, an empty protein with uniprot_id ""
            is created
        proteins_id_len : file
            proteins id length file handle (only read for the protein2ipr
            format when with_gap is True)
        interpro_local_format : bool
            preprocess output format produced by local interproscan run
            (True), otherwise preprocess Interpro downloaded protein2ipr
            format (False)

        Returns
        -------
        None
        """
        self.with_overlap = with_overlap
        self.with_redundant = with_redundant
        self.with_gap = with_gap
        self.domain_interval_tree = IntervalTree()
        self.domains_with_gaps = []
        self.gap_min_size = 30
        self.length = 0
        self.interpro_exist_all_domains = []
        # Always present, so add_domain/to_tabs also work on a protein that
        # was created without a hit line.
        self.domains = {}

        if hit_line == "":
            self.uniprot_id = ""
            return

        if interpro_local_format:
            # interpro local run format
            # get the interpro annotation of protein line based on:
            # https://github.com/ebi-pf-team/interproscan/wiki/OutputFormats
            assert len(
                hit_line.split("\t")
            ) >= 11, "AssertionError: line: {} has less than 11 tabs.".format(
                hit_line)
            self.uniprot_id = self.get_uniprot_id(hit_line)
            self.add_domain(hit_line)
            if with_gap:
                # third column of the local interproscan line
                self.length = int(hit_line.split("\t")[2])
                assert self.length > 0, "AssertionError: protein with id {} has length <=0.".format(
                    self.uniprot_id)
        else:
            # prot2ipr format
            assert isinstance(
                hit_line, str
            ), "AssertionError: Input of protein should be a String line."
            hit_line = hit_line.strip()
            self.uniprot_id = self.get_uniprot_id(hit_line)
            self.add_domain(hit_line)
            if with_gap:
                self.length = self.get_prot_length(proteins_id_len)
                assert self.length > 0, "AssertionError: protein with id {} has length <= 0.".format(
                    self.uniprot_id)

    def get_prot_length(self, proteins_id_len):
        """
        Get protein length

        Lines are consumed from the handle until one whose first
        tab-separated field equals this protein's id is found; lines read
        on the way are not revisited.

        Parameters
        ----------
        proteins_id_len : file
            protein id length file handle (iterable of "id<TAB>length" lines)

        Returns
        -------
        prot_len : int
            protein length, or -1 if the handle is exhausted first
            (in which case "EOF" is printed)
        """
        for prot_id_len in proteins_id_len:
            fields = prot_id_len.strip().split("\t")
            if fields[0] == self.uniprot_id:
                return int(fields[1])
        print("EOF")
        return -1

    @staticmethod
    def get_prot_id(hit_line):
        """
        Get protein id

        Parameters
        ----------
        hit_line : str
            domain hit line

        Returns
        -------
        str
            protein id (first tab-separated field)
        """
        return hit_line.split("\t")[0]

    def get_uniprot_id(self, hit_line):
        """
        Get uniprot id

        Parameters
        ----------
        hit_line : str
            domain hit line

        Returns
        -------
        str
            protein id (first tab-separated field)
        """
        return self.get_prot_id(hit_line)

    def add_domain(self, hit_line):
        """
        Add domain in Protein object

        Parameters
        ----------
        hit_line : str
            domain hit line

        Returns
        -------
        None
        """
        if self.with_overlap:
            self.add_overlap(hit_line)
        elif self.with_overlap is False or self.with_redundant:
            self.add_no_overlap(hit_line)

    def add_overlap(self, hit_line):
        """
        Add domain hit in overlapping fashion

        The domain is stored in self.domains under a float key built by
        concatenating the start and end positions. Hits with end_pos <=
        start_pos are only recorded in interpro_exist_all_domains.

        Parameters
        ----------
        hit_line : str
            domain hit line

        Returns
        -------
        None
        """
        domain = Domain(hit_line)
        self.interpro_exist_all_domains.append(domain.interpro_id_exists)
        if domain.end_pos > domain.start_pos:
            # construct start_stop index
            # NOTE(review): the concatenation is ambiguous (1,23 and 12,3
            # give the same key) and sorting by it does not sort by start
            # position; kept because self.domains keys may be used elsewhere.
            start_stop = float(str(domain.start_pos) + str(domain.end_pos))
            # allow several domain annotations to have the same start and
            # end: step by 0.01 until a free key is found (a single step
            # used to overwrite the previous duplicate)
            while start_stop in self.domains:
                start_stop += 0.01
            self.domains[start_stop] = domain

    def add_no_overlap(self, hit_line):
        """
        Add domain in no overlapping fashion

        The domain is stored in the interval tree under its start/end
        positions. Hits with end_pos <= start_pos are only recorded in
        interpro_exist_all_domains.

        Parameters
        ----------
        hit_line : str
            domain hit

        Returns
        -------
        None
        """
        domain = Domain(hit_line)
        self.interpro_exist_all_domains.append(domain.interpro_id_exists)
        if domain.end_pos > domain.start_pos:
            self.domain_interval_tree.addi(domain.start_pos, domain.end_pos,
                                           domain)

    def to_tabs(self):
        """
        Convert saved domain hits for a protein to output tabular line

        with_overlap selects the overlapping output; otherwise
        with_redundant being False selects to_tabs_no_overlap and anything
        else selects to_tabs_no_redundant.
        NOTE(review): confirm this mapping of with_redundant is the
        intended one.

        Parameters
        ----------

        Returns
        -------
        str
        """
        if self.with_overlap:
            return self.to_tabs_overlap()
        if self.with_redundant is False:
            return self.to_tabs_no_overlap()
        return self.to_tabs_no_redundant()

    def find_strong_no_overlap_domains(self, parent_domain, already_resolved):
        """
        Split the domains overlapping a parent into strong / no strong overlap

        1) Domains enveloped by the parent are always strong overlaps
        2) A remaining overlapping domain is a strong overlap when at least
           80% of its length overlaps the parent, or when it carries the same
           interpro id as the parent
           No strong overlap: |-----"--|-----"
           Strong overlap: |----"--"--|
        3) Everything else is a no strong overlap domain

        Parameters
        ----------
        parent_domain : IntervalTree node
            anchor domain to start overlapping search
        already_resolved : set of IntervalTree nodes
            domains that were resolved before and are skipped

        Returns
        -------
        strong_overlap_domains, no_strong_overlap_domains
            sets of strong overlapping domains (resolved), no strong
            overlapping (not (yet) resolved)
        """
        envelopped_domains = self.domain_interval_tree.envelop(
            parent_domain.begin, parent_domain.end)
        overlapping_domains = self.domain_interval_tree.overlap(
            parent_domain.begin, parent_domain.end)
        candidate_domains = overlapping_domains - envelopped_domains - already_resolved

        strong_overlap_domains = set()
        no_strong_overlap_domains = set()
        too_long_msg = "AssertionError: prot:{} candidate domain {} is longer than parent domain {}"
        for candidate_domain in candidate_domains:
            # As parent has the maximum length, there are two choices:
            # 1) candidate domain is strongly overlapping with the parent
            #    => add it to strong_overlap_domains (resolved)
            # 2) candidate domain is no strongly overlapping
            #    => add it to no_strong_overlap_domains (not_resolved)
            candidate_domain_len = candidate_domain.end - candidate_domain.begin + 1
            if candidate_domain.begin >= parent_domain.begin:
                # |---parent---|
                #        |---child---|
                overlap_len = parent_domain.end - candidate_domain.begin + 1
            else:
                #        |---parent---|
                # |---child---|
                overlap_len = candidate_domain.end - parent_domain.begin + 1

            is_strong = float(overlap_len) / candidate_domain_len >= 0.8
            same_interpro_id = (candidate_domain.data.interpro_id ==
                                parent_domain.data.interpro_id)
            if is_strong or same_interpro_id:
                # strong overlap, or no strong overlap but the same
                # interpro id: the (longer) parent wins
                assert candidate_domain.data.length <= parent_domain.data.length, too_long_msg.format(
                    self.uniprot_id, candidate_domain.data.evidence_db_id,
                    parent_domain.data.evidence_db_id)
                strong_overlap_domains.add(candidate_domain)
            else:
                no_strong_overlap_domains.add(candidate_domain)

        # add envelopped domains to strong_overlap domains
        strong_overlap_domains.update(envelopped_domains)
        return strong_overlap_domains, no_strong_overlap_domains

    def find_no_redundant_domains(self, parent_domain, already_resolved):
        """
        Find no redundant domains

        Parameters
        ----------
        parent_domain : IntervalTree node
            anchor domain to start overlapping search
        already_resolved : set of IntervalTree nodes
            set of already resolved for redundancy domains

        Returns
        -------
        redundant_domains, no_redundant_domains
            sets of overlapping domains with the same interpro id as the
            parent (resolved) and with a different one (not resolved)
        """
        overlapping_domains = self.domain_interval_tree.overlap(
            parent_domain.begin, parent_domain.end)
        candidate_domains = overlapping_domains - already_resolved

        redundant_domains = set()
        no_redundant_domains = set()
        for candidate_domain in candidate_domains:
            # As parent has the maximum length, there are two choices:
            # 1) same interpro id => redundant (resolved)
            # 2) different interpro id => no redundant (not_resolved)
            if candidate_domain.data.interpro_id == parent_domain.data.interpro_id:
                redundant_domains.add(candidate_domain)
            else:
                no_redundant_domains.add(candidate_domain)
        return redundant_domains, no_redundant_domains

    def find_no_redundant_max_len(self):
        """
        Find all domains that are not redundant (having unique interpro id
        among the domains they overlap) and are maximally long

        Parameters
        ----------

        Returns
        -------
        list of IntervalTree.node
            list of IntervalTree nodes as the no redundant maximum length
            domains
        """
        resolved = set()
        domains_no_redundant_max = []
        domains_len_srt = sorted(self.domain_interval_tree,
                                 key=lambda dom_node: dom_node.data.length,
                                 reverse=True)
        for domain_node in domains_len_srt:
            if domain_node not in resolved:
                redundant_domains, _ = self.find_no_redundant_domains(
                    domain_node, resolved)
                domains_no_redundant_max.append(domain_node)
                resolved.update(redundant_domains)
        return domains_no_redundant_max

    def find_no_overlap_max_len(self):
        """
        Find all domains that are not overlapping and are maximally long

        Parameters
        ----------

        Returns
        -------
        list of Domain
            list of not overlapping maximum length domains (the data of the
            selected IntervalTree nodes)
        """
        resolved = set()
        domains_no_overlap_max = []
        domains_len_srt = sorted(self.domain_interval_tree,
                                 key=lambda dom_node: dom_node.data.length,
                                 reverse=True)
        # Idea:
        # After sorting the domains by length in descending order, pick each
        # domain and check for
        #   envelopped domains        -> resolved
        #   strong overlap domains    -> resolved
        #   no strong overlap domains -> not resolved, the for loop will
        #                                either add it as max no overlap or
        #                                as resolved
        for domain_node in domains_len_srt:
            if domain_node not in resolved:
                strong_overlap_domains, _ = self.find_strong_no_overlap_domains(
                    domain_node, resolved)
                domains_no_overlap_max.append(domain_node.data)
                resolved.update(strong_overlap_domains)
        return domains_no_overlap_max

    def construct_gap_hitline(self, gap_start, gap_stop):
        """
        Construct GAP domain tabular line

        Parameters
        ----------
        gap_start : int
            GAP start position in protein amino sequence
        gap_stop : int
            GAP end position in protein amino sequence

        Returns
        -------
        str
            GAP domain tabular line
        """
        return "\t".join([
            self.uniprot_id, "GAP", "gap", "gap_no_evid",
            str(gap_start),
            str(gap_stop)
        ])

    def add_gaps_no_redundant(self, domains_srt):
        """
        Add GAP domains in no redundant domain annotations

        Fills self.domains_with_gaps with the data of every given node,
        inserting a GAP domain wherever the uncovered stretch in front of a
        node, or after the last covered position, is long enough.

        Parameters
        ----------
        domains_srt : list of IntervalTree nodes
            domains sorted per start/end position

        Returns
        -------
        None
        """
        # first position not covered by any domain seen so far
        start_gap = 1
        for position, domain_interval in enumerate(domains_srt):
            if domain_interval.begin - start_gap + 1 > self.gap_min_size:
                if position == 0:
                    assert domain_interval.begin > 1, "AssertionError: Start gap can be added if the very first domain is not starting at 1."
                self.domains_with_gaps.append(
                    Domain(
                        self.construct_gap_hitline(start_gap,
                                                   domain_interval.begin - 1)))
            self.domains_with_gaps.append(domain_interval.data)
            # Never move backwards: a short domain nested in a longer earlier
            # one must not re-open a gap over sequence that is covered. This
            # also makes an explicit "overlaps the previous domain" check
            # unnecessary, because an overlapping domain begins before
            # start_gap.
            start_gap = max(start_gap, domain_interval.end + 1)

        # End GAP: start_gap is now one past the maximum end of all domains
        # (or 1 when there are no domains at all).
        if self.length - start_gap + 1 > self.gap_min_size:
            self.domains_with_gaps.append(
                Domain(self.construct_gap_hitline(start_gap, self.length)))

    def add_gaps(self, domains_srt):
        """
        Add gaps in domain annotations

        Fills self.domains_with_gaps with every given domain, inserting a
        GAP domain wherever the uncovered stretch in front of a domain, or
        after the last covered position, is long enough.

        Parameters
        ----------
        domains_srt : list of Domain
            domain sorted per start/end position

        Returns
        -------
        None
        """
        # first position not covered by any domain seen so far
        start_gap = 1
        for domain in domains_srt:
            # check for GAP in the start and middle of the protein
            # |---   ---   protein   ---   ---|
            # |--dom1--|       |--dom2--|
            #          |  GAP  |
            if domain.start_pos - start_gap + 1 > self.gap_min_size:
                # the gap stops right before the domain starts
                self.domains_with_gaps.append(
                    Domain(
                        self.construct_gap_hitline(start_gap,
                                                   domain.start_pos - 1)))
            self.domains_with_gaps.append(domain)
            # never move backwards past sequence that is already covered
            start_gap = max(start_gap, domain.end_pos + 1)

        # check for gap in the end of the protein seq
        # |---   ---   protein   ---   ---|
        # |--dom1--|   |--dom2--|
        #                       |   GAP   |
        # last_end is 0 when no domain was given, so this no longer fails
        # on an empty list.
        last_end = start_gap - 1
        if self.length - last_end + 1 > self.gap_min_size:
            self.domains_with_gaps.append(
                Domain(self.construct_gap_hitline(start_gap, self.length)))

    def to_tabs_no_redundant(self):
        """
        Convert tabular info for protein in no redundant domain annotations
        (tabular output as well)

        Parameters
        ----------

        Returns
        -------
        str
            no redundant domain tabular output line:
            uniprot id, interpro ids, evidence db ids (tab separated,
            ids space separated, newline terminated)
        """
        # find no redundant domains with maximum length
        domains_no_redundant_max_len = self.find_no_redundant_max_len()
        # sort by start position
        domains_no_redundant_max_len.sort(key=lambda domain: domain.begin)

        if self.with_gap:
            self.add_gaps_no_redundant(domains_no_redundant_max_len)
            # sort by start position
            self.domains_with_gaps.sort(key=lambda domain: domain.start_pos)
            domains = self.domains_with_gaps
        else:
            domains = [node.data for node in domains_no_redundant_max_len]

        domains_no_redundant = " ".join(
            [domain.interpro_id for domain in domains])
        domains_evidence_db_ids = " ".join(
            [domain.evidence_db_id for domain in domains])
        return self.uniprot_id + "\t" + domains_no_redundant + "\t" + domains_evidence_db_ids + "\n"

    def to_tabs_no_overlap(self):
        """
        Convert tabular info for protein in no overlapping domain annotations
        (tabular output as well)

        Parameters
        ----------

        Returns
        -------
        str
            no overlapping domain tabular output line:
            uniprot id, interpro ids, evidence db ids (tab separated,
            ids space separated, newline terminated)
        """
        # find non overlaping domains with maximum length
        domains_no_overlap_max_len = self.find_no_overlap_max_len()
        # sort by start position
        domains_no_overlap_max_len.sort(key=lambda domain: domain.start_pos)

        if self.with_gap:
            self.add_gaps(domains_no_overlap_max_len)
            domains = self.domains_with_gaps
        else:
            domains = domains_no_overlap_max_len

        domains_no_overlap = " ".join(
            [domain.interpro_id for domain in domains])
        domains_evidence_db_ids = " ".join(
            [domain.evidence_db_id for domain in domains])
        return self.uniprot_id + "\t" + domains_no_overlap + "\t" + domains_evidence_db_ids + "\n"

    def to_tabs_overlap(self):
        """
        Convert tabular info for protein in overlapping domain annotations
        (tabular output as well)

        Parameters
        ----------

        Returns
        -------
        str
            overlapping domain tabular output line:
            uniprot id, interpro ids, evidence db ids (tab separated,
            ids space separated, newline terminated)
        """
        if self.with_gap:
            # for gaps the domains are handed over in sorted key order
            self.add_gaps(
                [self.domains[start_stop] for start_stop in sorted(self.domains)])
            domains = self.domains_with_gaps
        else:
            # without gaps the insertion order of self.domains is kept
            domains = list(self.domains.values())

        domains_overlap = " ".join([domain.interpro_id for domain in domains])
        domains_evid_db_ids = " ".join(
            [domain.evidence_db_id for domain in domains])
        return self.uniprot_id + "\t" + domains_overlap + "\t" + domains_evid_db_ids + "\n"