def getAED(query, reference):
    '''
    Calculate the annotation edit distance (AED) between two mRNA
    transcripts given as exon coordinates.

        AED = 1 - ((SN + SP) / 2)
        SN  = overlap / summed reference exon length (fraction of ref predicted)
        SP  = overlap / summed query exon length (fraction of prediction
              overlapping the ref)

    Parameters
    ----------
    query, reference : sequences of (start, end) coordinate pairs.
        `reference` is handed to `InterLap` to build the overlap index.

    Returns
    -------
    str
        AED formatted with three decimals. '0.000' is returned when both
        inputs compare equal, and also when either summed length is zero.
    '''
    def _length(listTup):
        # Summed |start - end| over every interval.
        return sum(abs(i[0] - i[1]) for i in listTup)

    # Identical models have distance zero; skip building the index.
    if query == reference:
        return '0.000'

    rLen = _length(reference)
    refInterlap = InterLap(reference)
    QueryOverlap = 0
    qLen = 0
    for exon in query:
        qLen += abs(exon[0] - exon[1])
        if exon not in refInterlap:
            # No reference exon overlaps this query exon.
            continue
        for h in refInterlap.find(exon):
            # Query minus hit at each end: the signs tell which interval
            # sticks out on which side.
            start_delta = exon[0] - h[0]
            end_delta = exon[1] - h[1]
            if start_delta <= 0 and end_delta >= 0:
                # Query exon covers the whole reference exon.
                cov = abs(h[0] - h[1])
            elif start_delta <= 0:
                # Query starts at/before the hit and ends inside it.
                cov = abs(h[0] - exon[1])
            elif end_delta >= 0:
                # Query starts inside the hit and ends at/after it.
                cov = abs(exon[0] - h[1])
            else:
                # Query exon lies entirely inside the reference exon.
                cov = abs(exon[0] - exon[1])
            QueryOverlap += cov

    # Calculate AED; with a zero-length side the ratios are undefined and
    # the distance is reported as 0.
    if qLen > 0 and rLen > 0:
        SP = QueryOverlap / float(qLen)
        SN = QueryOverlap / float(rLen)
        AED = 1 - ((SN + SP) / 2)
    else:
        AED = 0.000
    return '{:.3f}'.format(AED)
def _find_contacts(self, mob_traces):
    """Find contacts in a given list `mob_traces` of `Visit`s

    Returns a dict keyed by every individual `i` in
    `range(self.num_people)`; `contacts[i][j]` is an `InterLap` holding
    the `Contact`s from `i` to `j`.
    """
    # Bucket the visits by the site they took place at
    visits_by_site = defaultdict(list)
    for visit in mob_traces:
        visits_by_site[visit.site].append(visit)

    # contacts[i][j] = interval collection of contacts from i to j
    contacts = {
        person: defaultdict(InterLap)
        for person in range(self.num_people)
    }

    for site in range(self.num_sites):
        if self.verbose:
            progress = 'Checking site ' + str(site + 1) + '/' + str(self.num_sites)
            print(progress, end='\r')

        site_visits = visits_by_site[site]
        if not site_visits:
            continue

        # Interval overlap matcher over all visits at this site
        matcher = InterLap()
        matcher.update(site_visits)

        for visit in site_visits:
            window = (visit.t_from, visit.t_to)
            for other in matcher.find(other=window):
                # A visit never produces a contact with the same individual
                if visit.indiv == other.indiv:
                    continue

                # The other visit is taken up to its shifted end time
                start = max(visit.t_from, other.t_from)
                stop = min(visit.t_to, other.t_to_shifted)
                if not (stop > start):
                    continue

                contact = Contact(t_from=start,
                                  t_to=stop,
                                  indiv_i=visit.indiv,
                                  indiv_j=other.indiv,
                                  id_tup=(visit.id, other.id),
                                  site=site,
                                  duration=stop - start)
                contacts[visit.indiv][other.indiv].update([contact])

    return contacts
class UpperBoundCasesBetaMultiplier(BetaMultiplierMeasure):
    """Beta multiplier that is applied only while recent positive tests
    exceed a population-scaled threshold."""

    def __init__(self,
                 t_window,
                 beta_multiplier,
                 max_pos_tests_per_week_per_100k,
                 intervention_times=None,
                 init_active=False):
        """
        Additional parameters:
        ----------------------
        max_pos_tests_per_week_per_100k : number
            Threshold of positive tests per week per 100k people; it is
            rescaled to the simulated population in `init_run`. If the
            count of recent positive tests exceeds the rescaled value
            the measure becomes active
        intervention_times : list of floats
            List of points in time at which interventions can be changed.
            If 'None' interventions can be changed at any time
        init_active : bool
            If true measure is active in the first week of the simulation
            when there are no test counts yet
        """
        super().__init__(t_window, beta_multiplier)
        self.max_pos_tests_per_week_per_100k = max_pos_tests_per_week_per_100k
        self.intervention_times = intervention_times
        # Intervals (t_start, t_end, True) during which the measure has
        # already been decided to be active
        self.intervention_history = InterLap()
        if init_active:
            first_week = (t_window.left, t_window.left + 7 * 24 - EPS, True)
            self.intervention_history.update([first_week])

    def init_run(self, n_people, n_visits):
        """Scale the per-100k threshold to `n_people` and mark the measure
        as initialized. `n_visits` is not used here."""
        per_capita = self.max_pos_tests_per_week_per_100k / 100000
        self.scaled_test_threshold = per_capita * n_people
        self._is_init = True

    @enforce_init_run
    def _is_measure_active(self, t, t_pos_tests):
        """Return whether the measure is active at time `t`, recording a
        new active interval in the history when it switches on."""
        if self.intervention_times is not None:
            # Snap `t` to the largest intervention time strictly before it.
            # NOTE(review): if no intervention time precedes `t`, every
            # candidate is `inf` and `argmin` picks index 0, so `t` becomes
            # the first entry -- confirm this is intended.
            candidates = np.asarray(self.intervention_times)
            elapsed = t - candidates
            idx = np.where(elapsed > 0, elapsed, np.inf).argmin()
            t = candidates[idx]

        recorded = list(self.intervention_history.find((t, t)))
        if recorded:
            # A stored interval already covers `t`; reuse its flag
            return recorded[0][2]

        is_active = self._are_cases_above_threshold(t, t_pos_tests)
        if is_active:
            self.intervention_history.update([(t, t+7*24 - EPS, True)])
        return is_active

    @enforce_init_run
    def _are_cases_above_threshold(self, t, t_pos_tests):
        """Return whether the number of entries of `t_pos_tests` strictly
        between `t - 7 * 24` and `t` exceeds the scaled threshold."""
        window_start = t - 7 * 24
        after_start = np.greater(t_pos_tests, window_start)
        before_now = np.less(t_pos_tests, t)
        num_pos_tests = np.sum(after_start * before_now)
        return num_pos_tests > self.scaled_test_threshold

    @enforce_init_run
    def beta_factor(self, *, typ, t, t_pos_tests):
        """Returns the multiplicative factor for site type `typ` at time `t`.
        The factor is one if `t` is not in the active time window of the
        measure, or if the measure is not active at `t`.
        """
        if self._in_window(t) and self._is_measure_active(t, t_pos_tests):
            return self.beta_multiplier[typ]
        return 1.0
class UpperBoundCasesSocialDistancing(SocialDistancingForAllMeasure):
    """Social distancing for all that is enforced only while recent
    positive tests exceed a population-scaled threshold."""

    def __init__(self,
                 t_window,
                 p_stay_home,
                 max_pos_tests_per_week_per_100k,
                 intervention_times=None,
                 init_active=False):
        """
        Additional parameters:
        ----------------------
        max_pos_tests_per_week_per_100k : number
            Threshold of positive tests per week per 100k people; it is
            rescaled to the simulated population in `init_run`. If the
            count of recent positive tests exceeds the rescaled value
            the measure becomes active
        intervention_times : list of floats
            List of points in time at which measures can become active.
            If 'None' measures can be changed at any time
        init_active : bool
            If true, an active interval starting at `t_window.left` and
            lasting `7 * 24 - EPS` is recorded up front
        """
        super().__init__(t_window, p_stay_home)
        self.max_pos_tests_per_week_per_100k = max_pos_tests_per_week_per_100k
        self.intervention_times = intervention_times
        # Intervals (t_start, t_end, True) during which the measure has
        # already been decided to be active
        self.intervention_history = InterLap()
        if init_active:
            first_week = (t_window.left, t_window.left + 7 * 24 - EPS, True)
            self.intervention_history.update([first_week])

    def init_run(self, n_people, n_visits):
        """Run the parent initialization, then scale the per-100k
        threshold to `n_people`."""
        super().init_run(n_people, n_visits)
        per_capita = self.max_pos_tests_per_week_per_100k / 100000
        self.scaled_test_threshold = per_capita * n_people

    def _is_measure_active(self, t, t_pos_tests):
        """Return whether the measure is active at time `t`, recording a
        new active interval in the history when it switches on."""
        if self.intervention_times is not None:
            # Snap `t` to the largest intervention time strictly before it.
            # NOTE(review): if no intervention time precedes `t`, every
            # candidate is `inf` and `argmin` picks index 0, so `t` becomes
            # the first entry -- confirm this is intended.
            candidates = np.asarray(self.intervention_times)
            elapsed = t - candidates
            idx = np.where(elapsed > 0, elapsed, np.inf).argmin()
            t = candidates[idx]

        recorded = list(self.intervention_history.find((t, t)))
        if recorded:
            # A stored interval already covers `t`; reuse its flag
            return recorded[0][2]

        is_active = self._are_cases_above_threshold(t, t_pos_tests)
        if is_active:
            self.intervention_history.update([(t, t + 7 * 24 - EPS, True)])
        return is_active

    def _are_cases_above_threshold(self, t, t_pos_tests):
        """Return whether the number of entries of `t_pos_tests` strictly
        between `t - 7 * 24` and `t` exceeds the scaled threshold."""
        window_start = t - 7 * 24
        after_start = np.greater(t_pos_tests, window_start)
        before_now = np.less(t_pos_tests, t)
        num_pos_tests = np.sum(after_start * before_now)
        return num_pos_tests > self.scaled_test_threshold

    @enforce_init_run
    def is_contained(self, *, j, j_visit_id, t, t_pos_tests):
        """Indicate if individual `j` respects measure for visit `j_visit_id`
        """
        if self._in_window(t):
            stays_home = self.bernoulli_stay_home[j, j_visit_id]
            # The activity check is skipped when the individual does not
            # stay home for this visit
            return stays_home and self._is_measure_active(t, t_pos_tests)
        return False

    @enforce_init_run
    def is_contained_prob(self, *, j, t, t_pos_tests):
        """Returns probability of containment for individual `j` at time `t`
        """
        if self._in_window(t) and self._is_measure_active(t, t_pos_tests):
            return self.p_stay_home
        return 0.0
def _find_mob_trace_overlaps(self, sites, mob_traces_at_site, infector_mob_traces_at_site, tmin, for_all_individuals):
    """Collect `Contact`s between infector visits and all visits, per site.

    For every site in `sites`, each visit in
    `infector_mob_traces_at_site[s]` whose shifted end lies after `tmin`
    is matched against the visits in `mob_traces_at_site[s]`.

    Returns `{i: defaultdict(InterLap)}` for all individuals when
    `for_all_individuals` is true (contacts[i][j] = contacts from i to j),
    otherwise a single `InterLap` holding every contact found.
    """
    # The storage layout depends on how the function is used (all or individual)
    # FIXME: this could be done in a cleaner way by calling this function several times in `_find_contacts`
    if for_all_individuals:
        contacts = {
            person: defaultdict(InterLap)
            for person in range(self.num_people)
        }
    else:
        contacts = InterLap()

    if self.verbose and for_all_individuals:
        print()  # otherwise jupyter notebook looks ugly

    for s in sites:
        if self.verbose and for_all_individuals:
            progress = 'Checking site ' + str(s + 1) + '/' + str(len(sites))
            print(progress, end='\r')

        if len(mob_traces_at_site[s]) == 0:
            continue

        # Interval overlap matcher over all visits at this site
        matcher = InterLap()
        matcher.update(mob_traces_at_site[s])

        for v_inf in infector_mob_traces_at_site[s]:
            # Skip if the delta-extended infector visit ends before `tmin`
            if not (v_inf.t_to_shifted > tmin):
                continue

            window = (v_inf.t_from, v_inf.t_to_shifted)
            # Anybody whose visit overlaps the infector's extended visit
            for v in matcher.find(other=window):
                if v.indiv == v_inf.indiv:
                    continue

                start = max(v.t_from, v_inf.t_from)
                stop = min(v.t_to, v_inf.t_to_shifted)
                if not (stop > start and stop > tmin):
                    continue

                # Note 1: Contact always considers delta overlap for `indiv_j`
                #   (i.e. for `indiv_j` being the infector)
                # Note 2: Contact contains the delta-extended visit of `indiv_j`
                #   (i.e. there is a `Contact` even when `indiv_j` never
                #   overlapped physically with `indiv_i`, so the dY_i
                #   integral needs to adjust for that)
                contact = Contact(t_from=start,
                                  t_to=stop,
                                  indiv_i=v.indiv,
                                  indiv_j=v_inf.indiv,
                                  id_tup=(v.id, v_inf.id),
                                  site=s,
                                  duration=stop - start)

                if for_all_individuals:
                    # Dictionary of all contacts
                    contacts[v.indiv][v_inf.indiv].update([contact])
                else:
                    # All contacts of the (infector) individual only
                    contacts.update([contact])

    return contacts
def findUTRs(cds, mrna, strand):
    '''
    Take a list of lists of CDS coordinates and compare it to a list of
    lists of mRNA coordinates to determine whether a 5 prime and/or
    3 prime UTR exists for each transcript.

    Multiple transcripts are supported; `cds[i]` and `mrna[i]` are expected
    to be already matched up and sorted.

    Parameters
    ----------
    cds, mrna : lists (one entry per transcript) of (start, end) pairs
    strand : '+' for the plus strand; any other value is treated as minus

    Returns
    -------
    list of (Fiveprime, Threeprime) bool tuples, one per transcript
    '''
    def _utr_present(cds_exon, exons, lookup, terminal_index, compare_start):
        # One end of one transcript: does the mRNA extend beyond the CDS?
        if cds_exon not in lookup:
            # CDS exon does not overlap the mRNA at all: nothing to report.
            return False
        hit = list(lookup.find(cds_exon))[0]
        if exons.index(hit) != terminal_index:
            # The CDS ends in an inner exon, so whole exons lie beyond it.
            return True
        # Same terminal exon: a UTR exists only if the CDS stops short of
        # the exon boundary on the relevant side.
        if compare_start:
            return bool(cds_exon[0] - hit[0] > 0)
        return bool(cds_exon[1] - hit[1] < 0)

    plus = strand == '+'
    UTRs = []
    for i, cds_exons in enumerate(cds):
        exons = mrna[i]
        refInterlap = InterLap(exons)
        # The first CDS exon decides the 5 prime end and the last CDS exon
        # the 3 prime end. On '+' the 5 prime test looks at start
        # coordinates and the 3 prime test at end coordinates; for any
        # other strand the two coordinate tests are swapped.
        Fiveprime = _utr_present(cds_exons[0], exons, refInterlap,
                                 0, plus)
        Threeprime = _utr_present(cds_exons[-1], exons, refInterlap,
                                  len(exons) - 1, not plus)
        UTRs.append((Fiveprime, Threeprime))
    return UTRs
# Compare two annotators' TextGrid tiers file by file.
# `br` and `rr` grow in lockstep: br[k] is the label of the k-th interval of
# a "B" tier, rr[k] is the label picked from the overlapping "R" intervals,
# or "*" when no "R" interval overlaps it.
br = []
rr = []
bc = 0 # running total of intervals in the "B" (Bouke) tiers
rc = 0 # running total of intervals in the "R" (Raoul) tiers
for x in files:
    # Fresh overlap index per file, filled with that file's "R" intervals
    inter = InterLap()
    # tiers[1]: second tier of each TextGrid
    b = tgt.read_textgrid(mypath + x + "B.TextGrid").tiers[1]
    r = tgt.read_textgrid(mypath + x + "R.TextGrid").tiers[1]
    bc += len(b)
    rc += len(r)
    # presumably convert_to_float yields (start, end, label) -- index 2 is
    # used as the label below; verify against its definition
    inter.add([convert_to_float(i) for i in r])
    # Every "R" interval that overlapped at least one "B" interval in this file
    tot_overlaps = set()
    for i in b:
        interval = convert_to_float(i)
        overlaps = list(inter.find(interval))
        #print(interval[2])
        if (len(overlaps) > 0):
            # tuples are hashable, so they can be stored in the set
            overlaps = [tuple(x) for x in overlaps]
            for o in overlaps:
                tot_overlaps.add(o)
            # several "R" intervals may overlap: keep what most_common returns
            # for their labels
            rr.append(most_common([o[2] for o in overlaps]))
            br.append(interval[2])
        else:
            # no overlapping "R" interval: "*" marks the missing counterpart
            rr.append("*")
            br.append(interval[2])
    #check for unmatched from Raoul
def read_exons(gtf, chrom, cutoff, coverage_array, exclude):
    """Read coding exons from a GTF query and work out where to split them.

    Parameters
    ----------
    gtf : str
        Command string starting with "|" (asserted) that is opened with
        `ts.nopen`; expected to be a tabix query for a single chromosome.
    chrom : str
        Chromosome name substituted into the tabix query built for each
        file in `exclude`.
    cutoff : number
        Coverage threshold; stretches of an exon whose coverage is below
        it become splitter intervals.
    coverage_array :
        Indexed by position with slices; the slice must provide `.min()`
        (presumably a NumPy array -- confirm with the caller).
    exclude : iterable of str
        Tabix-indexed files whose columns 2 and 3 are read as integer
        start/end; any such interval overlapping an exon is also added to
        that exon's splitters (e.g. self-chains, segdups).

    Returns
    -------
    (starts, ends, splits) : tuple of dicts keyed by (chrom, gene_name)
        `starts`/`ends` hold the sorted exon start and end coordinates;
        `splits` holds the splitter intervals for keys that have any.
    """
    genes = defaultdict(IntervalSet)
    splitters = defaultdict(IntervalSet)
    split_iv = InterLap()

    # preempt any bugs by checking that we are getting a particular chrom
    assert gtf[0] == "|", (
        "expecting a tabix query so we can handle chroms correctly")

    for bed in exclude:
        # expecting a tabix query so we can handle chroms correctly
        query = "|tabix {bed} {chrom}".format(chrom=chrom, bed=bed)

        # any file that gets sent in will be used to split regions (just like
        # low-coverage). For example, we split on self-chains as well.
        # NOTE: skip this loop to disable filtering by self-chains/segdups.
        for toks in (x.strip().split("\t") for x in ts.nopen(query)):
            s, e = int(toks[1]), int(toks[2])
            split_iv.add((s, e))

    for toks in (x.rstrip('\r\n').split("\t")
                 for x in ts.nopen(gtf) if x[0] != "#"):
        # Keep only CDS / stop_codon features from protein_coding sources.
        # The source is compared with `!=`: `not in ("protein_coding")`
        # tested substring membership in a plain string, which let values
        # such as "" or "coding" through.
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding":
            continue
        start, end = map(int, toks[3:5])
        gene = toks[8].split('gene_name "')[1].split('"', 1)[0]
        assert start <= end, toks
        key = toks[0], gene

        # find sections of exon under certain coverage.
        # NOTE: skip this block to disable coverage cutoff filtering.
        coverage = coverage_array[start - 1:end]
        # the scan is only run when at least one position is below cutoff
        if coverage.min() < cutoff:
            is_under, locs = False, []
            if coverage[0] < cutoff:
                # exon already starts below the cutoff
                locs.append([start - 1])
                is_under = True
            # `pos` runs over the positions after the first one, counted
            # from the GTF `start`
            for pos, v in enumerate(coverage[1:], start=start):
                if v < cutoff:
                    if not is_under:
                        is_under = True
                        # start; coverage is in bed format, so pos - 1 is
                        # necessary, since splitters are open on the left
                        # and right side
                        locs.append([pos - 1])
                elif is_under:
                    is_under = False
                    locs[-1].append(pos)  # end
            if is_under:
                # still below the cutoff: close the run at the exon end
                locs[-1].append(end)
            # a real list rather than a lazy `map`, identical on Python 2
            splitters[key].add([tuple(loc) for loc in locs])

        for s, e in split_iv.find((start - 1, end)):
            splitters[key].add([(s, e)])

        # converts GTF exon coordinates to BED format (subtracts 1 from
        # exon start)
        genes[key].add([(start - 1, end)])

    # sort by start so we can do binary search.
    genes = dict((k, sorted(v._vals)) for k, v in genes.items())
    splits, starts, ends = {}, {}, {}
    splitters = dict(splitters)
    for chrom_gene, sends in genes.items():
        starts[chrom_gene] = [s[0] for s in sends]
        ends[chrom_gene] = [s[1] for s in sends]
        if chrom_gene in splitters:
            splits[chrom_gene] = splitters[chrom_gene]._vals
    return starts, ends, splits