Ejemplo n.º 1
0
def getAED(query, reference):
    '''
    Calculate the annotation edit distance (AED) between two mRNA transcripts.

    AED = 1 - ((SN + SP) / 2)
    SN = fraction of the reference covered by the prediction
         (overlap / total reference exon length)
    SP = fraction of the prediction that overlaps the reference
         (overlap / total query exon length)

    query and reference are sequences of (start, end) exon coordinates.

    Returns the AED as a string formatted to three decimals. '0.000' is
    returned when the two inputs compare equal, and also when either input
    has a total exon length of zero.
    '''
    def _span(exon):
        # length of one exon, independent of coordinate order
        return abs(exon[0] - exon[1])

    # identical models have zero distance; no interval search needed
    if query == reference:
        return '0.000'

    refLen = sum(_span(exon) for exon in reference)
    refInterlap = InterLap(reference)
    queryLen = 0
    overlap = 0
    for exon in query:
        queryLen += _span(exon)
        # find() yields only the reference exons that overlap this exon, so
        # a separate membership test beforehand would just search twice
        for hit in refInterlap.find(exon):
            # shared region runs from the later start to the earlier end
            overlap += abs(min(exon[1], hit[1]) - max(exon[0], hit[0]))

    # calculate AED
    if queryLen > 0 and refLen > 0:
        SP = overlap / float(queryLen)
        SN = overlap / float(refLen)
        AED = 1 - ((SN + SP) / 2)
    else:
        AED = 0.000
    return '{:.3f}'.format(AED)
Ejemplo n.º 2
0
    def _find_contacts(self, mob_traces):
        """Build the contact index for every pair of individuals.

        `mob_traces` is an iterable of visits exposing `site`, `indiv`, `id`,
        `t_from`, `t_to` and `t_to_shifted`.

        Returns a dict `contacts` where `contacts[i][j]` is an `InterLap`
        holding the `Contact`s from individual `i` to individual `j`.
        """
        # Bucket the visits by the site they happened at
        visits_by_site = defaultdict(list)
        for visit in mob_traces:
            visits_by_site[visit.site].append(visit)

        # One lazily-filled mapping "other individual -> InterLap" per person
        contacts = {}
        for person in range(self.num_people):
            contacts[person] = defaultdict(InterLap)

        for s in range(self.num_sites):
            if self.verbose:
                print('Checking site ' + str(s + 1) + '/' +
                      str(self.num_sites),
                      end='\r')

            site_visits = visits_by_site[s]
            if len(site_visits) == 0:
                continue

            # Interval index over all visits at this site
            matcher = InterLap()
            matcher.update(site_visits)

            for v in site_visits:
                for vo in matcher.find(other=(v.t_from, v.t_to)):
                    # A person cannot be in contact with themselves
                    if v.indiv == vo.indiv:
                        continue

                    # Contact spans from the later arrival to the earlier of
                    # `v`'s departure and `vo`'s shifted departure
                    begin = max(v.t_from, vo.t_from)
                    finish = min(v.t_to, vo.t_to_shifted)
                    if not finish > begin:
                        continue

                    contact = Contact(t_from=begin,
                                      t_to=finish,
                                      indiv_i=v.indiv,
                                      indiv_j=vo.indiv,
                                      id_tup=(v.id, vo.id),
                                      site=s,
                                      duration=finish - begin)
                    contacts[v.indiv][vo.indiv].update([contact])

        return contacts
Ejemplo n.º 3
0
class UpperBoundCasesBetaMultiplier(BetaMultiplierMeasure):
    """Beta multiplier that is only applied while recent positive tests
    exceed a population-scaled threshold."""

    def __init__(self, t_window, beta_multiplier, max_pos_tests_per_week_per_100k, intervention_times=None, init_active=False):
        """
        Additional parameters:
        ----------------------
        max_pos_tests_per_week_per_100k : number
            Threshold of positive tests per week per 100k people; once the
            (population-scaled) count exceeds it the measure becomes active
        intervention_times : list of floats
            Points in time at which interventions can be changed. If 'None'
            interventions can be changed at any time
        init_active : bool
            If true the measure is recorded as active for the first
            `7 * 24` time units of `t_window`, when there are no test
            counts yet
        """
        super().__init__(t_window, beta_multiplier)
        self.max_pos_tests_per_week_per_100k = max_pos_tests_per_week_per_100k
        self.intervention_times = intervention_times
        # Stores (t_start, t_end, is_active) records of past decisions
        self.intervention_history = InterLap()
        if init_active:
            first_week = (t_window.left, t_window.left + 7 * 24 - EPS, True)
            self.intervention_history.update([first_week])

    def init_run(self, n_people, n_visits):
        """Scale the per-100k threshold to a population of `n_people`."""
        per_capita = self.max_pos_tests_per_week_per_100k / 100000
        self.scaled_test_threshold = per_capita * n_people
        self._is_init = True

    @enforce_init_run
    def _is_measure_active(self, t, t_pos_tests):
        """Return whether the measure is active at time `t`, recording a new
        active period in the history when the threshold is newly exceeded."""
        if self.intervention_times is not None:
            # Snap `t` to the closest intervention time strictly before it.
            # NOTE(review): when no intervention time lies before `t`, argmin
            # over an all-inf array picks index 0 — confirm this is intended.
            grid = np.asarray(self.intervention_times)
            elapsed = t - grid
            idx = np.where(elapsed > 0, elapsed, np.inf).argmin()
            t = grid[idx]

        recorded = list(self.intervention_history.find((t, t)))
        if recorded:
            # A decision covering `t` already exists: reuse its flag
            return recorded[0][2]

        is_active = self._are_cases_above_threshold(t, t_pos_tests)
        if is_active:
            self.intervention_history.update([(t, t + 7 * 24 - EPS, True)])
        return is_active

    @enforce_init_run
    def _are_cases_above_threshold(self, t, t_pos_tests):
        """Return whether the number of positive-test times strictly inside
        `(t - 7 * 24, t)` exceeds the scaled threshold."""
        window_start = t - 7 * 24
        in_window = np.logical_and(np.greater(t_pos_tests, window_start),
                                   np.less(t_pos_tests, t))
        return np.sum(in_window) > self.scaled_test_threshold

    @enforce_init_run
    def beta_factor(self, *, typ, t, t_pos_tests):
        """Returns the multiplicative factor for site type `typ` at time `t`.
        The factor is one if `t` is outside the measure's time window or the
        measure is not currently active.
        """
        if not self._in_window(t):
            return 1.0

        if self._is_measure_active(t, t_pos_tests):
            return self.beta_multiplier[typ]
        return 1.0
Ejemplo n.º 4
0
class UpperBoundCasesSocialDistancing(SocialDistancingForAllMeasure):
    """Social distancing for everyone that is only enforced while recent
    positive tests exceed a population-scaled threshold."""

    def __init__(self, t_window, p_stay_home, max_pos_tests_per_week_per_100k, intervention_times=None, init_active=False):
        """
        Additional parameters:
        ----------------------
        max_pos_tests_per_week_per_100k : number
            Threshold of positive tests per week per 100k people; once the
            (population-scaled) count exceeds it the measure becomes active
        intervention_times : list of floats
            Points in time at which measures can become active. If 'None'
            measures can be changed at any time
        init_active : bool
            If true the measure is recorded as active for the first
            `7 * 24` time units of `t_window`
        """
        super().__init__(t_window, p_stay_home)
        self.max_pos_tests_per_week_per_100k = max_pos_tests_per_week_per_100k
        self.intervention_times = intervention_times
        # Stores (t_start, t_end, is_active) records of past decisions
        self.intervention_history = InterLap()
        if init_active:
            first_week = (t_window.left, t_window.left + 7 * 24 - EPS, True)
            self.intervention_history.update([first_week])

    def init_run(self, n_people, n_visits):
        """Run the parent initialisation, then scale the per-100k threshold
        to a population of `n_people`."""
        super().init_run(n_people, n_visits)
        per_capita = self.max_pos_tests_per_week_per_100k / 100000
        self.scaled_test_threshold = per_capita * n_people

    def _is_measure_active(self, t, t_pos_tests):
        """Return whether the measure is active at time `t`, recording a new
        active period in the history when the threshold is newly exceeded."""
        if self.intervention_times is not None:
            # Snap `t` to the closest intervention time strictly before it.
            # NOTE(review): when no intervention time lies before `t`, argmin
            # over an all-inf array picks index 0 — confirm this is intended.
            grid = np.asarray(self.intervention_times)
            elapsed = t - grid
            idx = np.where(elapsed > 0, elapsed, np.inf).argmin()
            t = grid[idx]

        recorded = list(self.intervention_history.find((t, t)))
        if recorded:
            # A decision covering `t` already exists: reuse its flag
            return recorded[0][2]

        is_active = self._are_cases_above_threshold(t, t_pos_tests)
        if is_active:
            self.intervention_history.update([(t, t + 7 * 24 - EPS, True)])
        return is_active

    def _are_cases_above_threshold(self, t, t_pos_tests):
        """Return whether the number of positive-test times strictly inside
        `(t - 7 * 24, t)` exceeds the scaled threshold."""
        window_start = t - 7 * 24
        in_window = np.logical_and(np.greater(t_pos_tests, window_start),
                                   np.less(t_pos_tests, t))
        return np.sum(in_window) > self.scaled_test_threshold

    @enforce_init_run
    def is_contained(self, *, j, j_visit_id, t, t_pos_tests):
        """Indicate if individual `j` respects measure for visit `j_visit_id`
        """
        if not self._in_window(t):
            return False

        stays_home = self.bernoulli_stay_home[j, j_visit_id]
        if not stays_home:
            # Skip the activity check (and its history update) entirely
            return stays_home
        return self._is_measure_active(t, t_pos_tests)

    @enforce_init_run
    def is_contained_prob(self, *, j, t, t_pos_tests):
        """Returns probability of containment for individual `j` at time `t`
        """
        if not self._in_window(t):
            return 0.0

        return self.p_stay_home if self._is_measure_active(t, t_pos_tests) else 0.0
Ejemplo n.º 5
0
    def _find_mob_trace_overlaps(self, sites, mob_traces_at_site,
                                 infector_mob_traces_at_site, tmin,
                                 for_all_individuals):
        """Collect contacts between infector visits and all visits per site.

        For every site in `sites`, each visit in
        `infector_mob_traces_at_site[s]` (extended to `t_to_shifted`) is
        matched against the visits in `mob_traces_at_site[s]`. Contacts that
        end at or before `tmin` are dropped.

        Returns a dict `contacts[i][j] -> InterLap` of `Contact`s when
        `for_all_individuals` is true, otherwise a single `InterLap`
        containing every `Contact` found.
        """
        # FIXME: this could be done in a cleaner way by calling this function several times in `_find_contacts`
        if for_all_individuals:
            # contacts[i][j][k] = "k-th contact from i to j"
            contacts = {}
            for person in range(self.num_people):
                contacts[person] = defaultdict(InterLap)
        else:
            contacts = InterLap()

        if self.verbose and for_all_individuals:
            print()  # otherwise jupyter notebook looks ugly

        for s in sites:
            if self.verbose and for_all_individuals:
                print('Checking site ' + str(s + 1) + '/' + str(len(sites)),
                      end='\r')

            site_visits = mob_traces_at_site[s]
            if len(site_visits) == 0:
                continue

            # Interval index over every visit at this site
            matcher = InterLap()
            matcher.update(site_visits)

            for v_inf in infector_mob_traces_at_site[s]:
                # Delta-extended visit that ends by `tmin` cannot contribute
                if not v_inf.t_to_shifted > tmin:
                    continue

                window = (v_inf.t_from, v_inf.t_to_shifted)
                for v in matcher.find(other=window):
                    # Ignore overlaps of the infector with themselves
                    if v.indiv == v_inf.indiv:
                        continue

                    begin = max(v.t_from, v_inf.t_from)
                    finish = min(v.t_to, v_inf.t_to_shifted)
                    if not (finish > begin and finish > tmin):
                        continue

                    # Note 1: Contact always considers delta overlap for `indiv_j`
                    # (i.e. for `indiv_j` being the infector)
                    # Note 2: Contact contains the delta-extended visit of `indiv_j`
                    # (i.e. there is a `Contact` even when `indiv_j` never overlapped physically with `indiv_i`)
                    # (i.e. need to adjust for that in dY_i integral)
                    contact = Contact(t_from=begin,
                                      t_to=finish,
                                      indiv_i=v.indiv,
                                      indiv_j=v_inf.indiv,
                                      id_tup=(v.id, v_inf.id),
                                      site=s,
                                      duration=finish - begin)

                    if for_all_individuals:
                        target = contacts[v.indiv][v_inf.indiv]
                    else:
                        target = contacts
                    target.update([contact])
        return contacts
Ejemplo n.º 6
0
def findUTRs(cds, mrna, strand):
    '''
    Compare CDS coordinates against mRNA exon coordinates, transcript by
    transcript, and report whether a 5 prime and/or 3 prime UTR exists.

    cds and mrna are parallel lists (one entry per transcript) of lists of
    (start, end) coordinates; they are expected to be matched up and sorted
    already. Any strand other than '+' is handled as the reverse strand.

    Returns a list of (Fiveprime, Threeprime) boolean tuples.
    '''
    def _has_utr(lookup, exons, cds_exon, terminal_loc, compare_start):
        # CDS piece that overlaps no exon gives no evidence of a UTR
        if cds_exon not in lookup:
            return False
        hit = list(lookup.find(cds_exon))[0]
        # overlapped exon is not the outermost one -> whole exons lie beyond
        if exons.index(hit) != terminal_loc:
            return True
        # outermost exon: UTR only if the exon extends past the CDS piece
        delta = np.subtract(cds_exon, hit)  # CDS minus exon at each position
        if compare_start:
            return bool(delta[0] > 0)
        return bool(delta[1] < 0)

    forward = strand == '+'
    UTRs = []
    for idx, transcript_cds in enumerate(cds):
        exons = mrna[idx]
        refInterlap = InterLap(exons)
        # the first CDS piece decides the 5 prime UTR; on '+' the start
        # coordinates are compared, otherwise the end coordinates
        Fiveprime = _has_utr(refInterlap, exons, transcript_cds[0], 0,
                             forward)
        # the last CDS piece decides the 3 prime UTR with the mirrored test
        Threeprime = _has_utr(refInterlap, exons, transcript_cds[-1],
                              len(exons) - 1, not forward)
        UTRs.append((Fiveprime, Threeprime))
    return UTRs
Ejemplo n.º 7
0
# Parallel label lists: br[k] comes from a "B" interval and rr[k] is the
# label chosen for it from the overlapping "R" intervals ("*" when none)
br = []
rr = []
bc = 0  # running count of intervals read from the B (bouke) grids
rc = 0  # running count of intervals read from the R (raoul) grids

for x in files:
    b = tgt.read_textgrid(mypath + x + "B.TextGrid").tiers[1]
    r = tgt.read_textgrid(mypath + x + "R.TextGrid").tiers[1]
    bc += len(b)
    rc += len(r)

    # Index every R interval so each B interval can be matched by overlap
    inter = InterLap()
    inter.add(list(map(convert_to_float, r)))

    # R intervals that were hit by at least one B interval in this file
    tot_overlaps = set()
    for i in b:
        interval = convert_to_float(i)
        overlaps = [tuple(hit) for hit in inter.find(interval)]
        tot_overlaps.update(overlaps)

        # third element is used as the label — presumably the annotation text
        if overlaps:
            rr.append(most_common([o[2] for o in overlaps]))
        else:
            rr.append("*")
        br.append(interval[2])

    #check for unmatched from Raoul
Ejemplo n.º 8
0
def _low_coverage_runs(window, start, end, cutoff):
    """Return (lo, hi) tuples for each run of `window` values below `cutoff`.

    `window` is the coverage slice `coverage_array[start - 1:end]` of one
    exon, so `window[0]` sits at 0-based position `start - 1`.
    """
    runs = []
    is_under = False
    if window[0] < cutoff:
        runs.append([start - 1])
        is_under = True
    # window[k] for k >= 1 is enumerated with pos == start - 1 + k
    for pos, depth in enumerate(window[1:], start=start):
        if depth < cutoff:
            if not is_under:
                is_under = True
                # coverage is in bed format, so pos - 1 is necessary, since
                # splitters are open on the left and right side
                runs.append([pos - 1])
        elif is_under:
            is_under = False
            runs[-1].append(pos)
    if is_under:
        # run reaches the end of the exon: close it there
        runs[-1].append(end)
    return [tuple(run) for run in runs]


def read_exons(gtf, chrom, cutoff, coverage_array, exclude):
    """Read protein-coding CDS/stop_codon features and their split regions.

    Parameters:
        gtf: a "|..." command string (a tabix query) yielding GTF lines.
        chrom: chromosome name passed to tabix for every file in `exclude`.
        cutoff: coverage value below which part of an exon becomes a
            splitter.
        coverage_array: array indexable by 0-based position; slices must
            support `.min()`.
        exclude: iterable of tabix-indexed files whose regions are also used
            to split exons (for example self-chains and segdups).

    Returns:
        (starts, ends, splits): three dicts keyed by (chrom, gene_name).
        `starts` and `ends` hold the start and end coordinates of the sorted
        BED-style exon intervals; `splits` holds the splitter intervals for
        keys that have any.
    """
    genes = defaultdict(IntervalSet)
    splitters = defaultdict(IntervalSet)
    split_iv = InterLap()

    # preempt any bugs by checking that we are getting a particular chrom
    assert gtf[0] == "|", (
        "expecting a tabix query so we can handle chroms correctly")

    for bed in exclude:
        # NOTE(review): `bed` and `chrom` are interpolated into a command
        # string — confirm they never come from untrusted input.
        query = "|tabix {bed} {chrom}".format(chrom=chrom, bed=bed)

        # any file that gets sent in will be used to split regions (just like
        # low-coverage). For example, we split on self-chains as well.
        # TODO: skip this loop if you don't want any filtering by self-chains
        # or segdups
        for toks in (x.strip().split("\t") for x in ts.nopen(query)):
            split_iv.add((int(toks[1]), int(toks[2])))

    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf)
                 if x[0] != "#"):
        # `!=` rather than `not in ("protein_coding")`: the parenthesised
        # string is not a tuple, so `in` would do a substring match
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding":
            continue
        start, end = map(int, toks[3:5])
        gene = toks[8].split('gene_name "')[1].split('"', 1)[0]
        assert start <= end, toks
        key = toks[0], gene

        # find sections of exon under certain coverage; the run detection is
        # skipped when not a single position is below the cutoff
        # TODO: skip this if we don't want coverage cutoff filtering
        window = coverage_array[start - 1:end]
        if window.min() < cutoff:
            splitters[key].add(_low_coverage_runs(window, start, end, cutoff))

        # excluded regions overlapping this exon also act as splitters
        for s, e in split_iv.find((start - 1, end)):
            splitters[key].add([(s, e)])

        # converts GTF exon coordinates to BED format (subtracts 1 from start)
        genes[key].add([(start - 1, end)])

    # sort by start so we can do binary search.
    genes = {k: sorted(v._vals) for k, v in genes.items()}
    splits, starts, ends = {}, {}, {}
    splitters = dict(splitters)
    for chrom_gene, sends in genes.items():
        starts[chrom_gene] = [s[0] for s in sends]
        ends[chrom_gene] = [s[1] for s in sends]
        if chrom_gene in splitters:
            splits[chrom_gene] = splitters[chrom_gene]._vals

    return starts, ends, splits