return max(groups, key=_auxfun)[0]


print(files)
# Parallel result lists: for each speaker-B interval that overlaps speaker-R
# intervals, br holds B's label and rr holds the most common overlapping R label.
br = []
rr = []
bc = 0  #count bouke
rc = 0  #count raoul

for x in files:
    inter = InterLap()
    # Tier index 1 of each TextGrid pair; "B"/"R" suffixes select the speaker.
    b = tgt.read_textgrid(mypath + x + "B.TextGrid").tiers[1]
    r = tgt.read_textgrid(mypath + x + "R.TextGrid").tiers[1]
    bc += len(b)
    rc += len(r)
    # Index all of R's intervals once so each B interval can be queried against them.
    # NOTE(review): convert_to_float presumably yields (start, end, label) — TODO confirm.
    inter.add([convert_to_float(i) for i in r])
    tot_overlaps = set()
    for i in b:
        interval = convert_to_float(i)
        overlaps = list(inter.find(interval))

        #print(interval[2])

        if (len(overlaps) > 0):
            # Tuples are hashable, so they can go into the tot_overlaps set.
            # NOTE(review): in Python 2 this comprehension's `x` leaks and clobbers
            # the outer loop variable; harmless here since `x` is rebound by the
            # `for x in files` loop before its next use.
            overlaps = [tuple(x) for x in overlaps]
            for o in overlaps:
                tot_overlaps.add(o)

            # Pair B's label with the dominant label among R's overlapping intervals.
            rr.append(most_common([o[2] for o in overlaps]))
            br.append(interval[2])
        # NOTE(review): fragment is truncated here — the else-branch body is missing.
        else:
Beispiel #2
0
def read_exons(gtf, chrom, cutoff, coverage_array, exclude):
    """Collect protein-coding CDS/stop_codon intervals for one chromosome.

    Reads exon records from a tabix GTF query, records "splitter" intervals
    (low-coverage exon sections plus anything in the `exclude` BED files),
    and returns per-gene sorted interval lists in BED (0-based, half-open)
    coordinates.

    Parameters
    ----------
    gtf : str
        A shell pipe command ("|tabix ... <chrom>") restricted to one
        chromosome; must start with "|" (asserted below).
    chrom : str
        Chromosome name used to query each BED file in `exclude`.
    cutoff : float
        Coverage threshold; exon sections with per-base coverage below this
        become splitters (removed from CCRs).
    coverage_array : array-like
        Per-base coverage for `chrom`; must support slicing and `.min()`
        (e.g. a numpy array). Assumed 0-based — TODO confirm against caller.
    exclude : iterable of str
        Paths to tabix-indexed BED files (e.g. self-chains, segdups) whose
        intervals also split exons.

    Returns
    -------
    (starts, ends, splits) : tuple of dicts keyed by (chrom, gene_name)
        `starts`/`ends` are parallel lists sorted by start (for binary
        search); `splits` holds merged splitter intervals, present only for
        keys that had any.
    """
    genes = defaultdict(IntervalSet)
    splitters = defaultdict(IntervalSet)

    split_iv = InterLap()
    # preempt any bugs by checking that we are getting a particular chrom
    assert gtf[0] == "|", (
        "expecting a tabix query so we can handle chroms correctly")
    #f1 = open("selfchaincut.txt","a")
    #f2 = open("segdupscut.txt","a")
    #f3 = open("coveragecut.txt","a")
    for bed in exclude:
        # expecting a tabix query so we can handle chroms correctly
        a = "|tabix {bed} {chrom}".format(chrom=chrom, bed=bed)

        # any file that gets sent in will be used to split regions (just like
        # low-coverage). For example, we split on self-chains as well.
        #TODO: comment this block if you don't want any filtering by self-chains or segdups
        for toks in (
                x.strip().split("\t") for x in ts.nopen(a)
        ):  # adds self chains and segdups to splitters list, so that exons can be split, and they are removed from CCRs
            s, e = int(toks[1]), int(toks[2])
            split_iv.add((s, e))
            #if len(toks) > 3:
            #    f1.write("\t".join(toks)+"\n") # self chain
            #else:
            #    f2.write("\t".join(toks)+"\n") # segdups

    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf)
                 if x[0] != "#"):
        # Keep only protein-coding CDS/stop_codon records.
        # BUGFIX: the biotype must be compared against a 1-tuple — a bare
        # ("protein_coding") is just a string, and `not in` on a string is a
        # substring test, so biotypes like "protein" would wrongly pass.
        if toks[2] not in ("CDS", "stop_codon") or toks[1] not in (
                "protein_coding", ):
            continue
        #if toks[0] != "1": break
        start, end = map(int, toks[3:5])
        gene = toks[8].split('gene_name "')[1].split('"', 1)[0]
        assert start <= end, toks
        key = toks[0], gene

        #cutoff = 0.3

        # find sections of exon under certain coverage.
        #TODO: comment this if we don't want coverage cutoff filtering
        if coverage_array[start - 1:end].min(
        ) < cutoff:  # doesn't bother to run these operations if there is not one bp below the cutoff
            #splitters[key].add([(start - 1, end)]) #this takes out the whole exon for one section of poor coverage
            a = coverage_array[start - 1:end]
            #print str(start-1),end,a
            is_under, locs = False, []  # generates "locs" for each exon"
            if a[0] < cutoff:
                locs.append([start - 1])
                is_under = True  # so you can initialize is_under
            for pos, v in enumerate(
                    a[1:], start=start
            ):  #enumerates positions in the coverage array starting at the beginning of the exon
                if v < cutoff:
                    if not is_under:
                        is_under = True
                        locs.append(
                            [pos - 1]
                        )  #start, coverage is in bed format, so pos-1 is necessary, since splitters are open left and right side
                else:
                    if is_under:
                        is_under = False
                        locs[-1].append(pos)  #end
            if is_under:
                locs[-1].append(
                    end
                )  # in this case would end splitter at the end of the exon
            # list(...) keeps this correct under Python 3's lazy map as well.
            splitters[key].add(list(map(tuple, locs)))
            #for i in locs:
            #    f3.write(chrom+"\t"+"\t".join(map(str,i))+"\n")

        # Any excluded-region interval overlapping this exon also splits it.
        for s, e in split_iv.find((start - 1, end)):
            splitters[key].add([(s, e)])

        genes[key].add(
            [(start - 1, end)]
        )  # converts GTF exon coordinates to BED format (subtracts 1 from exon start)
    # sort by start so we can do binary search.
    genes = dict((k, sorted(v._vals)) for k, v in genes.iteritems())
    #ends = dict((k, sorted(v)) for k, v in ends.iteritems())
    splits, starts, ends = {}, {}, {}
    splitters = dict(splitters)
    for chrom_gene, sends in genes.iteritems():
        starts[chrom_gene] = [s[0] for s in sends]
        ends[chrom_gene] = [s[1] for s in sends]
        if chrom_gene in splitters:
            splits[chrom_gene] = splitters[chrom_gene]._vals

    return starts, ends, splits