Ejemplo n.º 1
0
    def test_tree_pickle(self):
        """Round-trip an IntervalTree through dump()/load() and compare queries.

        The same set of intervals is inserted five times, the tree is dumped
        to a pickle file and loaded into a second tree, and every inserted
        interval is then looked up in both trees.  Both trees must return a
        non-empty result of the same size whose first and last hits (ordered
        by ``start``) agree.
        """
        import os
        import shutil
        import tempfile

        # Write the pickle into a throw-away directory so the test does not
        # leave 'a.pkl' behind in the current working directory.
        tmpdir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmpdir, True)
        pickle_path = os.path.join(tmpdir, 'a.pkl')

        by_start = operator.attrgetter('start')

        a = IntervalTree()
        # The outer loop deliberately repeats the insertions (five copies of
        # each interval end up in the tree); its index is not used.
        for _ in range(5):
            for i in range(10, 100, 6):
                a.insert(Interval(i - 4, i + 4))

        a.dump(pickle_path)

        b = IntervalTree()
        b.load(pickle_path)
        for _ in range(5):
            for i in range(10, 100, 6):
                f = Interval(i - 4, i + 4)
                af = sorted(a.find(f), key=by_start)
                bf = sorted(b.find(f), key=by_start)

                # A unittest assertion instead of a bare ``assert`` so the
                # check still runs under ``python -O``.
                self.assertGreater(len(bf), 0)
                self.assertEqual(len(af), len(bf))
                self.assertEqual(af[0].start, bf[0].start)
                self.assertEqual(af[-1].start, bf[-1].start)
Ejemplo n.º 2
0
def _create_intervaltree(locs):
    """Build an IntervalTree from the rows of ``locs``.

    Rows are visited in ``locs.iterrows()`` order and each row is unpacked
    as ``(start, end)``.  A row is added (with its index as the stored
    value) only when ``find(start, end)`` on the tree built so far returns
    nothing; otherwise the row is skipped, so earlier rows take precedence.

    Returns the populated tree.
    """
    tree = IntervalTree()
    for key, (begin, stop) in locs.iterrows():
        # Insert only when nothing already in the tree matches this range.
        if not tree.find(begin, stop):
            tree.add(begin, stop, key)
    return tree
Ejemplo n.º 3
0
 def __init__(self,
              content,
              doc_id=None,
              language=lng.ENGLISH,
              preprocessors=None):
     """Initialise the document from its text content.

     Args:
         content: The document text.  The parent initialiser receives the
             span ``0 .. len(content)`` of this raw (un-preprocessed) text.
         doc_id: Identifier for the document.  When omitted (``None``) a
             fresh ``rand_id(10)`` is generated for every instance.  The
             previous default, ``rand_id()``, was evaluated only once when
             the method was defined, so every document created without an
             explicit id shared the same identifier.
         language: Value stored under the ``LANGUAGE`` key of the document.
         preprocessors: Optional preprocessors; when truthy the stored
             content is ``preprocess(content, preprocessors)``, otherwise
             the content is stored unchanged.
     """
     # The instance itself is passed as the first positional argument to the
     # parent initialiser, exactly as in the original call.
     # NOTE(review): presumably the parent expects the owning document here
     # - confirm against the base class signature.
     super().__init__(self, 0, len(content))
     if preprocessors:
         self._content = preprocess(content, preprocessors)
     else:
         self._content = content
     self._annotations = IntervalTree()
     self._doc_id = rand_id(10) if doc_id is None else doc_id
     self._completed = {}
     self._next_id = 0
     self[LANGUAGE] = language
     self._aid_dict = {}
Ejemplo n.º 4
0
def main(argv):
    """Count, for each interval of a streamed BED file, the hits in a loaded one.

    ``argv[1]`` is read completely into one IntervalTree per chromosome
    (first column).  ``argv[2]`` is then streamed line by line and, for every
    interval, the number of entries returned by ``search`` on the matching
    tree is printed as a fourth tab-separated column (``0`` when the
    chromosome was never loaded).  Load and query timings go to stderr.

    The start coordinate is shifted by one both when loading and when
    querying.
    NOTE(review): presumably this converts BED's 0-based start to the
    convention the tree expects - confirm against the IntervalTree in use.

    Args:
        argv: Command-line style list; needs at least three elements.
            With fewer, a usage message is printed and the process exits
            with status 1.
    """
    if len(argv) < 3:
        print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
        sys.exit(1)

    bed = {}
    start = timer()
    with open(argv[1]) as fp:
        for line in fp:
            # Strip only the line terminator.  Slicing with [:-1] removed the
            # last digit of the end coordinate whenever the final line of the
            # file had no trailing newline.
            t = line.rstrip("\r\n").split("\t")
            if t == [""]:
                continue  # blank line, nothing to load
            if t[0] not in bed:
                bed[t[0]] = IntervalTree()
            bed[t[0]].add(int(t[1]) + 1, int(t[2]))
    sys.stderr.write("Read in {} sec\n".format(timer() - start))

    start = timer()
    with open(argv[2]) as fp:
        for line in fp:
            t = line.rstrip("\r\n").split("\t")
            if t == [""]:
                continue  # blank line, nothing to query
            if t[0] not in bed:
                print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
            else:
                r = bed[t[0]].search(int(t[1]) + 1, int(t[2]))
                print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], len(r)))
    sys.stderr.write("Query in {} sec\n".format(timer() - start))
Ejemplo n.º 5
0
 def add(self, chrom, start, end, val):
     """Store ``val`` for the range ``start``-``end`` on chromosome ``chrom``.

     Each chromosome has its own IntervalTree in ``self.chroms``; the tree
     is created the first time a chromosome is seen.
     """
     trees = self.chroms
     if chrom in trees:
         target = trees[chrom]
     else:
         # First interval for this chromosome: create and register its tree.
         target = trees[chrom] = IntervalTree()
     target.add(start, end, val)
Ejemplo n.º 6
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join two interval sets on overlap, yielding combined field lists.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is
    then streamed.  Items of either set that are not ``GenomicInterval``
    instances are not joined: those from ``rightSet`` are ignored and those
    from ``leftSet`` are yielded unchanged.

    For every left interval, each intersecting right interval whose overlap
    is at least ``mincols`` yields ``list(interval)`` followed by the right
    interval's fields.

    Args:
        leftSet: Iterable of left-hand items (streamed).
        rightSet: Iterable of right-hand items; must expose ``linenum``.
        mincols: Minimum overlap required for a pair to be reported.
        leftfill: When true, right intervals that never matched are yielded
            at the end, padded on the left with ``"."`` columns.
        rightfill: When true, left intervals without any qualifying match
            are yielded padded on the right with ``"."`` columns.

    Yields:
        Lists of fields (or the untouched non-interval items of ``leftSet``).
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        matched = False
        for item in result:
            # Chained comparisons replace ``x in range(start, end + 1)``,
            # which built a whole list per test on Python 2.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the
                # interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue
            matched = True
            outfields = list(interval)
            # ``map(outfields.append, ...)`` is lazy on Python 3 and appended
            # nothing; ``extend`` behaves the same on Python 2 and 3.
            outfields.extend(item.other)
            # Mark the tree node so the leftfill pass below can skip it.
            setattr(item, "visited", True)
            yield outfields
        if not matched and rightfill:
            outfields = list(interval)
            outfields.extend(["."] * rightlen)
            yield outfields

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
Ejemplo n.º 7
0
 def setUp(self):
     """Create ``self.tree4`` holding two intervals with identical bounds.

     Both intervals span 22-33 and differ only in their ``data`` payload
     ('example1' and 'example2').
     """
     self.tree4 = tree = IntervalTree()
     for payload in ('example1', 'example2'):
         tree.insert(Interval(22, 33, data=payload))
Ejemplo n.º 8
0
 def setUp(self):
     """Create a new, empty IntervalTree and store it as ``self.tree``."""
     self.tree = IntervalTree()
Ejemplo n.º 9
0
                         "Start": np.int32,
                         "End": np.int32
                     })

# Load the background intervals from the headerless, tab-separated file `b`:
# only columns 1 and 2 are read, named "Start"/"End" and parsed as int32.
background = pd.read_table(b,
                           sep="\t",
                           usecols=[1, 2],
                           header=None,
                           names="Start End".split(),
                           engine="c",
                           dtype={
                               "Start": np.int32,
                               "End": np.int32
                           })

# Build a single tree from every background interval (tree construction is
# not part of the timed section below).
tree = IntervalTree()
for start_, end_ in zip(background.Start, background.End):
    tree.add(start_, end_)

# Timed section: one tree.search() per interval of `chip` (defined earlier in
# the script), keeping every result so the work cannot be skipped.
start = time()
results = []
for start_, end_ in zip(chip.Start, chip.End):
    results.append(tree.search(start_, end_))
end = time()

# print(result)

# Elapsed wall-clock time of the query loop.
total = end - start

# NOTE(review): `total` is a duration, not an epoch timestamp;
# fromtimestamp() interprets it as seconds since the epoch in local time, so
# the resulting datetime depends on the machine's timezone - confirm this is
# only used for formatting.
total_dt = datetime.datetime.fromtimestamp(total)
Ejemplo n.º 10
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=None):
    """Join two interval sets on overlap and strand, yielding selected columns.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is
    then streamed.  Items that are not ``GenomicInterval`` instances are
    ignored when they come from ``rightSet`` and yielded unchanged when
    they come from ``leftSet``.

    Args:
        leftSet: Iterable of left-hand items (streamed).
        rightSet: Iterable of right-hand items; must expose ``linenum``.
        mincols: Minimum overlap.  With ``asfraction`` it is a factor applied
            to the shorter of the two intervals (result floored).
        leftfill: When true, right intervals that never matched are emitted
            at the end via ``getSelectedColumns(leftlen, item.other, ...)``.
        rightfill: When true, left intervals with no accepted match are
            emitted via ``getSelectedColumns(interval.fields, rightlen, ...)``.
        asfraction: Interpret ``mincols`` as a fraction (see above).  Only a
            value equal to ``True`` enables this, as in the original code.
        matchStrand: Strand requirement; positive values reject complementary
            pairs, negative values reject same-strand pairs, and values
            outside ``[-1, 1]`` additionally reject pairs whose strand
            product is 0.
        outColumns: Passed through to ``getSelectedColumns``.  Defaults to a
            fresh ``[-1, -1]`` list on every call (previously one mutable
            list shared between calls).

    Yields:
        Whatever ``getSelectedColumns`` returns for each reported row, plus
        the untouched non-interval items of ``leftSet``.
    """
    if outColumns is None:
        outColumns = [-1, -1]

    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    # Kept as an equality test so only values equal to True enable the mode.
    use_fraction = asfraction == True  # noqa: E712
    rightTree = IntervalTree()

    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        matched = False
        leftbases = interval.end - interval.start
        for item in result:
            # Per-pair threshold; a local is used instead of overwriting the
            # ``mincols`` parameter on every iteration.
            required = minoverlap
            if use_fraction:
                rightbases = item.end - item.start
                required = math.floor(min(rightbases, leftbases) * minoverlap)

            # Chained comparisons replace ``x in range(start, end + 1)``,
            # which built a whole list per test on Python 2.
            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the
                # interval range
                overlap = interval.end - interval.start
            if overlap < required:
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[rightStrandCol]]
            if strandMatched == -1 and matchStrand > 0:
                # needed match but found a complement
                continue
            if strandMatched == 1 and matchStrand < 0:
                # needed complement but found a match
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                # strict criteria but only permissive match found
                continue

            # strand criteria met
            matched = True
            # Mark the tree node so the leftfill pass below can skip it.
            setattr(item, "visited", True)
            yield getSelectedColumns(interval.fields, item.other, outColumns)
        if not matched and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
Ejemplo n.º 11
0
def createIntervalTrees():
    '''
    Create one interval tree from ``options.fragmentFile`` for quick lookups.

    The file is read as tab-separated lines (chromosome, start, end in the
    first three columns); empty lines and lines starting with "Genome" or
    "Chromosome" are skipped.  Consecutive lines of one chromosome are merged
    into a single fragment until ``options.fragmentAggregation`` lines have
    been collected or the chromosome changes.  Lines that raise while being
    processed are skipped (and reported when ``options.verbose`` is set).

    Returns
        [fragmentsMap, intersect_tree] where
        fragmentsMap[fragmentId] = (chrom, fragmentMidPoint)
        intersect_tree - interval tree holding one Interval per fragment,
        inserted with its fragmentId
    '''

    if options.verbose:
        sys.stdout.write(
            "- %s START   : populate intervaltree from fragmented genome\n" %
            timeStamp())

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}

    def store_fragment(fragment_id, frag_chrom, frag_start, frag_end):
        # Register one finished fragment in the tree and the lookup map and
        # return the id to use for the next fragment.
        intersect_tree.insert(Interval(frag_chrom, frag_start, frag_end),
                              fragment_id)
        fragmentsMap[fragment_id] = (frag_chrom,
                                     int(0.5 * (frag_start + frag_end)))
        if options.vverbose:
            sys.stdout.write("-- intervaltree.add %s:%d-%d\n" %
                             (frag_chrom, frag_start, frag_end))
        return fragment_id + 1

    start = 0
    end = 0
    counter = 0  # number of input lines merged into the current fragment
    chrom = ""

    for line in fileinput.input([options.fragmentFile]):
        line = line.strip()
        if not line or line.startswith(("Genome", "Chromosome")):
            continue

        cols = line.split("\t")
        try:
            if cols[0] != chrom:
                # chromosome changed: finish the previous one, if any
                if end > 0:
                    fragmentsCount = store_fragment(fragmentsCount, chrom,
                                                    start, end)
                chrom = cols[0]
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
            elif counter >= options.fragmentAggregation:
                # fragment aggregation is fulfilled: emit and start anew
                fragmentsCount = store_fragment(fragmentsCount, chrom, start,
                                                end)
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
            else:
                # still aggregating: just extend the current fragment
                end = int(cols[2])

            counter += 1

        except Exception:
            # Best-effort parsing: a bad line is skipped, but (unlike a bare
            # ``except:``) KeyboardInterrupt and SystemExit still propagate.
            if options.verbose:
                sys.stderr.write(
                    'skipping line in options.fragmentFile: %s\n' % line)
            if options.vverbose:
                traceback.print_exc()

    # handle last fragment
    if end > 0:
        fragmentsCount = store_fragment(fragmentsCount, chrom, start, end)

    if options.verbose:
        sys.stdout.write("- %s FINISHED: intervaltree populated\n" %
                         timeStamp())

    return [fragmentsMap, intersect_tree]
Ejemplo n.º 12
0
def getpairs(leftSet,
             rightSet,
             leftCol,
             mincols=1,
             asfraction=False,
             matchStrand=STRAND_NEUTRAL,
             skipChrNames=True,
             skipStrandNames=True):
    """Yield ``[word, leftTerm]`` pairs for overlapping left/right intervals.

    ``leftSet`` is read completely into an IntervalTree; ``rightSet`` is then
    streamed.  Items whose type is not exactly ``GenomicInterval`` are
    ignored in ``leftSet`` and yielded unchanged from ``rightSet``.

    For every right interval and every intersecting left interval that meets
    the overlap and strand criteria, each remaining column of the right
    interval is split into tokens and every token that is not ``"."`` and
    does not parse as a float is yielded together with the left interval's
    ``leftCol`` field.

    Args:
        leftSet: Iterable of left-hand items; must expose ``linenum``.
        rightSet: Iterable of right-hand items (streamed).
        leftCol: Index into the left interval's fields used as ``leftTerm``.
        mincols: Minimum overlap.  With ``asfraction`` it is a factor applied
            to the shorter of the two intervals (result floored).
        asfraction: Interpret ``mincols`` as a fraction.  Only a value equal
            to ``True`` enables this, as in the original code.
        matchStrand: Strand requirement; positive values reject complementary
            pairs, negative values reject same-strand pairs, and values
            outside ``[-1, 1]`` additionally reject pairs whose strand
            product is 0.
        skipChrNames: Exclude the chromosome column from tokenisation.
        skipStrandNames: Exclude the strand column from tokenisation.

    Yields:
        ``[word, leftTerm]`` lists, plus untouched non-interval items of
        ``rightSet``.
    """
    # NOTE(review): the exact-type checks below (rather than isinstance) are
    # kept from the original; subclasses of GenomicInterval are therefore
    # treated as non-intervals - confirm this is intended.

    # Read leftSet into memory:
    rightlen = 0
    leftStrandCol = -1
    minoverlap = mincols
    # Kept as an equality test so only values equal to True enable the mode.
    use_fraction = asfraction == True  # noqa: E712
    leftTree = IntervalTree()
    rightCols = []
    for item in leftSet:
        if type(item) is GenomicInterval:
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    for interval in rightSet:
        is_interval = type(interval) is GenomicInterval
        if rightlen == 0 and is_interval:
            rightlen = interval.nfields
            # ``range`` has no ``remove`` on Python 3, so build a real list.
            rightCols = list(range(rightlen))
            # remove the useless columns
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)
        if not is_interval:
            yield interval
            continue

        result = []
        leftTree.intersect(interval, result.append)
        rightbases = interval.end - interval.start
        for item in result:
            # Per-pair threshold; a local is used instead of overwriting the
            # ``mincols`` parameter on every iteration.
            required = minoverlap
            if use_fraction:
                leftbases = item.end - item.start
                required = math.floor(min(leftbases, rightbases) * minoverlap)

            start_inside = interval.start <= item.start <= interval.end
            end_inside = interval.start <= item.end <= interval.end
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and item.start < interval.start:
                # The original second alternative here
                # (``item.end > interval.end``) can never hold once the end
                # is inside the interval, so only this test remains.
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the
                # interval range
                overlap = interval.end - interval.start
            if overlap < required:
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[
                interval.strand] * STRAND_INTEGER_VALUES[
                    item.other[leftStrandCol]]
            if strandMatched == -1 and matchStrand > 0:
                # needed match but found a complement
                continue
            if strandMatched == 1 and matchStrand < 0:
                # needed complement but found a match
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                # strict criteria but only permissive match found
                continue

            # strand criteria met
            setattr(item, "visited", True)
            leftTerm = item.other[leftCol]
            for col in rightCols:
                # take each field that's not a number, split it into words
                # and output each word with the leftTerm as being associated
                lexer = shlex.shlex(interval.fields[col])
                # Same characters as the original '\t\r\n\,\;' literal (tab,
                # CR, LF, backslash, comma, semicolon), written with valid
                # escapes so newer Pythons do not warn about it.
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'

                # A separate name keeps the outer ``item`` from being
                # shadowed by the tokens.
                for token in lexer:
                    token = token.strip()
                    if token == ".":
                        continue
                    try:
                        float(token)
                    except ValueError:
                        yield [token, leftTerm]
Ejemplo n.º 13
0
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from itertools import groupby
from operator import itemgetter
import numpy as np

# Create a list of tuple intervals for each ccr, pli, and missense z interval,
# plus a tree of those intervals, then search each tree with both lists.

# Input files are taken from the first three command-line arguments.
# NOTE(review): the handles are opened in binary mode and the path hints end
# in .gz, but plain open() does not decompress; on Python 3 each line would
# also be bytes, and bytes.split("\t") raises TypeError - confirm the
# intended interpreter and whether the inputs are really uncompressed.
ccrs = open(sys.argv[1], "rb")  #exacresiduals/gnomad10x.5syn-ccrs.bed.gz
mpcs = open(sys.argv[2], "rb")  #essentials/mpc.regions.clean.sorted.bed.gz
plis = open(sys.argv[3],
            "rb")  #$HOME/software/pathoscore/score-sets/GRCh37/pLI/pLI.bed.gz

# generate data: we want to search the trees with the lists to get the numbers in each venn diagram
# Trees and lists are created lazily on first access of a key.
ccrtree = defaultdict(lambda: IntervalTree())
ccrlist = defaultdict(list)
# Records are sorted and grouped on the same three columns (0, 3 and 6);
# groupby requires its input to be pre-sorted by the grouping key.
sorter = itemgetter(0, 3, 6)
grouper = itemgetter(0, 3, 6)
ccrtemp = []
ccrgenes = set()
pligenes = set()
mpcgenes = set()
ccr99genes = set()
# Keep only ccr records whose last column is at least 95.
for ccr in ccrs:
    ccr = ccr.strip().split("\t")
    if float(ccr[-1]) < 95: continue
    ccrtemp.append(ccr)
for key, grp in groupby(sorted(ccrtemp, key=sorter), grouper):
    grp = list(grp)
    chrom = grp[0][0]
Ejemplo n.º 14
0
def createIntervalTreesFragmentFile(options):
    '''
    Create one interval tree from ``options.genomeFragmentFile`` for lookups.

    The file is read as tab-separated lines (chromosome, start, end in the
    first three columns); empty lines and lines starting with "Genome" or
    "Chromosome" are skipped.  Consecutive lines of one chromosome are merged
    into a single fragment until ``options.fragmentAggregation`` lines have
    been collected or the chromosome changes.  When ``options.chromPattern``
    is non-empty, chromosomes whose name does not ``re.match`` it are left
    out.  A line that raises while being processed is skipped; with
    ``options.vverbose`` set the traceback is printed and the process exits
    with status 1 instead.

    Returns
        [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom] where
        fragmentsMap[fragmentId] = (chrom, start, end)
        intersect_tree - interval tree holding one Interval per fragment,
        inserted with its fragmentId
        fragmentsCount - number of fragments stored
        fragmentsChrom[chrom] = (first fragmentId, one past the last
        fragmentId) recorded for that chromosome
    '''

    if options.verbose:
        sys.stdout.write(
            "- %s START   : populate intervaltree from fragmented genome\n" %
            timeStamp())

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {}  # lookup table for fragment ranges of a chromosome
    fragmentsStart = 0

    def store_fragment(fragment_id, frag_chrom, frag_start, frag_end):
        # Register one finished fragment in the tree and the lookup map and
        # return the id to use for the next fragment.
        intersect_tree.insert(Interval(frag_chrom, frag_start, frag_end),
                              fragment_id)
        fragmentsMap[fragment_id] = (frag_chrom, frag_start, frag_end)
        if options.vverbose:
            sys.stdout.write("-- intervaltree.add %s:%d-%d\n" %
                             (frag_chrom, frag_start, frag_end))
        return fragment_id + 1

    start = 0
    end = 0
    counter = 0  # number of input lines merged into the current fragment
    chrom = ""

    for line in fileinput.input([options.genomeFragmentFile]):
        line = line.strip()
        if not line or line.startswith(("Genome", "Chromosome")):
            continue

        cols = line.split("\t")
        try:
            if cols[0] != chrom:
                # chromosome changed: finish the previous one, if any
                if end > 0:
                    fragmentsCount = store_fragment(fragmentsCount, chrom,
                                                    start, end)
                    fragmentsChrom[chrom] = (fragmentsStart, fragmentsCount)
                    fragmentsStart = fragmentsCount

                # check if chromosome needs to be filtered out or not
                if (options.chromPattern != ""
                        and not re.match(options.chromPattern, cols[0])):
                    # Reset the state so nothing is stored for this line;
                    # ``chrom`` stays empty, so following lines take this
                    # branch again.
                    chrom = ""
                    start = 0
                    end = 0
                else:
                    chrom = cols[0]
                    start = int(cols[1])
                    end = int(cols[2])
                counter = 0
            elif counter >= options.fragmentAggregation:
                # fragment aggregation is fulfilled: emit and start anew
                fragmentsCount = store_fragment(fragmentsCount, chrom, start,
                                                end)
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
            else:
                # still aggregating: just extend the current fragment
                end = int(cols[2])

            counter += 1

        except Exception:
            # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
            # raised inside the try block still propagate.
            if options.verbose:
                sys.stderr.write(
                    'skipping line in options.genomeFragmentFile: %s\n' %
                    line)
            if options.vverbose:
                traceback.print_exc()
                sys.exit(1)

    # handle last fragment
    if end > 0:
        fragmentsCount = store_fragment(fragmentsCount, chrom, start, end)
        fragmentsChrom[chrom] = (fragmentsStart, fragmentsCount)

    if options.verbose:
        sys.stdout.write("- %s FINISHED: intervaltree populated\n" %
                         timeStamp())

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]