Ejemplo n.º 1
0
class TestIssue9(unittest.TestCase):
    """Exercise ``left``/``right`` on a tree holding two entries with equal bounds."""

    def setUp(self):
        # Both entries span (22, 33); only the attached data differs.
        tree = IntervalTree()
        for label in ('example1', 'example2'):
            tree.insert(Interval(22, 33, data=label))
        self.tree4 = tree

    def test_right(self):
        """``right`` of (44, 55) yields nothing; ``right`` of (11, 12) yields both entries."""
        found = self.tree4.right(Interval(44, 55))
        self.assertEqual(0, len(found))
        found = self.tree4.right(Interval(11, 12))
        self.assertEqual(2, len(found))

    def test_left(self):
        """``left`` of (44, 55) yields both entries; ``left`` of (11, 12) yields nothing."""
        found = self.tree4.left(Interval(44, 55))
        self.assertEqual(2, len(found))
        found = self.tree4.left(Interval(11, 12))
        self.assertEqual(0, len(found))
Ejemplo n.º 2
0
class TestIssue9(unittest.TestCase):
    """Check neighbour queries when the tree stores two intervals with the same bounds."""

    def setUp(self):
        # Populate the fixture with two (22, 33) intervals carrying different data.
        fixture = IntervalTree()
        for payload in ('example1', 'example2'):
            fixture.insert(Interval(22, 33, data=payload))
        self.tree4 = fixture

    def test_right(self):
        """Expected hit counts for ``right``: 0 for (44, 55), 2 for (11, 12)."""
        hits = self.tree4.right(Interval(44, 55))
        self.assertEqual(0, len(hits))
        hits = self.tree4.right(Interval(11, 12))
        self.assertEqual(2, len(hits))

    def test_left(self):
        """Expected hit counts for ``left``: 2 for (44, 55), 0 for (11, 12)."""
        hits = self.tree4.left(Interval(44, 55))
        self.assertEqual(2, len(hits))
        hits = self.tree4.left(Interval(11, 12))
        self.assertEqual(0, len(hits))
Ejemplo n.º 3
0
def _create_intervaltree(locs):
    """Build an IntervalTree from the rows of ``locs``.

    ``locs.iterrows()`` must yield ``(key, (start, end))`` pairs
    (presumably a two-column pandas DataFrame -- confirm against the caller).
    A row is stored, with its key as payload, only when ``find(start, end)``
    on the tree built so far returns nothing truthy; otherwise it is skipped.

    Returns the populated tree.
    """
    tree = IntervalTree()
    for key, (start, end) in locs.iterrows():
        # Earlier rows win: a row is kept only if its range finds nothing yet.
        if not tree.find(start, end):
            tree.add(start, end, key)
    return tree
Ejemplo n.º 4
0
 def __init__(self,
              content,
              doc_id=None,
              language=lng.ENGLISH,
              preprocessors=None):
     """Create a document over ``content``.

     :param content: the document text
     :param doc_id: identifier of the document; when ``None`` (the default)
         a fresh ``rand_id(10)`` is generated for this instance. The default
         used to be ``rand_id()``, which Python evaluates once at function
         definition time, so all default-constructed documents shared one id.
     :param language: value stored under the ``LANGUAGE`` attribute
     :param preprocessors: when truthy, ``content`` is run through
         ``preprocess(content, preprocessors)`` before being stored
     """
     # NOTE(review): the span end uses the length of the raw content, taken
     # before any preprocessing -- confirm this is intended.
     super().__init__(self, 0, len(content))
     if preprocessors:
         self._content = preprocess(content, preprocessors)
     else:
         self._content = content
     self._annotations = IntervalTree()
     self._doc_id = rand_id(10) if doc_id is None else doc_id
     self._completed = {}
     self._next_id = 0
     self[LANGUAGE] = language
     self._aid_dict = {}
Ejemplo n.º 5
0
def createIntervalTreesFragmentResolution():
    '''
    creates one interval tree for quick lookups by cutting every selected
    chromosome into consecutive bins of options.resolution bases

    reads the module-level ``options`` object:
        chromSizes   - tab-separated file, column 0 = chromosome name,
                       column 1 = chromosome length
        resolution   - bin width
        chromPattern - regular expression the whole chromosome name has to
                       match; an empty string disables the filter
        verbose / vverbose - progress output switches

    returns a list of four items
        fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
        intersect_tree - intersect Tree for interval matching
        fragmentsCount - number of fragments created
        fragmentsChrom[chrom] = tuple(first fragmentId, last fragmentId + 1)
    '''

    # print(...) with one pre-formatted argument produces the same output on
    # Python 2 (print statement) and Python 3 (print function).
    if options.verbose:
        print("- %s START   : populate intervaltree with given resolution for chromosomes matching pattern %s" % (timeStamp(), options.chromPattern))

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {}  # lookup table for fragment ranges of a chromosome

    for line in fileinput.input([options.chromSizes]):
        cols = line.split("\t")
        chrom = cols[0]
        # check if chromosome needs to be filtered out or not
        if options.chromPattern != "" and not re.match("^" + options.chromPattern + "$", chrom):
            # skip this one
            if options.vverbose:
                print("skipping pattern %s" % (line))
            continue

        fragmentsStart = fragmentsCount
        chromlen = int(cols[1])

        for start in range(0, chromlen, options.resolution):
            # the last bin of a chromosome is clipped to the chromosome length
            end = min(start + options.resolution, chromlen)
            intersect_tree.insert(Interval(chrom, start, end), fragmentsCount)
            fragmentsMap[fragmentsCount] = (chrom, int(0.5 * (start + end)))
            fragmentsCount += 1
            if options.vverbose:
                print("-- intervaltree.add %s:%d-%d" % (chrom, start, end))

        fragmentsChrom[chrom] = (fragmentsStart, fragmentsCount)

    if options.verbose:
        print("- %s FINISHED: intervaltree populated" % (timeStamp()))

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
Ejemplo n.º 6
0
    def test_tree_pickle(self):
        """Dump a populated tree to 'a.pkl', load it into a new tree and compare lookups."""
        by_start = operator.attrgetter('start')

        def probes():
            # The same features are used to fill the tree and to query it:
            # 5 chromosome ids, windows of +/-4 around 10, 16, ..., 94.
            for ichr in range(5):
                for centre in range(10, 100, 6):
                    yield Feature(centre - 4, centre + 4, strand=1, chr=ichr)

        original = IntervalTree()
        for feature in probes():
            original.insert(feature)

        original.dump('a.pkl')

        restored = IntervalTree()
        restored.load('a.pkl')

        for feature in probes():
            expected = sorted(original.find(feature), key=by_start)
            actual = sorted(restored.find(feature), key=by_start)

            # The restored tree must find something, and agree with the
            # original on hit count and on the first/last start position.
            assert len(actual) > 0
            self.assertEqual(len(expected), len(actual))
            self.assertEqual(expected[0].start, actual[0].start)
            self.assertEqual(expected[-1].start, actual[-1].start)
Ejemplo n.º 7
0
class EmptyTreeTestCase(unittest.TestCase):
    """Smoke tests: each query method is called on a tree with nothing inserted.

    None of the tests asserts on the result; a test passes when the call
    completes without raising.
    """

    def setUp(self):
        self.tree = IntervalTree()

    def test_search(self):
        tree = self.tree
        tree.search(46, 47)

    def test_find(self):
        tree = self.tree
        tree.find(Interval(46, 47))

    def test_left(self):
        tree = self.tree
        tree.left(Interval(46, 47))

    def test_right(self):
        tree = self.tree
        tree.right(Interval(46, 47))
Ejemplo n.º 8
0
class EmptyTreeTestCase(unittest.TestCase):
    """Call every lookup method on an empty tree.

    The tests carry no assertions: they only require that the call returns
    instead of raising.
    """

    def setUp(self):
        self.tree = IntervalTree()

    def test_search(self):
        empty = self.tree
        empty.search(46, 47)

    def test_find(self):
        empty = self.tree
        empty.find(Interval(46, 47))

    def test_left(self):
        empty = self.tree
        empty.left(Interval(46, 47))

    def test_right(self):
        empty = self.tree
        empty.right(Interval(46, 47))
Ejemplo n.º 9
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join two interval sets on overlap; a generator of output rows.

    All ``GenomicInterval`` items of ``rightSet`` are loaded into an
    ``IntervalTree`` first. ``leftSet`` is then streamed:

    * items that are not ``GenomicInterval`` instances are yielded unchanged;
    * for every tree hit whose overlap with the left interval is at least
      ``mincols``, a list made of the left interval's fields followed by the
      hit's ``other`` fields is yielded and the hit is marked ``visited``;
    * when a left interval has no accepted hit and ``rightfill`` is true, its
      fields padded with one "." per right-hand column are yielded.

    Finally, when ``leftfill`` is true, every tree node never marked
    ``visited`` is yielded with one "." per left-hand column in front.

    :param leftSet: iterable of the streamed (left) items
    :param rightSet: iterable of the indexed (right) items; its ``linenum``
        attribute is read while iterating
    :param mincols: minimum overlap required to accept a hit
    :param leftfill: also emit right items that were never joined
    :param rightfill: also emit left intervals that were never joined
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        result = []
        rightTree.intersect(interval, lambda node: result.append(node))
        overlap_not_met = 0
        for item in result:
            # Positions covered by the left interval, both ends included.
            # Built once per hit instead of once per membership test.
            span = range(interval.start, interval.end + 1)
            start_inside = item.start in span
            end_inside = item.end in span
            if start_inside and not end_inside:
                overlap = interval.end - item.start
            elif end_inside and not start_inside:
                overlap = item.end - interval.start
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:  # the intersecting item's start and end are outside the interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                overlap_not_met += 1
                continue
            outfields = list(interval)
            # extend() instead of map(outfields.append, ...): map is lazy on
            # Python 3, so the appends would never run there.
            outfields.extend(item.other)
            item.visited = True
            yield outfields
        if (not result or overlap_not_met == len(result)) and rightfill:
            outfields = list(interval)
            outfields.extend(["."] * rightlen)
            yield outfields

    if leftfill:
        def report_unvisited(node, results):
            if not hasattr(node, "visited"):
                results.append(node)
        results = []
        rightTree.traverse(lambda x: report_unvisited(x, results))
        for item in results:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
Ejemplo n.º 10
0
def main(argv):
    """Count, for each record of a streamed BED file, the hits in a loaded BED file.

    ``argv[1]`` is read completely into one ``IntervalTree`` per chromosome
    (column 0); ``argv[2]`` is then read line by line and, for each record,
    the chromosome, start, end and the number of results returned by the
    tree search are printed tab-separated (0 for chromosomes that were never
    loaded). Load and query times are written to stderr.

    Exits with status 1 after printing a usage line when fewer than two file
    arguments are given.
    """
    if len(argv) < 3:
        print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
        sys.exit(1)

    bed = {}
    start = timer()
    with open(argv[1]) as fp:
        for line in fp:
            # rstrip("\n") rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            line = line.rstrip("\n")
            if not line:
                continue  # blank lines carry no record
            t = line.split("\t")
            if t[0] not in bed:
                bed[t[0]] = IntervalTree()
            # start is shifted by one before indexing (presumably BED 0-based
            # start -> 1-based closed interval; confirm against the tree API)
            bed[t[0]].add(int(t[1]) + 1, int(t[2]))
    sys.stderr.write("Read in {} sec\n".format(timer() - start))
    start = timer()
    with open(argv[2]) as fp:
        for line in fp:
            line = line.rstrip("\n")
            if not line:
                continue
            t = line.split("\t")
            if t[0] not in bed:
                print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
            else:
                r = bed[t[0]].search(int(t[1]) + 1, int(t[2]))
                print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], len(r)))
    sys.stderr.write("Query in {} sec\n".format(timer() - start))
Ejemplo n.º 11
0
 def setUp(self):
     """Give each test its own IntervalTree with nothing inserted."""
     self.tree = IntervalTree()
Ejemplo n.º 12
0
    def test_tree_pickle(self):
        """Round-trip a tree through 'a.pkl' and check that lookups still agree."""
        start_of = operator.attrgetter('start')
        # Windows of +/-4 around 10, 16, ..., 94.
        bounds = [(mid - 4, mid + 4) for mid in range(10, 100, 6)]

        source = IntervalTree()
        # Every window is inserted five times (once per outer pass).
        for _ in range(5):
            for lo, hi in bounds:
                source.insert(Interval(lo, hi))

        source.dump('a.pkl')

        reloaded = IntervalTree()
        reloaded.load('a.pkl')

        for _ in range(5):
            for lo, hi in bounds:
                query = Interval(lo, hi)
                want = sorted(source.find(query), key=start_of)
                got = sorted(reloaded.find(query), key=start_of)

                # Same hit count and same first/last start on both trees.
                assert len(got) > 0
                self.assertEqual(len(want), len(got))
                self.assertEqual(want[0].start, got[0].start)
                self.assertEqual(want[-1].start, got[-1].start)
Ejemplo n.º 13
0
                         "Start": np.int32,
                         "End": np.int32
                     })

# Load the background regions from `b`: a tab-separated file without a
# header row, of which only file columns 1 and 2 are read and named
# Start / End (both parsed as 32-bit integers).
background = pd.read_table(b,
                           sep="\t",
                           usecols=[1, 2],
                           header=None,
                           names="Start End".split(),
                           engine="c",
                           dtype={
                               "Start": np.int32,
                               "End": np.int32
                           })

# Index every background interval in the tree.
tree = IntervalTree()
for start_, end_ in zip(background.Start, background.End):
    tree.add(start_, end_)

# Timed section: one tree search per (Start, End) pair of `chip`.
# Building the tree above is not part of the measurement.
start = time()
results = []
for start_, end_ in zip(chip.Start, chip.End):
    results.append(tree.search(start_, end_))
end = time()

# print(result)

# Difference between the two time() readings taken around the query loop.
total = end - start

# NOTE(review): `total` is a duration, yet it is interpreted here as a Unix
# timestamp, so the resulting datetime depends on the local time zone --
# presumably only used for formatting; confirm where total_dt is consumed.
total_dt = datetime.datetime.fromtimestamp(total)
Ejemplo n.º 14
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1,-1]):
    """Strand-aware overlap join of two interval sets; a generator of output rows.

    ``GenomicInterval`` items of ``rightSet`` are indexed in an
    ``IntervalTree``; ``leftSet`` is streamed against it. Items of
    ``leftSet`` that are not ``GenomicInterval`` instances are yielded as is.
    Every output row is produced by ``getSelectedColumns``.

    :param mincols: minimum overlap; with ``asfraction == True`` it is
        treated as a factor applied to the shorter of the two intervals
        (result floored) and recomputed for every hit
    :param leftfill: also emit right items that were never joined
    :param rightfill: also emit left intervals without any accepted hit
    :param matchStrand: > 0 rejects hits whose strand product is -1,
        < 0 rejects hits whose strand product is 1, and a magnitude above 1
        additionally rejects hits whose strand product is 0
    :param outColumns: forwarded untouched to ``getSelectedColumns``
        (NOTE(review): mutable default -- safe only while it is never
        modified in place; confirm in ``getSelectedColumns``)
    """
    minoverlap = mincols
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1

    # Read rightSet into memory:
    rightTree = IntervalTree()
    for entry in rightSet:
        if not isinstance(entry, GenomicInterval):
            continue
        rightTree.insert(entry, rightSet.linenum, entry.fields)
        if rightlen == 0:
            rightlen = entry.nfields
        if rightStrandCol == -1:
            rightStrandCol = entry.strand_col

    for interval in leftSet:
        is_region = isinstance(interval, GenomicInterval)
        if leftlen == 0 and is_region:
            leftlen = interval.nfields
        if not is_region:
            yield interval
            continue

        hits = []
        rightTree.intersect(interval, lambda node: hits.append(node))
        rejected = 0
        leftbases = interval.end - interval.start
        for hit in hits:
            rightbases = hit.end - hit.start
            if asfraction == True:
                # min() returns leftbases on ties, like the original if/else
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            # positions covered by the left interval, both ends included
            span = range(interval.start, interval.end + 1)
            start_inside = hit.start in span
            end_inside = hit.end in span
            if start_inside and end_inside:
                overlap = hit.end - hit.start
            elif start_inside:
                overlap = interval.end - hit.start
            elif end_inside:
                overlap = hit.end - interval.start
            else:
                # the hit's start and end are both outside the interval range
                overlap = interval.end - interval.start

            if overlap < mincols:
                rejected += 1
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[hit.other[rightStrandCol]]
            strand_rejected = (
                (strandMatched == -1 and matchStrand > 0)       # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)     # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria, permissive match
            )
            if strand_rejected:
                rejected += 1
                continue

            # strand criteria met
            hit.visited = True
            yield getSelectedColumns(interval.fields, hit.other, outColumns)

        if (len(hits) == 0 or rejected == len(hits)) and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def collect_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(collect_unvisited)
        for node in unvisited:
            yield getSelectedColumns(leftlen, node.other, outColumns)
Ejemplo n.º 15
0
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Overlap join of ``leftSet`` against ``rightSet`` with strand filtering.

    This is a generator. The ``GenomicInterval`` items of ``rightSet`` are
    stored in an ``IntervalTree``. Each ``GenomicInterval`` of ``leftSet`` is
    intersected with the tree, and every hit that passes the overlap and
    strand checks is yielded through ``getSelectedColumns`` and marked
    ``visited``. Other ``leftSet`` items are yielded unchanged.

    * ``mincols`` -- minimum overlap. When ``asfraction == True`` it is a
      factor applied to the shorter interval of each pair (floored).
    * ``rightfill`` -- emit a filler row for left intervals with no accepted hit.
    * ``leftfill`` -- afterwards emit a filler row for every tree node that
      was never marked ``visited``.
    * ``matchStrand`` -- positive values reject a strand product of -1,
      negative values reject a product of 1; values beyond +/-1 also reject
      a product of 0.
    * ``outColumns`` -- passed through to ``getSelectedColumns``.
    """

    def overlap_with(region, hit):
        # Length of the shared part, decided by which ends of ``hit`` fall
        # inside ``region`` (both region ends count as inside).
        covered = range(region.start, region.end + 1)
        starts_inside = hit.start in covered
        ends_inside = hit.end in covered
        if starts_inside and ends_inside:
            return hit.end - hit.start
        if starts_inside:
            return region.end - hit.start
        if ends_inside:
            return hit.end - region.start
        # the intersecting item's start and end are outside the region
        return region.end - region.start

    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    rightTree = IntervalTree()

    for candidate in rightSet:
        if not isinstance(candidate, GenomicInterval):
            continue
        rightTree.insert(candidate, rightSet.linenum, candidate.fields)
        if rightlen == 0:
            rightlen = candidate.nfields
        if rightStrandCol == -1:
            rightStrandCol = candidate.strand_col

    for interval in leftSet:
        genomic = isinstance(interval, GenomicInterval)
        if leftlen == 0 and genomic:
            leftlen = interval.nfields
        if not genomic:
            yield interval
            continue

        matches = []
        rightTree.intersect(interval, lambda node: matches.append(node))
        overlap_not_met = 0
        leftbases = interval.end - interval.start
        for match in matches:
            rightbases = match.end - match.start
            if asfraction == True:
                # min() keeps leftbases on ties, as the original branch did
                shorter = min(leftbases, rightbases)
                mincols = math.floor(shorter * minoverlap)

            if overlap_with(interval, match) < mincols:
                overlap_not_met += 1
                continue

            # check strand
            left_sign = STRAND_INTEGER_VALUES[interval.strand]
            right_sign = STRAND_INTEGER_VALUES[match.other[rightStrandCol]]
            strandMatched = left_sign * right_sign
            if strandMatched == -1 and matchStrand > 0:
                # needed match but found a complement
                overlap_not_met += 1
                continue
            if strandMatched == 1 and matchStrand < 0:
                # needed complement but found a match
                overlap_not_met += 1
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                # strict criteria but only permissive match found
                overlap_not_met += 1
                continue

            # strand criteria met
            match.visited = True
            yield getSelectedColumns(interval.fields, match.other, outColumns)

        nothing_joined = len(matches) == 0 or overlap_not_met == len(matches)
        if nothing_joined and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        leftovers = []

        def keep_if_unvisited(node):
            if not hasattr(node, "visited"):
                leftovers.append(node)

        rightTree.traverse(keep_if_unvisited)
        for node in leftovers:
            yield getSelectedColumns(leftlen, node.other, outColumns)
Ejemplo n.º 16
0
 def add(self, chrom, start, end, val):
     """Store ``val`` for ``start``..``end`` in the tree of ``chrom``.

     The tree for a chromosome is created the first time that chromosome
     is seen.
     """
     trees = self.chroms
     if chrom not in trees:
         trees[chrom] = IntervalTree()
     trees[chrom].add(start, end, val)
Ejemplo n.º 17
0
def createIntervalTreesFragmentFile(options):
    '''
        creates one interval tree for quick lookups

        Reads options.genomeFragmentFile (tab-separated: chromosome, start,
        end; blank lines and lines starting with "Genome" or "Chromosome"
        are ignored). Consecutive fragments of one chromosome are merged
        until options.fragmentAggregation input lines have been collected;
        chromosomes not matching options.chromPattern (when non-empty) are
        left out. A line that raises while being processed is skipped,
        unless options.vverbose is set, in which case the traceback is
        printed and the program exits with status 1.

        returns a list of four items
            fragmentsMap[fragmentId] = tuple(chrom, start, end)
            intersect_tree - intersect Tree for interval matching
            fragmentsCount - number of fragments stored
            fragmentsChrom[chrom] = tuple(first fragmentId, last fragmentId + 1)

    '''

    # print(...) with one pre-formatted argument behaves the same on
    # Python 2 (print statement) and Python 3 (print function).
    if options.verbose:
        print("- %s START   : populate intervaltree from fragmented genome" % (
            timeStamp()))

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {}  # lookup table for fragment ranges of a chromosome
    fragmentsStart = 0

    start = 0
    end = 0
    counter = 0
    chrom = ""

    def store_fragment(fragment_id, frag_chrom, frag_start, frag_end):
        # Register one merged fragment in both the tree and the id map.
        intersect_tree.insert(Interval(frag_chrom, frag_start, frag_end),
                              fragment_id)
        fragmentsMap[fragment_id] = (frag_chrom, frag_start, frag_end)

    for line in fileinput.input([options.genomeFragmentFile]):
        line = line.strip()
        if not line or line.startswith(("Genome", "Chromosome")):
            continue

        cols = line.split("\t")
        # NOTE(review): a line failing half-way (e.g. a non-numeric start)
        # can leave chrom updated while start/end still hold older values.
        try:
            # check if chromosome changed from last
            if cols[0] != chrom:
                # do we have to finish the last chromosome?
                if end > 0:
                    store_fragment(fragmentsCount, chrom, start, end)
                    fragmentsCount += 1

                    fragmentsChrom[chrom] = (fragmentsStart, fragmentsCount)
                    fragmentsStart = fragmentsCount

                    if options.vverbose:
                        print("-- intervaltree.add %s:%d-%d" % (
                            chrom, start, end))
                # check if chromosome needs to be filtered out or not
                if (options.chromPattern != ""
                        and not re.match(options.chromPattern, cols[0])):
                    chrom = ""
                    start = 0
                    end = 0
                else:
                    chrom = cols[0]
                    start = int(cols[1])
                    end = int(cols[2])
                counter = 0

            # check if fragment aggregation is fulfilled
            elif counter >= options.fragmentAggregation:
                store_fragment(fragmentsCount, chrom, start, end)
                if options.vverbose:
                    print("-- intervaltree.add %s:%d-%d" % (
                        chrom, start, end))

                # the current line opens the next merged fragment
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
                fragmentsCount += 1
            else:
                # still aggregating: only extend the end of the fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        except Exception:
            # Exception rather than a bare except, so that Ctrl-C and
            # SystemExit are not mistaken for a malformed line.
            if options.verbose:
                sys.stderr.write(
                    'skipping line in options.genomeFragmentFile: %s' % (line)
                    + "\n")
            if options.vverbose:
                traceback.print_exc()
                sys.exit(1)

    # handle last fragment
    if end > 0:
        store_fragment(fragmentsCount, chrom, start, end)
        fragmentsCount += 1
        fragmentsChrom[chrom] = (fragmentsStart, fragmentsCount)

        if options.vverbose:
            print("-- intervaltree.add %s:%d-%d" % (chrom, start, end))

    if options.verbose:
        print("- %s FINISHED: intervaltree populated" % (timeStamp()))

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
Ejemplo n.º 18
0
class Document(HString):
    """A text document together with the annotations placed on it.

    Annotations are stored twice: in an ``IntervalTree`` for lookups by
    character span, and in a dictionary keyed by annotation id.
    """

    def __init__(self,
                 content,
                 doc_id=None,
                 language=lng.ENGLISH,
                 preprocessors=None):
        """Create a document over ``content``.

        :param content: the document text
        :param doc_id: identifier of the document; when ``None`` (the
            default) a fresh ``rand_id(10)`` is generated per instance. The
            default used to be ``rand_id()``, which is evaluated once at
            definition time and was therefore shared by all documents.
        :param language: value stored under the ``LANGUAGE`` attribute
        :param preprocessors: when truthy, ``content`` is run through
            ``preprocess(content, preprocessors)`` before being stored
        """
        # NOTE(review): the span end uses the raw content length, taken
        # before any preprocessing -- confirm this is intended.
        super().__init__(self, 0, len(content))
        self._content = preprocess(content,
                                   preprocessors) if preprocessors else content
        self._annotations = IntervalTree()
        self._doc_id = rand_id(10) if doc_id is None else doc_id
        self._completed = {}
        self._next_id = 0
        self[LANGUAGE] = language
        self._aid_dict = {}

    @property
    def content(self) -> str:
        """The stored (possibly preprocessed) text."""
        return self._content

    @property
    def doc_id(self):
        """The identifier of this document."""
        return self._doc_id

    def annotation(self,
                   annotation_type,
                   start=None,
                   end=None) -> typing.List[Annotation]:
        """Return the sorted annotations of a type, optionally within a span.

        :param annotation_type: type name, compared case-insensitively; a
            falsy value selects annotations of every type
        :param start: span start; the whole document is searched unless
            both ``start`` and ``end`` are given
        :param end: span end
        :return: sorted list of annotations, never containing the document
            itself; an empty list when the tree lookup raises
        """
        try:
            if end is None or start is None:
                anno_iter = self._annotations.find(Interval(0, self.end))
            else:
                anno_iter = filter(
                    lambda x: x.data.overlaps(Span(start, end)),
                    self._annotations.find(Interval(start, end)))
        except Exception:
            # Best effort: a failing lookup is reported as "no annotations".
            return []
        if annotation_type:
            annotation_type = annotation_type.lower()
            return sorted([
                x.data for x in anno_iter
                if x.data.annotation_type.lower() == annotation_type
                and x.data != self
            ])
        return sorted([x.data for x in anno_iter if x.data != self])

    def annotation_by_id(self, annotation_id: int):
        """Return the annotation registered under ``annotation_id`` or ``None``."""
        return self._aid_dict.get(annotation_id)

    def previous_annotation(self,
                            annotation: Annotation,
                            annotation_type: str = None) -> 'Annotation':
        """Return the last annotation found between -1 and ``annotation.start``.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty detached annotation of that type is
        returned instead of ``None``.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=-1, end=annotation.start)
        if not a:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[-1]

    def next_annotation(self,
                        annotation: Annotation,
                        annotation_type: str = None) -> 'Annotation':
        """Return the first annotation found between ``annotation.end`` and the document end.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty detached annotation of that type is
        returned instead of ``None``.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type,
                            start=annotation.end,
                            end=self.end)
        if not a:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[0]

    def create_annotation(self,
                          type: str,
                          start: int,
                          end: int,
                          attributes=None) -> Annotation:
        """Create an annotation, register it by span and by id, and return it.

        The annotation receives the next free id of this document.
        """
        if attributes is None:
            attributes = []
        annotation = Annotation(self, start, end, type, attributes,
                                self._next_id)
        self._next_id += 1
        self._annotations.insert(
            Interval(annotation.start, annotation.end, annotation))
        self._aid_dict[annotation.annotation_id] = annotation
        return annotation

    def annotate(self, *args):
        """Run the language's annotator for every requested annotation type.

        Types already recorded as completed are skipped.

        :raises Exception: when the language has no annotator for a type
        """
        for arg in args:
            if arg in self._completed:
                continue
            self.language().load()
            annotator = self.language().get_annotator(arg)
            if annotator:
                annotator.annotate(self)
                self._completed[arg] = '1.0'
            else:
                raise Exception("No annotator for {} annotations in {}".format(
                    arg, self.language()))

    def language(self):
        """Return the ``LANGUAGE`` attribute, or ``lng.UNKNOWN`` when unset."""
        if LANGUAGE in self.attributes:
            return self.attributes[LANGUAGE]
        return lng.UNKNOWN

    @staticmethod
    def from_spacy(parsed):
        """Build a document from a spaCy parse.

        Tokens with a non-blank lemma, entities, sentences and noun chunks
        become annotations; each token whose head is another token gets a
        "dep" relation to that head.

        :return: the new document (the original fell off the end of the
            function and returned ``None``)
        """
        document = Document(content=str(parsed))
        for token in parsed:
            if token.lemma_.strip() != "":
                t = document.create_annotation(
                    "token", token.idx, token.idx + len(token),
                    [(type.INDEX, token.i), (type.LEMMA, token.lemma_),
                     ("prob", token.prob),
                     (type.PART_OF_SPEECH, PartOfSpeech.of(token.tag_))])
                # A token that is its own head has no relation to record.
                # The head index is used as is: index 0 is a valid head and
                # must not be dropped by a truthiness test.
                if token.head is not token:
                    t.add_relation(target=token.head.i,
                                   type="dep",
                                   relation=token.dep_)

        for entity in parsed.ents:
            document.create_annotation(type.ENTITY, entity.start_char,
                                       entity.end_char,
                                       [(type.ENTITY_TYPE, entity.label_)])
        for i, sentence in enumerate(parsed.sents):
            document.create_annotation(type.SENTENCE, sentence.start_char,
                                       sentence.end_char, [(type.INDEX, i)])
        for chunk in parsed.noun_chunks:
            document.create_annotation(
                type.PHRASE_CHUNK, chunk.start_char, chunk.end_char,
                [(type.PART_OF_SPEECH, PennTreebank.NP)])
        return document

    @staticmethod
    def from_json(json_str):
        """Build a document from the JSON text produced by ``to_json``."""
        doc = Document(content='')
        doc.__read_json(json.loads(json_str))
        return doc

    def __getstate__(self):
        return self.to_dict()

    def __setstate__(self, state):
        self.__read_json(state)

    def __read_json(self, obj):
        """Re-initialise this document from a dictionary shaped like ``to_dict()``."""
        self.__init__(content=obj['content'])
        self._doc_id = obj.get('id', self._doc_id)
        for (k, v) in obj.get("attributes", {}).items():
            self[k] = get_decoder(k)(v)
        for (k, v) in obj.get('completed', {}).items():
            self._completed[k] = v
        max_id = -1
        for annotation in obj.get("annotations", []):
            ann = Annotation(
                document=self,
                start=annotation["start"],
                end=annotation["end"],
                annotation_type=annotation["type"],
                attributes=[
                    (k, get_decoder(k)(v))
                    for k, v in annotation.get("attributes", {}).items()
                ],
                annotation_id=annotation["id"])
            max_id = max(max_id, ann.annotation_id)
            self._annotations.add(ann.start, ann.end, ann)
            # Register by id as create_annotation does, so annotation_by_id
            # also works on deserialized documents.
            self._aid_dict[ann.annotation_id] = ann
            for rel in annotation.get("relations", []):
                ann.add_relation(target=rel["target"],
                                 type=rel["type"],
                                 relation=rel["value"])
        self.language().load()
        # New annotations continue after the highest id that was loaded.
        self._next_id = max_id + 1

    def to_json(self) -> str:
        """Serialize the document to JSON text."""
        return json.dumps(self.to_dict(), default=default)

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return id, content, attributes, completed map and all annotations as a dict."""
        return {
            "id": self._doc_id,
            "content": self.content,
            "attributes": self._attributes,
            "completed": self._completed,
            "annotations":
            [a.as_dict() for a in self.annotation(annotation_type=None)],
        }
Ejemplo n.º 19
0
 def setUp(self):
     """Build a tree holding two (22, 33) intervals that differ only in data."""
     tree = IntervalTree()
     for label in ('example1', 'example2'):
         tree.insert(Interval(22, 33, data=label))
     self.tree4 = tree
Ejemplo n.º 20
0
 def setUp(self):
     """Start each test with a new IntervalTree that has no entries."""
     self.tree = IntervalTree()
Ejemplo n.º 21
0
 def setUp(self):
     """Create the fixture: one tree with two entries sharing the bounds (22, 33)."""
     fixture = IntervalTree()
     for payload in ('example1', 'example2'):
         fixture.insert(Interval(22, 33, data=payload))
     self.tree4 = fixture
Ejemplo n.º 22
0
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from itertools import groupby
from operator import itemgetter
import numpy as np

# Plan: for each of the three inputs (CCR, pLI, missense-Z/MPC intervals)
# build a list of interval tuples and a tree of those intervals, then search
# each tree with the lists to get the counts for the Venn diagram.

# The three inputs are given on the command line; the trailing comments show
# example paths.
# NOTE(review): the example paths end in .gz but the files are opened with
# plain open() in binary mode; on Python 3 the str-based split("\t") below
# would fail on bytes lines -- presumably Python 2 code with uncompressed
# input, confirm.
ccrs = open(sys.argv[1], "rb")  #exacresiduals/gnomad10x.5syn-ccrs.bed.gz
mpcs = open(sys.argv[2], "rb")  #essentials/mpc.regions.clean.sorted.bed.gz
plis = open(sys.argv[3],
            "rb")  #$HOME/software/pathoscore/score-sets/GRCh37/pLI/pLI.bed.gz

# generate data: we want to search the trees with the lists to get the numbers in each venn diagram
ccrtree = defaultdict(lambda: IntervalTree())  # one tree per key, created on first access
ccrlist = defaultdict(list)
# Sorting and grouping use the same key (columns 0, 3 and 6), as
# itertools.groupby requires its input to be sorted by the grouping key.
sorter = itemgetter(0, 3, 6)
grouper = itemgetter(0, 3, 6)
ccrtemp = []
ccrgenes = set()
pligenes = set()
mpcgenes = set()
ccr99genes = set()
# Keep only the records whose last column is at least 95.
for ccr in ccrs:
    ccr = ccr.strip().split("\t")
    if float(ccr[-1]) < 95: continue
    ccrtemp.append(ccr)
# Walk the kept records group by group; column 0 of the first record of a
# group is taken as the chromosome of the whole group.
for key, grp in groupby(sorted(ccrtemp, key=sorter), grouper):
    grp = list(grp)
    chrom = grp[0][0]
Ejemplo n.º 23
0
def createIntervalTreesFragmentFile():
    '''
        creates one interval tree for quick lookups

        Reads options.fragmentFile (tab separated: chromosome, start, end).
        Blank lines and lines starting with "Genome" or "Chromosome" are
        ignored.  Consecutive lines of one chromosome are merged until
        options.fragmentAggregation lines have been collected; each merged
        range is stored as one Interval, numbered from 0.  If
        options.chromPattern is not empty, chromosomes whose name does not
        re.match() it are left out.

        Lines that cannot be parsed are skipped (reported when
        options.verbose is set; with options.vverbose the traceback is
        printed and the program exits with status 1).

        returns a list of four items
            fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
            intersect_tree - intersect Tree for interval matching
            fragmentsCount - number of fragments stored
            fragmentsChrom[chrom] = tuple(firstFragmentId, lastFragmentId + 1)
    '''

    def emit(stream, message):
        # write() behaves the same on Python 2 and 3, unlike "print >> stream"
        stream.write(message + "\n")

    def addFragment(fragChrom, fragStart, fragEnd, fragmentId):
        # store one merged fragment in the tree and remember its midpoint
        intersect_tree.insert(Interval(fragChrom, fragStart, fragEnd), fragmentId)
        fragmentsMap[fragmentId] = tuple([fragChrom, int(0.5*(fragStart+fragEnd))])
        if options.vverbose:
            emit(sys.stdout, "-- intervaltree.add %s:%d-%d" % (fragChrom, fragStart, fragEnd))

    if options.verbose:
        emit(sys.stdout, "- %s START   : populate intervaltree from fragmented genome" % (timeStamp()))

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {} # lookup table for fragment ranges of a chromosome
    fragmentsStart = 0

    start = 0
    end = 0
    counter = 0
    chrom = ""

    for line in fileinput.input([options.fragmentFile]):
        line = line.strip()
        if len(line) == 0 or line.startswith("Genome") or line.startswith("Chromosome"):
            continue

        cols = line.split("\t")
        try:
            # check if chromosome changed from last
            if cols[0] != chrom:
                # check if chromosome needs to be filtered out or not
                wanted = (options.chromPattern == "" or re.match(options.chromPattern, cols[0]))
                if wanted:
                    # parse before touching any state, so that a malformed
                    # line is skipped without leaving a half-updated fragment
                    newStart = int(cols[1])
                    newEnd = int(cols[2])

                # do we have to finish the last chromosome?
                if end > 0:
                    addFragment(chrom, start, end, fragmentsCount)
                    fragmentsCount += 1
                    fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])
                    fragmentsStart = fragmentsCount

                if wanted:
                    chrom = cols[0]
                    start = newStart
                    end = newEnd
                else:
                    # end == 0 marks "no fragment pending"
                    chrom = ""
                    start = 0
                    end = 0
                counter = 0

            # check if fragment aggregation is fulfilled
            elif counter >= options.fragmentAggregation:
                # parse first: a bad line must not store the pending fragment
                # without also advancing fragmentsCount
                newStart = int(cols[1])
                newEnd = int(cols[2])
                addFragment(chrom, start, end, fragmentsCount)
                fragmentsCount += 1
                start = newStart
                end = newEnd
                counter = 0
            else:
                # still aggregating: only extend the pending fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        except Exception:
            # best effort: skip unparsable lines, but let KeyboardInterrupt
            # and SystemExit through
            if options.verbose:
                emit(sys.stderr, 'skipping line in options.fragmentFile: %s' % (line))
            if options.vverbose:
                traceback.print_exc()
                sys.exit(1)

    # handle last fragment
    if end > 0:
        addFragment(chrom, start, end, fragmentsCount)
        fragmentsCount += 1
        fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])

    if options.verbose:
        emit(sys.stdout, "- %s FINISHED: intervaltree populated" % (timeStamp()))

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
Ejemplo n.º 24
0
def getpairs(leftSet,
             rightSet,
             leftCol,
             mincols=1,
             asfraction=False,
             matchStrand=STRAND_NEUTRAL,
             skipChrNames=True,
             skipStrandNames=True):
    """Yield ``[word, leftTerm]`` pairs for overlapping left/right intervals.

    Every GenomicInterval of ``leftSet`` is loaded into an interval tree.
    Each GenomicInterval of ``rightSet`` is intersected with that tree; for
    every left interval that overlaps it by enough bases and passes the
    strand check, each non-numeric word found in the right interval's
    remaining columns is yielded together with field ``leftCol`` of the left
    interval.  Items of ``rightSet`` that are not GenomicInterval instances
    are yielded unchanged.

    Parameters:
        leftSet: iterable of intervals; must expose ``linenum``.
        rightSet: iterable of intervals to look up.
        leftCol: index of the left field reported as ``leftTerm``.
        mincols: minimum overlap in bases or, with ``asfraction``, the
            fraction of the shorter of the two intervals that must overlap.
        asfraction: treat ``mincols`` as a fraction (any truthy value).
        matchStrand: a value > 0 rejects pairs on opposite strands, a value
            < 0 rejects pairs on the same strand, and a value outside
            [-1, 1] also rejects pairs whose strand product is 0.
        skipChrNames: leave the right chromosome column out of the words.
        skipStrandNames: leave the right strand column out of the words.
    """
    # Read leftSet into memory:
    minoverlap = mincols
    leftStrandCol = -1
    leftTree = IntervalTree()
    for item in leftSet:
        if isinstance(item, GenomicInterval):
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    rightCols = None
    for interval in rightSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        if rightCols is None:
            # the column layout is taken from the first right interval;
            # list() is required because a Python 3 range has no remove()
            rightCols = list(range(interval.nfields))
            #remove the useless columns
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)

        result = []
        leftTree.intersect(interval, result.append)
        rightbases = interval.end - interval.start
        for node in result:
            if asfraction:
                leftbases = node.end - node.start
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            startInside = interval.start <= node.start <= interval.end
            endInside = interval.start <= node.end <= interval.end
            if startInside and not endInside:
                overlap = interval.end - node.start
            elif endInside and node.start < interval.start:
                overlap = node.end - interval.start
            elif startInside and endInside:
                overlap = node.end - node.start
            else:  #the intersecting item's start and end are outside the interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue

            #check strand
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand] *
                             STRAND_INTEGER_VALUES[node.other[leftStrandCol]])
            if strandMatched == -1 and matchStrand > 0:
                #needed match but found a complement
                continue
            if strandMatched == 1 and matchStrand < 0:
                #needed complement but found a match
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                #strict criteria but only permissive match found
                continue

            #strand criteria met
            node.visited = True
            leftTerm = node.other[leftCol]
            for col in rightCols:
                #take each field that's not a number, split it on tabs,
                #newlines, commas and semicolons (and backslashes, which the
                #historical '\,' / '\;' spelling put into the set), and
                #output the word and the leftTerm as being associated
                lexer = shlex.shlex(interval.fields[col])
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'

                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        yield [word, leftTerm]
Ejemplo n.º 25
0
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Yield joined field lists for overlapping left/right intervals.

    Every GenomicInterval of ``rightSet`` is loaded into an interval tree.
    For each GenomicInterval of ``leftSet``, one list is yielded per right
    interval overlapping it by at least ``mincols`` bases: the left fields
    followed by the right fields.  Items of ``leftSet`` that are not
    GenomicInterval instances are yielded unchanged.

    Parameters:
        leftSet: iterable of intervals to look up.
        rightSet: iterable of intervals; must expose ``linenum``.
        mincols: minimum overlap, in bases, for a pair to be reported.
        leftfill: after all left intervals, also yield every right interval
            that was never matched, preceded by one "." per left field.
        rightfill: for a left interval without any sufficient match, yield
            its fields followed by one "." per right field.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue
        if leftlen == 0:
            leftlen = interval.nfields

        result = []
        rightTree.intersect(interval, result.append)
        matched = False
        for item in result:
            # plain comparisons instead of "x in range(...)": same answer for
            # integer coordinates, without building a list on Python 2
            startInside = interval.start <= item.start <= interval.end
            endInside = interval.start <= item.end <= interval.end
            if startInside and not endInside:
                overlap = interval.end - item.start
            elif endInside and not startInside:
                overlap = item.end - interval.start
            elif startInside and endInside:
                overlap = item.end - item.start
            else:  #the intersecting item's start and end are outside the interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue

            matched = True
            outfields = list(interval)
            # extend() rather than map(outfields.append, ...): map is lazy on
            # Python 3 and would silently append nothing
            outfields.extend(item.other)
            item.visited = True  # consulted by the leftfill pass below
            yield outfields

        if not matched and rightfill:
            yield list(interval) + ["."] * rightlen

    if leftfill:
        unvisited = []

        def collectUnvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(collectUnvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
Ejemplo n.º 26
0
# Time how long it takes to load the background intervals into an
# IntervalTree and write the result to the Snakemake output file.
b = snakemake.input.background

# Only columns 1 and 2 (zero-based) of the tab-separated, header-less
# background file are read; they are named Start and End.
background = pd.read_table(b,
                           sep="\t",
                           usecols=[1, 2],
                           header=None,
                           names="Start End".split(),
                           engine="c",
                           dtype={
                               "Start": np.int32,
                               "End": np.int32
                           })

start = time()

tree = IntervalTree()
for start_, end_ in zip(background.Start, background.End):
    tree.add(start_, end_)

end = time()

total = end - start

# Format the elapsed time as <minutes>.<seconds>.<microseconds>.
# A timedelta is used instead of datetime.fromtimestamp(): the latter treats
# the duration as a moment in local time, so the minutes depended on the
# machine's UTC offset and whole hours were dropped; its '%-M' / '%-S'
# directives are also not available on every platform.
elapsed = datetime.timedelta(seconds=total)
minutes, seconds = divmod(elapsed.days * 86400 + elapsed.seconds, 60)
minutes_seconds = "%d.%d.%06d" % (minutes, seconds, elapsed.microseconds)

# the context manager makes sure the result is flushed and the file closed
with open(snakemake.output[0], "w+") as outfile:
    outfile.write(minutes_seconds)
Ejemplo n.º 27
0
def createIntervalTrees():
    '''
        creates one interval tree for quick lookups

        Reads options.fragmentFile (tab separated: chromosome, start, end).
        Blank lines and lines starting with "Genome" or "Chromosome" are
        ignored.  Consecutive lines of one chromosome are merged until
        options.fragmentAggregation lines have been collected; each merged
        range is stored as one Interval, numbered from 0.

        Lines that cannot be parsed are skipped (reported when
        options.verbose is set; with options.vverbose the traceback is
        printed as well).

        returns a list of two items
            fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
            intersect_tree - intersect Tree for interval matching
    '''

    def emit(stream, message):
        # write() behaves the same on Python 2 and 3, unlike "print >> stream"
        stream.write(message + "\n")

    def addFragment(fragChrom, fragStart, fragEnd, fragmentId):
        # store one merged fragment in the tree and remember its midpoint
        intersect_tree.insert(Interval(fragChrom, fragStart, fragEnd),
                              fragmentId)
        fragmentsMap[fragmentId] = tuple(
            [fragChrom, int(0.5 * (fragStart + fragEnd))])
        if options.vverbose:
            emit(sys.stdout, "-- intervaltree.add %s:%d-%d" % (
                fragChrom, fragStart, fragEnd))

    if options.verbose:
        emit(sys.stdout,
             "- %s START   : populate intervaltree from fragmented genome" % (
                 timeStamp()))

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}

    start = 0
    end = 0
    counter = 0
    chrom = ""

    for line in fileinput.input([options.fragmentFile]):
        line = line.strip()
        if (len(line) == 0 or line.startswith("Genome")
                or line.startswith("Chromosome")):
            continue

        cols = line.split("\t")
        try:
            # check if chromosome changed from last
            if cols[0] != chrom:
                # parse before touching any state, so that a malformed line
                # is skipped without leaving a half-updated fragment
                newStart = int(cols[1])
                newEnd = int(cols[2])

                # do we have to finish the last chromosome?
                if end > 0:
                    addFragment(chrom, start, end, fragmentsCount)
                    fragmentsCount += 1

                chrom = cols[0]
                start = newStart
                end = newEnd
                counter = 0

            # check if fragment aggregation is fulfilled
            elif counter >= options.fragmentAggregation:
                # parse first: a bad line must not store the pending fragment
                # without also advancing fragmentsCount
                newStart = int(cols[1])
                newEnd = int(cols[2])
                addFragment(chrom, start, end, fragmentsCount)
                fragmentsCount += 1
                start = newStart
                end = newEnd
                counter = 0
            else:
                # still aggregating: only extend the pending fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        except Exception:
            # best effort: skip unparsable lines, but let KeyboardInterrupt
            # and SystemExit through
            if options.verbose:
                emit(sys.stderr,
                     'skipping line in options.fragmentFile: %s' % (line))
            if options.vverbose:
                traceback.print_exc()

    # handle last fragment
    if end > 0:
        addFragment(chrom, start, end, fragmentsCount)
        fragmentsCount += 1

    if options.verbose:
        emit(sys.stdout,
             "- %s FINISHED: intervaltree populated" % (timeStamp()))

    return [fragmentsMap, intersect_tree]
Ejemplo n.º 28
0
def getpairs(leftSet, rightSet, leftCol, mincols=1, asfraction=False, matchStrand=STRAND_NEUTRAL, skipChrNames=True, skipStrandNames=True):
    """Yield ``[word, leftTerm]`` pairs for overlapping left/right intervals.

    Every GenomicInterval of ``leftSet`` is loaded into an interval tree.
    Each GenomicInterval of ``rightSet`` is intersected with that tree; for
    every left interval that overlaps it by enough bases and passes the
    strand check, each non-numeric word found in the right interval's
    remaining columns is yielded together with field ``leftCol`` of the left
    interval.  Items of ``rightSet`` that are not GenomicInterval instances
    are yielded unchanged.

    Parameters:
        leftSet: iterable of intervals; must expose ``linenum``.
        rightSet: iterable of intervals to look up.
        leftCol: index of the left field reported as ``leftTerm``.
        mincols: minimum overlap in bases or, with ``asfraction``, the
            fraction of the shorter of the two intervals that must overlap.
        asfraction: treat ``mincols`` as a fraction (any truthy value).
        matchStrand: a value > 0 rejects pairs on opposite strands, a value
            < 0 rejects pairs on the same strand, and a value outside
            [-1, 1] also rejects pairs whose strand product is 0.
        skipChrNames: leave the right chromosome column out of the words.
        skipStrandNames: leave the right strand column out of the words.
    """
    # Read leftSet into memory:
    minoverlap = mincols
    leftStrandCol = -1
    leftTree = IntervalTree()
    for item in leftSet:
        if isinstance(item, GenomicInterval):
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    rightCols = None
    for interval in rightSet:
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        if rightCols is None:
            # the column layout is taken from the first right interval;
            # list() is required because a Python 3 range has no remove()
            rightCols = list(range(interval.nfields))
            #remove the useless columns
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)

        result = []
        leftTree.intersect(interval, result.append)
        rightbases = interval.end - interval.start
        for node in result:
            if asfraction:
                leftbases = node.end - node.start
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            startInside = interval.start <= node.start <= interval.end
            endInside = interval.start <= node.end <= interval.end
            if startInside and not endInside:
                overlap = interval.end - node.start
            elif endInside and node.start < interval.start:
                overlap = node.end - interval.start
            elif startInside and endInside:
                overlap = node.end - node.start
            else:   #the intersecting item's start and end are outside the interval range
                overlap = interval.end - interval.start
            if overlap < mincols:
                continue

            #check strand
            strandMatched = (STRAND_INTEGER_VALUES[interval.strand] *
                             STRAND_INTEGER_VALUES[node.other[leftStrandCol]])
            if strandMatched == -1 and matchStrand > 0:
                #needed match but found a complement
                continue
            if strandMatched == 1 and matchStrand < 0:
                #needed complement but found a match
                continue
            if strandMatched == 0 and (matchStrand < -1 or matchStrand > 1):
                #strict criteria but only permissive match found
                continue

            #strand criteria met
            node.visited = True
            leftTerm = node.other[leftCol]
            for col in rightCols:
                #take each field that's not a number, split it on tabs,
                #newlines, commas and semicolons (and backslashes, which the
                #historical '\,' / '\;' spelling put into the set), and
                #output the word and the leftTerm as being associated
                lexer = shlex.shlex(interval.fields[col])
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'

                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        yield [word, leftTerm]