class TestIssue9(unittest.TestCase): def setUp(self): self.tree4 = IntervalTree() self.tree4.insert(Interval(22, 33, data='example1')) self.tree4.insert(Interval(22, 33, data='example2')) def test_right(self): self.assertEqual(0, len(self.tree4.right(Interval(44, 55)))) self.assertEqual(2, len(self.tree4.right(Interval(11, 12)))) def test_left(self): self.assertEqual(2, len(self.tree4.left(Interval(44, 55)))) self.assertEqual(0, len(self.tree4.left(Interval(11, 12))))
class TestIssue9(unittest.TestCase): def setUp(self): self.tree4 = IntervalTree() self.tree4.insert(Interval(22, 33, data='example1')) self.tree4.insert(Interval(22, 33, data='example2')) def test_right(self): self.assertEqual(0, len(self.tree4.right(Interval(44,55)))) self.assertEqual(2, len(self.tree4.right(Interval(11,12)))) def test_left(self): self.assertEqual(2, len(self.tree4.left(Interval(44,55)))) self.assertEqual(0, len(self.tree4.left(Interval(11,12))))
def _create_intervaltree(locs):
    """Build an IntervalTree from the rows of ``locs``.

    Every item produced by ``locs.iterrows()`` is unpacked as
    ``key, (start, end)``. A row is stored (with ``key`` as its payload) only
    when ``find(start, end)`` on the tree built so far returns nothing truthy;
    otherwise the row is ignored.

    Returns the populated tree.
    """
    tree = IntervalTree()
    for key, (begin, stop) in locs.iterrows():
        already_present = tree.find(begin, stop)
        if not already_present:
            tree.add(begin, stop, key)
    return tree
def __init__(self, content, doc_id=None, language=lng.ENGLISH, preprocessors=None):
    """Create a document over ``content``.

    :param content: the raw text; its length is used as the end of the
        document's own span.
    :param doc_id: identifier of the document. When omitted (``None``) a
        fresh ``rand_id(10)`` is generated for this instance. The previous
        default ``rand_id()`` was evaluated once, when the function was
        defined, so all documents created without an id shared the same one.
    :param language: value stored under the ``LANGUAGE`` attribute.
    :param preprocessors: when truthy, ``preprocess(content, preprocessors)``
        is stored as the content instead of ``content`` itself.
    """
    # The base class receives this object itself plus the span [0, len(content)).
    # NOTE(review): the span uses the length of the raw content, not of the
    # preprocessed text - confirm this is intended.
    super().__init__(self, 0, len(content))
    self._content = preprocess(content, preprocessors) if preprocessors else content
    self._annotations = IntervalTree()
    self._doc_id = rand_id(10) if doc_id is None else doc_id
    self._completed = {}
    self._next_id = 0
    self[LANGUAGE] = language
    self._aid_dict = {}
def createIntervalTreesFragmentResolution():
    '''
        Cuts each selected chromosome of options.chromSizes into consecutive
        bins of options.resolution bases and registers every bin in one
        interval tree.

        Chromosomes are skipped when options.chromPattern is non-empty and
        does not match the whole chromosome name.

        returns a list of
            fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
            intersect_tree - tree holding one Interval per bin
            fragmentsCount - number of bins created
            fragmentsChrom[chrom] = tuple(firstFragmentId, lastFragmentId + 1)
    '''
    if options.verbose:
        print >> sys.stdout, "- %s START   : populate intervaltree with given resolution for chromosomes matching pattern %s" % (timeStamp(), options.chromPattern)

    intersect_tree = IntervalTree()
    fragmentsMap = {}
    fragmentsChrom = {}
    fragmentsCount = 0

    for line in fileinput.input([options.chromSizes]):
        fields = line.split("\t")
        chrom = fields[0]

        # an empty pattern selects every chromosome
        selected = options.chromPattern == "" or re.match("^" + options.chromPattern + "$", chrom)
        if not selected:
            if options.vverbose:
                print("skipping pattern %s" % (line))
            continue

        firstFragment = fragmentsCount
        chromlen = int(fields[1])

        for start in range(0, chromlen, options.resolution):
            # the last bin is clipped at the chromosome end
            end = min(start + options.resolution, chromlen)
            intersect_tree.insert(Interval(chrom, start, end), fragmentsCount)
            fragmentsMap[fragmentsCount] = (chrom, int(0.5 * (start + end)))
            fragmentsCount += 1

            if options.vverbose:
                print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

        fragmentsChrom[chrom] = (firstFragment, fragmentsCount)

    if options.verbose:
        print >> sys.stdout, "- %s FINISHED: intervaltree populated" % (timeStamp())

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
def test_tree_pickle(self):
    """dump() then load() must give a tree that answers find() like the original."""

    def probes():
        # Same Feature sequence is used for filling the tree and for querying it.
        for ichr in range(5):
            for centre in range(10, 100, 6):
                yield Feature(centre - 4, centre + 4, strand=1, chr=ichr)

    a = IntervalTree()
    for feature in probes():
        a.insert(feature)

    a.dump('a.pkl')

    b = IntervalTree()
    b.load('a.pkl')

    by_start = operator.attrgetter('start')
    for feature in probes():
        af = sorted(a.find(feature), key=by_start)
        bf = sorted(b.find(feature), key=by_start)

        assert len(bf) > 0
        self.assertEqual(len(af), len(bf))
        # compare the extremes of both result lists
        self.assertEqual(af[0].start, bf[0].start)
        self.assertEqual(af[-1].start, bf[-1].start)
class EmptyTreeTestCase(unittest.TestCase): """ test search on an empty tree.""" def setUp(self): self.tree = IntervalTree() def test_search(self): self.tree.search(46, 47) def test_find(self): self.tree.find(Interval(46, 47)) def test_left(self): self.tree.left(Interval(46, 47)) def test_right(self): self.tree.right(Interval(46, 47))
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join two interval streams on overlap, yielding one output row per pair.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is then
    streamed against it.

    :param leftSet: iterable of items; items that are not ``GenomicInterval``
        instances are yielded unchanged.
    :param rightSet: iterable of items exposing ``linenum``; only its
        ``GenomicInterval`` items are indexed.
    :param mincols: minimum overlap (in bases) for a pair to be reported.
    :param leftfill: when true, right intervals never paired are yielded at
        the end, preceded by one "." per left column.
    :param rightfill: when true, a left interval without an accepted partner
        is yielded followed by one "." per right column.
    :returns: generator of lists (joined rows) and passed-through items.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        rightTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        overlap_not_met = 0
        for item in result:
            # Plain comparisons replace "x in range(lo, hi + 1)", which built
            # a list of hi - lo + 1 integers per test under Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo
            if overlap < mincols:
                overlap_not_met += 1
                continue
            outfields = list(interval)
            # extend() instead of map(outfields.append, ...): map is lazy on
            # Python 3, so the right-hand columns were silently dropped there.
            outfields.extend(item.other)
            item.visited = True
            yield outfields
        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            outfields = list(interval)
            outfields.extend(["."] * rightlen)
            yield outfields

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
def main(argv):
    """Count, for each interval of a streamed BED file, the overlapping intervals of a loaded BED file.

    :param argv: command-line vector; ``argv[1]`` is the BED file loaded into
        one IntervalTree per chromosome, ``argv[2]`` the BED file streamed
        against those trees.

    Prints ``chrom<TAB>start<TAB>end<TAB>count`` per streamed line to stdout
    and the elapsed load/query times to stderr. Exits with status 1 after
    printing a usage line when fewer than two file arguments are given.
    """
    if len(argv) < 3:
        print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
        sys.exit(1)

    bed = {}
    start = timer()
    with open(argv[1]) as fp:
        for line in fp:
            if not line.strip():
                continue  # a blank line has no columns to parse
            # rstrip("\n") rather than line[:-1]: a final line without a
            # newline must not lose its last digit.
            t = line.rstrip("\n").split("\t")
            if t[0] not in bed:
                bed[t[0]] = IntervalTree()
            # start is shifted by one before being stored and queried
            bed[t[0]].add(int(t[1]) + 1, int(t[2]))
    sys.stderr.write("Read in {} sec\n".format(timer() - start))

    start = timer()
    with open(argv[2]) as fp:
        for line in fp:
            if not line.strip():
                continue
            t = line.rstrip("\n").split("\t")
            if t[0] not in bed:
                print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
            else:
                r = bed[t[0]].search(int(t[1]) + 1, int(t[2]))
                print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], len(r)))
    sys.stderr.write("Query in {} sec\n".format(timer() - start))
def setUp(self):
    """Provide each test with its own, initially empty, IntervalTree."""
    fresh_tree = IntervalTree()
    self.tree = fresh_tree
def test_tree_pickle(self):
    """dump() then load() must give a tree that answers find() like the original."""

    def probes():
        # The outer loop only repeats the inner sequence five times; its
        # index is not part of the interval.
        for ichr in range(5):
            for centre in range(10, 100, 6):
                yield Interval(centre - 4, centre + 4)

    a = IntervalTree()
    for probe in probes():
        a.insert(probe)

    a.dump('a.pkl')

    b = IntervalTree()
    b.load('a.pkl')

    by_start = operator.attrgetter('start')
    for probe in probes():
        af = sorted(a.find(probe), key=by_start)
        bf = sorted(b.find(probe), key=by_start)

        assert len(bf) > 0
        self.assertEqual(len(af), len(bf))
        # compare the extremes of both result lists
        self.assertEqual(af[0].start, bf[0].start)
        self.assertEqual(af[-1].start, bf[-1].start)
"Start": np.int32, "End": np.int32 }) background = pd.read_table(b, sep="\t", usecols=[1, 2], header=None, names="Start End".split(), engine="c", dtype={ "Start": np.int32, "End": np.int32 }) tree = IntervalTree() for start_, end_ in zip(background.Start, background.End): tree.add(start_, end_) start = time() results = [] for start_, end_ in zip(chip.Start, chip.End): results.append(tree.search(start_, end_)) end = time() # print(result) total = end - start total_dt = datetime.datetime.fromtimestamp(total)
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True, asfraction=False, matchStrand=STRAND_NEUTRAL, outColumns=[-1,-1]):
    """Join two interval streams on overlap and strand, yielding selected columns.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is then
    streamed against it.

    :param leftSet: iterable of items; items that are not ``GenomicInterval``
        instances are yielded unchanged.
    :param rightSet: iterable of items exposing ``linenum``; only its
        ``GenomicInterval`` items are indexed.
    :param mincols: minimum overlap in bases, or - with ``asfraction`` - the
        fraction of the shorter of the two intervals that must overlap.
    :param leftfill: when true, right intervals never paired are emitted at
        the end via ``getSelectedColumns(leftlen, item.other, outColumns)``.
    :param rightfill: when true, a left interval without an accepted partner
        is emitted via ``getSelectedColumns(interval.fields, rightlen, outColumns)``.
    :param asfraction: interpret ``mincols`` as a fraction (see above).
    :param matchStrand: strand policy: a positive value rejects pairs whose
        strand product is -1, a negative value rejects a product of 1, and
        a value outside [-1, 1] also rejects a product of 0.
    :param outColumns: passed through to ``getSelectedColumns``; it is not
        modified in this function.
    :returns: generator of ``getSelectedColumns`` results and passed-through items.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        rightTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        leftbases = hi - lo
        overlap_not_met = 0
        for item in result:
            rightbases = item.end - item.start
            if asfraction:
                # threshold relative to the shorter interval of the pair
                mincols = math.floor(min(rightbases, leftbases) * minoverlap)

            # Plain comparisons replace "x in range(lo, hi + 1)", which built
            # a list of hi - lo + 1 integers per test under Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo

            if overlap < mincols:
                overlap_not_met += 1
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[rightStrandCol]]
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria but only permissive match found
            )
            if rejected:
                overlap_not_met += 1
                continue

            # strand criteria met
            item.visited = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
def join(
    leftSet,
    rightSet,
    mincols=1,
    leftfill=True,
    rightfill=True,
    asfraction=False,
    matchStrand=STRAND_NEUTRAL,
    outColumns=[-1, -1],
):
    """Join two interval streams on overlap and strand, yielding selected columns.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is then
    streamed against it.

    :param leftSet: iterable of items; items that are not ``GenomicInterval``
        instances are yielded unchanged.
    :param rightSet: iterable of items exposing ``linenum``; only its
        ``GenomicInterval`` items are indexed.
    :param mincols: minimum overlap in bases, or - with ``asfraction`` - the
        fraction of the shorter of the two intervals that must overlap.
    :param leftfill: when true, right intervals never paired are emitted at
        the end via ``getSelectedColumns(leftlen, item.other, outColumns)``.
    :param rightfill: when true, a left interval without an accepted partner
        is emitted via ``getSelectedColumns(interval.fields, rightlen, outColumns)``.
    :param asfraction: interpret ``mincols`` as a fraction (see above).
    :param matchStrand: strand policy: a positive value rejects pairs whose
        strand product is -1, a negative value rejects a product of 1, and
        a value outside [-1, 1] also rejects a product of 0.
    :param outColumns: passed through to ``getSelectedColumns``; it is not
        modified in this function.
    :returns: generator of ``getSelectedColumns`` results and passed-through items.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightStrandCol = -1
    minoverlap = mincols
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields
            if rightStrandCol == -1:
                rightStrandCol = item.strand_col

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        rightTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        leftbases = hi - lo
        overlap_not_met = 0
        for item in result:
            rightbases = item.end - item.start
            if asfraction:
                # threshold relative to the shorter interval of the pair
                mincols = math.floor(min(rightbases, leftbases) * minoverlap)

            # Plain comparisons replace "x in range(lo, hi + 1)", which built
            # a list of hi - lo + 1 integers per test under Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo

            if overlap < mincols:
                overlap_not_met += 1
                continue

            # check strand
            strandMatched = (
                STRAND_INTEGER_VALUES[interval.strand]
                * STRAND_INTEGER_VALUES[item.other[rightStrandCol]]
            )
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria but only permissive match found
            )
            if rejected:
                overlap_not_met += 1
                continue

            # strand criteria met
            item.visited = True
            yield getSelectedColumns(interval.fields, item.other, outColumns)

        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            yield getSelectedColumns(interval.fields, rightlen, outColumns)

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            yield getSelectedColumns(leftlen, item.other, outColumns)
def add(self, chrom, start, end, val):
    """Record ``val`` for ``(start, end)`` in the tree kept for ``chrom``.

    A new IntervalTree is created and stored in ``self.chroms`` the first
    time a chromosome is seen.
    """
    trees = self.chroms
    if chrom in trees:
        target = trees[chrom]
    else:
        target = IntervalTree()
        trees[chrom] = target
    target.add(start, end, val)
def createIntervalTreesFragmentFile(options):
    '''
        creates one interval tree for quick lookups

        Consecutive lines of options.genomeFragmentFile (tab separated:
        chromosome, start, end) that belong to the same chromosome are merged
        into one fragment until options.fragmentAggregation lines have been
        collected. Empty lines and lines starting with "Genome" or
        "Chromosome" are ignored. Chromosomes not matching a non-empty
        options.chromPattern are left out.

        returns a list of
            fragmentsMap[fragmentId] = tuple(chrom, start, end)
            intersect_tree - intersect Tree for interval matching
            fragmentsCount - number of fragments stored
            fragmentsChrom[chrom] = tuple(firstFragmentId, lastFragmentId + 1)
    '''

    if (options.verbose):
        print >> sys.stdout, "- %s START   : populate intervaltree from fragmented genome" % (timeStamp())

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {}  # lookup table for fragment ranges of a chromosome
    fragmentsStart = 0

    # state of the fragment currently being assembled
    start = 0
    end = 0
    counter = 0
    chrom = ""

    for line in fileinput.input([options.genomeFragmentFile]):
        line = line.strip()
        if (len(line) == 0 or line.startswith("Genome") or line.startswith("Chromosome")):
            continue

        cols = line.split("\t")
        try:
            # check if chromosome changed from last
            if (cols[0] != chrom):
                # do we have to finish the last chromosome?
                if (end > 0):
                    interval = Interval(chrom, start, end)
                    intersect_tree.insert(interval, fragmentsCount)
                    fragmentsMap[fragmentsCount] = tuple([chrom, start, end])
                    fragmentsCount += 1
                    fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])
                    fragmentsStart = fragmentsCount
                    if (options.vverbose):
                        print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

                # check if chromosome needs to be filtered out or not
                if (options.chromPattern != "" and not re.match(options.chromPattern, cols[0])):
                    # end == 0 marks "nothing pending", so filtered lines add no fragment
                    chrom = ""
                    start = 0
                    end = 0
                else:
                    chrom = cols[0]
                    start = int(cols[1])
                    end = int(cols[2])
                counter = 0

            # check if fragment aggregation is fulfilled
            elif (counter >= options.fragmentAggregation):
                interval = Interval(chrom, start, end)
                intersect_tree.insert(interval, fragmentsCount)
                if (options.vverbose):
                    print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

                fragmentsMap[fragmentsCount] = tuple([chrom, start, end])
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
                fragmentsCount += 1
            else:
                # extend the pending fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not intercepted here.
        except Exception:
            if (options.verbose):
                print >> sys.stderr, 'skipping line in options.genomeFragmentFile: %s' % (line)
            if (options.vverbose):
                traceback.print_exc()
            # NOTE(review): the original indentation was lost; the exit is
            # assumed to apply to every failed line, not only in vverbose mode.
            sys.exit(1)

    # handle last fragment
    if (end > 0):
        interval = Interval(chrom, start, end)
        intersect_tree.insert(interval, fragmentsCount)
        fragmentsMap[fragmentsCount] = tuple([chrom, start, end])
        fragmentsCount += 1
        fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])
        if (options.vverbose):
            print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

    if (options.verbose):
        print >> sys.stdout, "- %s FINISHED: intervaltree populated" % (timeStamp())

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
class Document(HString):
    """A text with typed annotations, indexed by character span in an IntervalTree."""

    def __init__(self, content, doc_id=None, language=lng.ENGLISH, preprocessors=None):
        """Create a document over ``content``.

        :param content: the raw text; its length is the end of the document span.
        :param doc_id: document identifier; ``None`` (the default) generates a
            fresh ``rand_id(10)`` per instance. The former default
            ``rand_id()`` was evaluated once at definition time, so all
            documents created without an id shared it.
        :param language: value stored under the ``LANGUAGE`` attribute.
        :param preprocessors: when truthy, the stored content is
            ``preprocess(content, preprocessors)``.
        """
        # The base class receives this object itself plus the span [0, len(content)).
        super().__init__(self, 0, len(content))
        self._content = preprocess(content, preprocessors) if preprocessors else content
        self._annotations = IntervalTree()
        self._doc_id = rand_id(10) if doc_id is None else doc_id
        self._completed = {}
        self._next_id = 0
        self[LANGUAGE] = language
        self._aid_dict = {}

    @property
    def content(self) -> str:
        """The stored (possibly preprocessed) text."""
        return self._content

    @property
    def doc_id(self):
        """The identifier of this document."""
        return self._doc_id

    def annotation(self, annotation_type, start=None, end=None) -> typing.List[Annotation]:
        """Return the sorted annotations of a type, optionally limited to a span.

        :param annotation_type: type name, compared case-insensitively; a
            falsy value selects annotations of every type.
        :param start: span start; the whole document is searched unless both
            ``start`` and ``end`` are given.
        :param end: span end.
        :returns: sorted list of matching annotations (never the document
            itself); an empty list when the tree lookup raises.
        """
        try:
            if end is None or start is None:
                anno_iter = self._annotations.find(Interval(0, self.end))
            else:
                anno_iter = filter(
                    lambda x: x.data.overlaps(Span(start, end)),
                    self._annotations.find(Interval(start, end)))
        except Exception:
            # Best effort: a failed lookup means "no annotations". Exception
            # (not a bare except) keeps KeyboardInterrupt/SystemExit alive.
            return []
        if annotation_type:
            annotation_type = annotation_type.lower()
            return sorted([
                x.data for x in anno_iter
                if x.data.annotation_type.lower() == annotation_type and x.data != self
            ])
        return sorted([x.data for x in anno_iter if x.data != self])

    def annotation_by_id(self, annotation_id: int):
        """Return the annotation registered under ``annotation_id``, or ``None``."""
        return self._aid_dict.get(annotation_id)

    def previous_annotation(self, annotation: Annotation, annotation_type: str = None) -> 'Annotation':
        """Return the last annotation found between -1 and ``annotation.start``.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty detached Annotation of that type is returned.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=-1, end=annotation.start)
        if len(a) == 0:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[-1]

    def next_annotation(self, annotation: Annotation, annotation_type: str = None) -> 'Annotation':
        """Return the first annotation found between ``annotation.end`` and the document end.

        ``annotation_type`` defaults to the type of ``annotation``. When
        nothing is found, an empty detached Annotation of that type is returned.
        """
        if not annotation_type:
            annotation_type = annotation.annotation_type
        a = self.annotation(annotation_type, start=annotation.end, end=self.end)
        if len(a) == 0:
            return Annotation(None, 0, 0, annotation_type, [])
        return a[0]

    def create_annotation(self, type: str, start: int, end: int, attributes=None) -> Annotation:
        """Create, index and return a new annotation with the next free id.

        :param type: annotation type name.
        :param start: span start.
        :param end: span end.
        :param attributes: attributes handed to the Annotation; defaults to
            an empty list.
        """
        if attributes is None:
            attributes = []
        annotation = Annotation(self, start, end, type, attributes, self._next_id)
        self._next_id += 1
        self._annotations.insert(Interval(annotation.start, annotation.end, annotation))
        self._aid_dict[annotation.annotation_id] = annotation
        return annotation

    def annotate(self, *args):
        """Run the language's annotator for every requested type not yet completed.

        :raises Exception: when the document language has no annotator for a
            requested type.
        """
        for arg in args:
            if arg in self._completed:
                continue
            self.language().load()
            annotator = self.language().get_annotator(arg)
            if annotator:
                annotator.annotate(self)
                self._completed[arg] = '1.0'
            else:
                raise Exception("No annotator for {} annotations in {}".format(
                    arg, self.language()))

    def language(self):
        """Return the ``LANGUAGE`` attribute, or ``lng.UNKNOWN`` when unset."""
        if LANGUAGE in self.attributes:
            return self.attributes[LANGUAGE]
        return lng.UNKNOWN

    @staticmethod
    def from_spacy(parsed):
        """Build a Document from a spaCy parse.

        Creates token, entity, sentence and noun-chunk annotations from
        ``parsed`` and returns the new document (the document used to be
        built and then discarded, so callers received ``None``).
        """
        document = Document(content=str(parsed))
        for token in parsed:
            if token.lemma_.strip() != "":
                t = document.create_annotation(
                    "token", token.idx, token.idx + len(token),
                    [(type.INDEX, token.i), (type.LEMMA, token.lemma_),
                     ("prob", token.prob),
                     (type.PART_OF_SPEECH, PartOfSpeech.of(token.tag_))])
                # a token that is its own head has no dependency target
                if token.head is token:
                    head_idx = None
                else:
                    head_idx = token.head.i
                # "is not None": index 0 is a valid head and must not be dropped
                if head_idx is not None:
                    t.add_relation(target=head_idx, type="dep", relation=token.dep_)
        for entity in parsed.ents:
            document.create_annotation(type.ENTITY, entity.start_char,
                                       entity.end_char,
                                       [(type.ENTITY_TYPE, entity.label_)])
        for i, sentence in enumerate(parsed.sents):
            document.create_annotation(type.SENTENCE, sentence.start_char,
                                       sentence.end_char, [(type.INDEX, i)])
        for np in parsed.noun_chunks:
            document.create_annotation(
                type.PHRASE_CHUNK, np.start_char, np.end_char,
                [(type.PART_OF_SPEECH, PennTreebank.NP)])
        return document

    @staticmethod
    def from_json(json_str):
        """Build a Document from the JSON text produced by ``to_json``."""
        doc = Document(content='')
        doc.__read_json(json.loads(json_str))
        return doc

    def __getstate__(self):
        """Pickle support: the state is the dictionary form of the document."""
        return self.to_dict()

    def __setstate__(self, state):
        """Pickle support: rebuild the document from its dictionary form."""
        self.__read_json(state)

    def __read_json(self, obj):
        """Re-initialise this document from a dictionary as produced by ``to_dict``."""
        self.__init__(content=obj['content'])
        self._doc_id = obj.get('id', self._doc_id)
        for (k, v) in obj.get("attributes", {}).items():
            self[k] = get_decoder(k)(v)
        for (k, v) in obj.get('completed', {}).items():
            self._completed[k] = v
        max_id = -1
        for annotation in obj.get("annotations", []):
            ann = Annotation(
                document=self,
                start=annotation["start"],
                end=annotation["end"],
                annotation_type=annotation["type"],
                attributes=[
                    (k, get_decoder(k)(v))
                    for k, v in annotation.get("attributes", {}).items()
                ],
                annotation_id=annotation["id"])
            max_id = max(max_id, ann.annotation_id)
            self._annotations.add(ann.start, ann.end, ann)
            # keep the id lookup in step with create_annotation, so that
            # annotation_by_id also works on loaded documents
            self._aid_dict[ann.annotation_id] = ann
            for rel in annotation.get("relations", []):
                ann.add_relation(target=rel["target"], type=rel["type"],
                                 relation=rel["value"])
        # NOTE(review): the original indentation was lost; load() is assumed
        # to run once after all annotations are restored.
        self.language().load()
        self._next_id = max_id + 1

    def to_json(self) -> str:
        """Serialise the document to a JSON string."""
        return json.dumps(self.to_dict(), default=default)

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return id, content, attributes, completed types and all annotations as a dict."""
        return {
            "id": self._doc_id,
            "content": self.content,
            "attributes": self._attributes,
            "completed": self._completed,
            "annotations": [a.as_dict() for a in self.annotation(annotation_type=None)],
        }
def setUp(self):
    """Fill ``self.tree4`` with two intervals that share the span (22, 33)."""
    tree = IntervalTree()
    for label in ('example1', 'example2'):
        # identical coordinates, distinct data
        tree.insert(Interval(22, 33, data=label))
    self.tree4 = tree
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from itertools import groupby
from operator import itemgetter
import numpy as np

# create list of tuple intervals for each ccr, pli, and missense z interval and a tree of those intervals
# search each tree with both lists

# The three inputs are taken from the command line, in this order.
# NOTE(review): the files are opened in binary mode while the lines are later
# split on a str tab; under Python 3 that raises TypeError - confirm the
# interpreter this script targets. The example paths end in .gz but plain
# open() does not decompress - verify against how the script is invoked.
ccrs = open(sys.argv[1], "rb") #exacresiduals/gnomad10x.5syn-ccrs.bed.gz
mpcs = open(sys.argv[2], "rb") #essentials/mpc.regions.clean.sorted.bed.gz
plis = open(sys.argv[3], "rb") #$HOME/software/pathoscore/score-sets/GRCh37/pLI/pLI.bed.gz

# generate data: we want to search the trees with the lists to get the numbers in each venn diagram
# one tree / one list per key, created on first access
ccrtree = defaultdict(lambda: IntervalTree())
ccrlist = defaultdict(list)
# rows are sorted and grouped by the same three columns (0, 3 and 6),
# as groupby requires
sorter = itemgetter(0, 3, 6)
grouper = itemgetter(0, 3, 6)
ccrtemp = []
ccrgenes = set()
pligenes = set()
mpcgenes = set()
ccr99genes = set()
for ccr in ccrs:
    ccr = ccr.strip().split("\t")
    # keep only rows whose last column is at least 95
    if float(ccr[-1]) < 95:
        continue
    ccrtemp.append(ccr)
for key, grp in groupby(sorted(ccrtemp, key=sorter), grouper):
    grp = list(grp)
    # column 0 of the first row of the group
    chrom = grp[0][0]
def createIntervalTreesFragmentFile():
    '''
        creates one interval tree for quick lookups

        Consecutive lines of options.fragmentFile (tab separated: chromosome,
        start, end) that belong to the same chromosome are merged into one
        fragment until options.fragmentAggregation lines have been collected.
        Empty lines and lines starting with "Genome" or "Chromosome" are
        ignored. Chromosomes not matching a non-empty options.chromPattern
        are left out.

        returns a list of
            fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
            intersect_tree - intersect Tree for interval matching
            fragmentsCount - number of fragments stored
            fragmentsChrom[chrom] = tuple(firstFragmentId, lastFragmentId + 1)
    '''

    if (options.verbose):
        print >> sys.stdout, "- %s START   : populate intervaltree from fragmented genome" % (timeStamp())

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}
    fragmentsChrom = {}  # lookup table for fragment ranges of a chromosome
    fragmentsStart = 0

    # state of the fragment currently being assembled
    start = 0
    end = 0
    counter = 0
    chrom = ""

    for line in fileinput.input([options.fragmentFile]):
        line = line.strip()
        if (len(line) == 0 or line.startswith("Genome") or line.startswith("Chromosome")):
            continue

        cols = line.split("\t")
        try:
            # check if chromosome changed from last
            if (cols[0] != chrom):
                # do we have to finish the last chromosome?
                if (end > 0):
                    interval = Interval(chrom, start, end)
                    intersect_tree.insert(interval, fragmentsCount)
                    fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5*(start+end))])
                    fragmentsCount += 1
                    fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])
                    fragmentsStart = fragmentsCount
                    if (options.vverbose):
                        print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

                # check if chromosome needs to be filtered out or not
                if (options.chromPattern != "" and not re.match(options.chromPattern, cols[0])):
                    # end == 0 marks "nothing pending", so filtered lines add no fragment
                    chrom = ""
                    start = 0
                    end = 0
                else:
                    chrom = cols[0]
                    start = int(cols[1])
                    end = int(cols[2])
                counter = 0

            # check if fragment aggregation is fulfilled
            elif (counter >= options.fragmentAggregation):
                interval = Interval(chrom, start, end)
                intersect_tree.insert(interval, fragmentsCount)
                if (options.vverbose):
                    print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

                fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5*(start+end))])
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
                fragmentsCount += 1
            else:
                # extend the pending fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not intercepted here.
        except Exception:
            if (options.verbose):
                print >> sys.stderr, 'skipping line in options.fragmentFile: %s' % (line)
            if (options.vverbose):
                traceback.print_exc()
            # NOTE(review): the original indentation was lost; the exit is
            # assumed to apply to every failed line, not only in vverbose mode.
            sys.exit(1)

    # handle last fragment
    if (end > 0):
        interval = Interval(chrom, start, end)
        intersect_tree.insert(interval, fragmentsCount)
        fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5*(start+end))])
        fragmentsCount += 1
        fragmentsChrom[chrom] = tuple([fragmentsStart, fragmentsCount])
        if (options.vverbose):
            print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

    if (options.verbose):
        print >> sys.stdout, "- %s FINISHED: intervaltree populated" % (timeStamp())

    return [fragmentsMap, intersect_tree, fragmentsCount, fragmentsChrom]
def getpairs(leftSet,
             rightSet,
             leftCol,
             mincols=1,
             asfraction=False,
             matchStrand=STRAND_NEUTRAL,
             skipChrNames=True,
             skipStrandNames=True):
    """Pair the words of overlapping right intervals with a term of the left interval.

    ``leftSet`` is read completely into an IntervalTree; ``rightSet`` is then
    streamed against it. For every accepted overlap, each non-numeric word
    found in the retained columns of the right interval is yielded together
    with column ``leftCol`` of the left interval.

    :param leftSet: iterable exposing ``linenum``; only items whose type is
        exactly ``GenomicInterval`` are indexed.
    :param rightSet: iterable of items; items whose type is not exactly
        ``GenomicInterval`` are yielded unchanged.
    :param leftCol: index into the left interval's fields giving the term.
    :param mincols: minimum overlap in bases, or - with ``asfraction`` - the
        fraction of the shorter of the two intervals that must overlap.
    :param asfraction: interpret ``mincols`` as a fraction (see above).
    :param matchStrand: strand policy: a positive value rejects pairs whose
        strand product is -1, a negative value rejects a product of 1, and
        a value outside [-1, 1] also rejects a product of 0.
    :param skipChrNames: drop the chromosome column of the right interval.
    :param skipStrandNames: drop the strand column of the right interval.
    :returns: generator of ``[word, leftTerm]`` lists and passed-through items.
    """
    # Read leftSet into memory:
    rightlen = 0
    leftStrandCol = -1
    minoverlap = mincols
    leftTree = IntervalTree()
    rightCols = list()
    for item in leftSet:
        if type(item) is GenomicInterval:
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    for interval in rightSet:
        if rightlen == 0 and type(interval) is GenomicInterval:
            rightlen = interval.nfields
            # list(...) because a Python 3 range object has no remove()
            rightCols = list(range(rightlen))
            # remove the useless columns
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)
        if not (type(interval) is GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        leftTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        rightbases = hi - lo
        for item in result:
            leftbases = item.end - item.start
            if asfraction:
                # threshold relative to the shorter interval of the pair
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                # The original second clause tested item.end > interval.end,
                # which can never hold here; for intervals with
                # start <= end the result is unchanged.
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo

            if overlap < mincols:
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[leftStrandCol]]
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria but only permissive match found
            )
            if rejected:
                continue

            # strand criteria met
            item.visited = True
            leftTerm = item.other[leftCol]
            for col in rightCols:
                # take each field that's not a number
                # split it on semicolons, commas, and spaces
                # output the word and the leftTerm as being associated
                curcol = interval.fields[col]
                lexer = shlex.shlex(curcol)
                # same characters as before (tab, CR, LF, backslash, comma,
                # semicolon), written without invalid escape sequences
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'
                # "word" rather than "item": the tree node must not be shadowed
                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        yield [word, leftTerm]
def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
    """Join two interval streams on overlap, yielding one output row per pair.

    ``rightSet`` is read completely into an IntervalTree; ``leftSet`` is then
    streamed against it.

    :param leftSet: iterable of items; items that are not ``GenomicInterval``
        instances are yielded unchanged.
    :param rightSet: iterable of items exposing ``linenum``; only its
        ``GenomicInterval`` items are indexed.
    :param mincols: minimum overlap (in bases) for a pair to be reported.
    :param leftfill: when true, right intervals never paired are yielded at
        the end, preceded by one "." per left column.
    :param rightfill: when true, a left interval without an accepted partner
        is yielded followed by one "." per right column.
    :returns: generator of lists (joined rows) and passed-through items.
    """
    # Read rightSet into memory:
    rightlen = 0
    leftlen = 0
    rightTree = IntervalTree()
    for item in rightSet:
        if isinstance(item, GenomicInterval):
            rightTree.insert(item, rightSet.linenum, item.fields)
            if rightlen == 0:
                rightlen = item.nfields

    for interval in leftSet:
        if leftlen == 0 and isinstance(interval, GenomicInterval):
            leftlen = interval.nfields
        if not isinstance(interval, GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        rightTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        overlap_not_met = 0
        for item in result:
            # Plain comparisons replace "x in range(lo, hi + 1)", which built
            # a list of hi - lo + 1 integers per test under Python 2.
            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo
            if overlap < mincols:
                overlap_not_met += 1
                continue
            outfields = list(interval)
            # extend() instead of map(outfields.append, ...): map is lazy on
            # Python 3, so the right-hand columns were silently dropped there.
            outfields.extend(item.other)
            item.visited = True
            yield outfields
        if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
            outfields = list(interval)
            outfields.extend(["."] * rightlen)
            yield outfields

    if leftfill:
        unvisited = []

        def report_unvisited(node):
            if not hasattr(node, "visited"):
                unvisited.append(node)

        rightTree.traverse(report_unvisited)
        for item in unvisited:
            outfields = ["."] * leftlen
            outfields.extend(item.other)
            yield outfields
# Benchmark: time how long it takes to load all background intervals into an
# IntervalTree and write the elapsed time to the first snakemake output file.
b = snakemake.input.background

# Only the columns at positions 1 and 2 of the tab-separated file are read
# and named Start / End.
background = pd.read_table(b,
                           sep="\t",
                           usecols=[1, 2],
                           header=None,
                           names="Start End".split(),
                           engine="c",
                           dtype={
                               "Start": np.int32,
                               "End": np.int32
                           })

start = time()

tree = IntervalTree()
for start_, end_ in zip(background.Start, background.End):
    tree.add(start_, end_)

end = time()

total = end - start

# The elapsed seconds are pushed through a datetime only to format them as
# minutes.seconds.microseconds.
# NOTE(review): fromtimestamp() applies the local time zone, so the minute
# field can be shifted in zones with a non-whole-hour offset; "%-M"/"%-S" are
# platform-specific strftime extensions - confirm the target platform.
total_dt = datetime.datetime.fromtimestamp(total)

minutes_seconds = total_dt.strftime('%-M.%-S.%f')

# "with" guarantees the result is flushed and the handle closed.
with open(snakemake.output[0], "w+") as out_handle:
    out_handle.write(minutes_seconds)
def createIntervalTrees():
    '''
        creates one interval tree for quick lookups

        Consecutive lines of options.fragmentFile (tab separated: chromosome,
        start, end) that belong to the same chromosome are merged into one
        fragment until options.fragmentAggregation lines have been collected.
        Empty lines and lines starting with "Genome" or "Chromosome" are
        ignored; lines that cannot be parsed are skipped.

        returns a list of
            fragmentsMap[fragmentId] = tuple(chrom, fragmentMidPoint)
            intersect_tree - intersect Tree for interval matching
    '''

    if (options.verbose):
        print >> sys.stdout, "- %s START   : populate intervaltree from fragmented genome" % (timeStamp())

    intersect_tree = IntervalTree()
    fragmentsCount = 0
    fragmentsMap = {}

    # state of the fragment currently being assembled
    start = 0
    end = 0
    counter = 0
    chrom = ""

    for line in fileinput.input([options.fragmentFile]):
        line = line.strip()
        if (len(line) == 0 or line.startswith("Genome") or line.startswith("Chromosome")):
            continue

        cols = line.split("\t")
        try:
            # check if chromosome changed from last
            if (cols[0] != chrom):
                # do we have to finish the last chromosome?
                if (end > 0):
                    interval = Interval(chrom, start, end)
                    intersect_tree.insert(interval, fragmentsCount)
                    fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5 * (start + end))])
                    fragmentsCount += 1
                    if (options.vverbose):
                        print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)
                chrom = cols[0]
                start = int(cols[1])
                end = int(cols[2])
                counter = 0

            # check if fragment aggregation is fulfilled
            elif (counter >= options.fragmentAggregation):
                interval = Interval(chrom, start, end)
                intersect_tree.insert(interval, fragmentsCount)
                if (options.vverbose):
                    print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

                fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5 * (start + end))])
                start = int(cols[1])
                end = int(cols[2])
                counter = 0
                fragmentsCount += 1
            else:
                # extend the pending fragment
                end = int(cols[2])

            # increment counter
            counter += 1

        # Exception instead of a bare except: a bare except also caught
        # KeyboardInterrupt, so Ctrl-C only skipped one line and the loop
        # went on.
        except Exception:
            if (options.verbose):
                print >> sys.stderr, 'skipping line in options.fragmentFile: %s' % (line)
            if (options.vverbose):
                traceback.print_exc()

    # handle last fragment
    if (end > 0):
        interval = Interval(chrom, start, end)
        intersect_tree.insert(interval, fragmentsCount)
        fragmentsMap[fragmentsCount] = tuple([chrom, int(0.5 * (start + end))])
        fragmentsCount += 1
        if (options.vverbose):
            print >> sys.stdout, "-- intervaltree.add %s:%d-%d" % (chrom, start, end)

    if (options.verbose):
        print >> sys.stdout, "- %s FINISHED: intervaltree populated" % (timeStamp())

    return [fragmentsMap, intersect_tree]
def getpairs(leftSet, rightSet, leftCol, mincols=1, asfraction=False, matchStrand=STRAND_NEUTRAL, skipChrNames=True, skipStrandNames=True):
    """Pair the words of overlapping right intervals with a term of the left interval.

    ``leftSet`` is read completely into an IntervalTree; ``rightSet`` is then
    streamed against it. For every accepted overlap, each non-numeric word
    found in the retained columns of the right interval is yielded together
    with column ``leftCol`` of the left interval.

    :param leftSet: iterable exposing ``linenum``; only items whose type is
        exactly ``GenomicInterval`` are indexed.
    :param rightSet: iterable of items; items whose type is not exactly
        ``GenomicInterval`` are yielded unchanged.
    :param leftCol: index into the left interval's fields giving the term.
    :param mincols: minimum overlap in bases, or - with ``asfraction`` - the
        fraction of the shorter of the two intervals that must overlap.
    :param asfraction: interpret ``mincols`` as a fraction (see above).
    :param matchStrand: strand policy: a positive value rejects pairs whose
        strand product is -1, a negative value rejects a product of 1, and
        a value outside [-1, 1] also rejects a product of 0.
    :param skipChrNames: drop the chromosome column of the right interval.
    :param skipStrandNames: drop the strand column of the right interval.
    :returns: generator of ``[word, leftTerm]`` lists and passed-through items.
    """
    # Read leftSet into memory:
    rightlen = 0
    leftStrandCol = -1
    minoverlap = mincols
    leftTree = IntervalTree()
    rightCols = list()
    for item in leftSet:
        if type(item) is GenomicInterval:
            leftTree.insert(item, leftSet.linenum, item.fields)
            if leftStrandCol == -1:
                leftStrandCol = item.strand_col

    for interval in rightSet:
        if rightlen == 0 and type(interval) is GenomicInterval:
            rightlen = interval.nfields
            # list(...) because a Python 3 range object has no remove()
            rightCols = list(range(rightlen))
            # remove the useless columns
            rightCols.remove(interval.start_col)
            rightCols.remove(interval.end_col)
            if skipChrNames:
                rightCols.remove(interval.chrom_col)
            if skipStrandNames:
                rightCols.remove(interval.strand_col)
        if not (type(interval) is GenomicInterval):
            yield interval
            continue

        result = []
        # presumably intersect() calls the callback once per stored node
        # overlapping ``interval`` - confirm against the tree implementation
        leftTree.intersect(interval, result.append)

        lo, hi = interval.start, interval.end
        rightbases = hi - lo
        for item in result:
            leftbases = item.end - item.start
            if asfraction:
                # threshold relative to the shorter interval of the pair
                mincols = math.floor(min(leftbases, rightbases) * minoverlap)

            start_inside = lo <= item.start <= hi
            end_inside = lo <= item.end <= hi
            if start_inside and not end_inside:
                overlap = hi - item.start
            elif end_inside and not start_inside:
                # The original second clause tested item.end > interval.end,
                # which can never hold here; for intervals with
                # start <= end the result is unchanged.
                overlap = item.end - lo
            elif start_inside and end_inside:
                overlap = item.end - item.start
            else:
                # the intersecting item's start and end are outside the interval range
                overlap = hi - lo

            if overlap < mincols:
                continue

            # check strand
            strandMatched = STRAND_INTEGER_VALUES[interval.strand] * STRAND_INTEGER_VALUES[item.other[leftStrandCol]]
            rejected = (
                (strandMatched == -1 and matchStrand > 0)  # needed match but found a complement
                or (strandMatched == 1 and matchStrand < 0)  # needed complement but found a match
                or (strandMatched == 0 and (matchStrand < -1 or matchStrand > 1))  # strict criteria but only permissive match found
            )
            if rejected:
                continue

            # strand criteria met
            item.visited = True
            leftTerm = item.other[leftCol]
            for col in rightCols:
                # take each field that's not a number
                # split it on semicolons, commas, and spaces
                # output the word and the leftTerm as being associated
                curcol = interval.fields[col]
                lexer = shlex.shlex(curcol)
                # same characters as before (tab, CR, LF, backslash, comma,
                # semicolon), written without invalid escape sequences
                lexer.whitespace = '\t\r\n\\,\\;'
                lexer.wordchars += ":'"
                lexer.whitespace_split = True
                lexer.quotes = '"'
                # "word" rather than "item": the tree node must not be shadowed
                for word in lexer:
                    word = word.strip()
                    if word == ".":
                        continue
                    try:
                        float(word)
                    except ValueError:
                        yield [word, leftTerm]