def merge_df_intervals(df, iv_func=lambda iv: iv.merge_hull()):
    """Merge overlapping intervals of a DataFrame {chr, start, stop, *}.

    The rows are turned into tuples by ``_df_to_tup``, grouped by the first
    tuple element and merged within each group by ``iv_func``.  The remaining
    columns (*) of a merged interval are taken from the last input row that
    went into it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``chr``, ``start`` and ``stop``.  If there is no
        ``strand`` column, a temporary one (all 1) is added for the merge
        and removed from the result again.
    iv_func : callable
        Receives an ``IntervalSet`` and returns the ``IntervalSet`` to keep.
        Defaults to ``merge_hull``.

    Returns
    -------
    pandas.DataFrame
        The merged intervals, sorted by ``chr`` and ``start``.  If no group
        is produced (e.g. ``df`` has no rows) an empty frame with the
        columns of ``df`` is returned.
    """
    strand_added = "strand" not in df.columns
    if strand_added:
        df = df.assign(strand=1)
    joined = _df_to_tup(df)
    out = []
    for _group_key, sub_group in itertools.groupby(joined, lambda tup: tup[0]):
        args = [x[1:] for x in sub_group]
        iv = IntervalSet.from_tuples_with_id(args)
        new_order = iv_func(iv).to_tuples_last_id()
        # x[2] is the positional row index that supplies the non-interval columns
        new_df = df.iloc[[x[2] for x in new_order]].copy()
        new_df.loc[:, "start"] = [x[0] for x in new_order]
        new_df.loc[:, "stop"] = [x[1] for x in new_order]
        out.append(new_df)
    # pd.concat raises ValueError on an empty list, so the no-group case
    # needs its own (empty) result.
    res = pd.concat(out) if out else df.iloc[0:0].copy()
    if strand_added:
        res = res.drop("strand", axis=1)
    return res.sort_values(["chr", "start"])
def do_load():
    """Return the regions of every chromosome not covered by ``gr``.

    For each chromosome that has intervals in ``gr.df`` the interval set is
    inverted between 0 and the chromosome length; chromosomes without any
    interval contribute one region spanning their whole length.

    Returns a DataFrame with the columns ``start``, ``stop`` and ``chr`` and
    a fresh 0..n-1 index.
    """
    from mbf_nested_intervals import IntervalSet
    import itertools

    frame = gr.df
    # (chr, start, stop, row number), ordered by chromosome so groupby sees
    # each chromosome exactly once
    rows = sorted(
        (
            (name, begin, end, row_no)
            for row_no, (name, begin, end) in enumerate(
                zip(frame["chr"], frame["start"], frame["stop"])
            )
        ),
        key=lambda row: row[0],
    )
    lengths = gr.genome.get_chromosome_lengths()
    pieces = []
    covered = set()
    for name, members in itertools.groupby(rows, lambda row: row[0]):
        interval_set = IntervalSet.from_tuples_with_id([m[1:] for m in members])
        starts, stops = interval_set.invert(0, lengths[name]).to_numpy()
        pieces.append(pd.DataFrame({"start": starts, "stop": stops, "chr": name}))
        covered.add(name)
    # chromosomes without any interval are uncovered from end to end
    for name in lengths.keys() - covered:
        pieces.append(
            pd.DataFrame({"start": [0], "stop": [lengths[name]], "chr": name})
        )
    combined = pd.concat(pieces)
    return combined.reset_index(drop=True)
def exons_protein_coding_merged(self):
    """Merged exon regions of this gene, restricted to protein coding exons.

    The result is a tuple of numpy arrays, (starts, stops); it is empty for
    genes that are not protein coding.
    """
    coding_exons = IntervalSet.from_tuples(self._exons_protein_coding)
    merged = coding_exons.merge_hull()
    return merged.to_numpy()
def test_invert(self):
    """Inverting (5, 10) between 0 and 15 yields the two flanking gaps."""
    original = IntervalSet.from_tuples([(5, 10)])
    inverted = original.invert(0, 15)
    expected = [(0, 5), (10, 15)]
    assert inverted.to_tuples() == expected
def introns(self):
    """Return [(start, stop), ...] for all introns of the transcript.

    The tuples are in genomic order.  An intron is everything between the
    gene's start and stop that is not an exon of this transcript, so if the
    gene, for whatever reason, extends beyond its exons, that region is
    reported as well.
    """
    lower = self.gene.start
    upper = self.gene.stop
    ordered_exons = list(self.exons_tuples)
    ordered_exons.sort()
    exon_set = IntervalSet.from_tuples(ordered_exons)
    return exon_set.invert(lower, upper).to_tuples()
def test_merge_hull(self):
    """Three mutually overlapping intervals collapse into one hull with all ids."""
    tagged = [
        (1, 10, 100),
        (7, 15, 200),
        (0, 5, 333),
    ]
    merged = IntervalSet.from_tuples_with_id(tagged).merge_hull()
    expected = [(0, 15, [100, 200, 333])]
    assert merged.to_tuples_with_id() == expected
def test_from_tuples(self):
    """from_tuples/to_tuples round trip; (1, 15) is expected before (1, 10)."""
    given = [
        (1, 10),
        (1, 15),
        (0, 5),
    ]
    expected = [
        (0, 5),
        (1, 15),
        (1, 10),
    ]
    interval_set = IntervalSet.from_tuples(given)
    assert interval_set.to_tuples() == expected
def test_from_tuples_with_id2(self):
    """Ids stay attached to their interval and are reported as one-element lists."""
    given = [
        (1, 10, 100),
        (1, 15, 200),
        (0, 5, 333),
    ]
    expected = [
        (0, 5, [333]),
        (1, 15, [200]),
        (1, 10, [100]),
    ]
    interval_set = IntervalSet.from_tuples_with_id(given)
    assert interval_set.to_tuples_with_id() == expected
def introns_strict(self):
    """Get the truly intronic regions, i.e. those not covered by any exon
    of any transcript of this gene.

    The result is a tuple of numpy arrays, (starts, stops).
    By definition these introns are disjoint.

    Raises ValueError if a transcript has no exons defined.
    """
    lower = self.start
    upper = self.stop
    pooled = []
    for tr in self.transcripts:
        try:
            pooled += tr.exons
        except TypeError:  # pragma: no cover
            raise ValueError(f"No exons defined for {tr.transcript_stable_id}")
    exon_set = IntervalSet.from_tuples(pooled)
    return exon_set.invert(lower, upper).to_numpy()
def introns_all(self):
    """Get the intronic regions of every transcript of this gene.

    Each transcript's exons are inverted between the gene's start and stop
    and the results are concatenated, so the output may contain repetitions
    and overlaps and is not sorted.

    Returns a tuple of two lists, (starts, stops).
    Raises ValueError if a transcript has no exons defined.
    """
    lower, upper = self.start, self.stop
    all_starts, all_stops = [], []
    for tr in self.transcripts:
        try:
            inverted = IntervalSet.from_tuples(tr.exons).invert(lower, upper)
            starts, stops = inverted.to_numpy()
        except TypeError:  # pragma: no cover
            raise ValueError(f"No exons defined for {tr.transcript_stable_id}")
        all_starts.extend(starts)
        all_stops.extend(stops)
    return all_starts, all_stops
def merge_df_intervals_with_callback(df, callback):
    """Merge overlapping intervals of a DataFrame {chr, start, stop, *},
    letting ``callback`` decide the content of each merged row.

    The rows are turned into tuples by ``_df_to_tup``, grouped by the first
    tuple element and merged with ``merge_hull``.  ``callback`` is called
    once for every merged group (including groups of a single row) with a
    copy of the group's rows whose ``start``/``stop`` are already set to the
    merged interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``chr``, ``start`` and ``stop``.  If there is no
        ``strand`` column, a temporary one (all 1) is added; the callback
        sees it and must return it, and it is dropped from the result.
    callback : callable
        ``callback(sub_df) -> dict`` with exactly the columns of ``df`` as
        keys.  ``start`` and ``stop`` of the returned dict are overwritten
        with the merged interval.

    Returns
    -------
    pandas.DataFrame
        One row per merged interval, sorted by ``chr`` and ``start``; an
        empty frame with the columns of ``df`` if nothing was merged.

    Raises
    ------
    ValueError
        If ``callback`` returns something other than a dict, or a dict whose
        keys differ from the columns of ``df``.
    """
    strand_added = "strand" not in df.columns
    if strand_added:
        df = df.assign(strand=1)
    joined = _df_to_tup(df)
    expected_columns = set(df.columns)
    result = []
    for _group_key, sub_group in itertools.groupby(joined, lambda tup: tup[0]):
        args = [x[1:] for x in sub_group]
        iv = IntervalSet.from_tuples_with_id(args)
        for start, stop, row_ids in iv.merge_hull().to_tuples_with_id():
            sub_df = df.iloc[list(row_ids)].copy()
            # .at is a scalar-only accessor; assigning a whole column
            # needs .loc
            sub_df.loc[:, "start"] = start
            sub_df.loc[:, "stop"] = stop
            row_data = callback(sub_df)
            if not isinstance(row_data, dict):
                print("type", type(row_data))
                print(callback)
                raise ValueError(
                    "Merge_function returned something other than dict "
                    "(writing to the pandas series directly is very slow, "
                    "call to_dict() on it, then modify it.)"
                )
            if set(row_data.keys()) != expected_columns:
                raise ValueError(
                    "Merge_function return wrong columns. Expected %s, was %s"
                    % (df.columns, list(row_data.keys()))
                )
            row_data["start"] = start
            row_data["stop"] = stop
            result.append(row_data)
    # pd.DataFrame([]) has no columns, so sort_values would fail with a
    # KeyError; build the empty frame with the expected columns instead.
    res = pd.DataFrame(result) if result else pd.DataFrame(columns=df.columns)
    res = res.sort_values(["chr", "start"])
    if strand_added:
        res = res.drop("strand", axis=1)
    return res
def exons_merged(self):
    """Merged exon regions of this gene.

    The result is a tuple of numpy arrays, (starts, stops).
    """
    exon_set = IntervalSet.from_tuples(self._exons)
    return exon_set.merge_hull().to_numpy()
def _get_interval_tuples_by_chr(self, genome):
    """Tag every exon/intron region of ``genome`` per chromosome.

    Each gene contributes its exons (``exons_overlapping``, falling back to
    ``exons_merged`` when that is empty) and its ``introns_strict``.  Every
    interval carries a bit-flag id:

    * ``0b0100`` exon, ``0b1000`` intron
    * ``0b0001`` strand +1, ``0b0010`` strand -1

    Overlapping intervals are merged with ``merge_hull`` and the flags of all
    members are OR-ed together to derive the tag of the merged region.

    Parameters
    ----------
    genome :
        Object providing ``get_chromosome_lengths()`` and ``genes`` (a
        mapping whose values expose ``chr``, ``strand``,
        ``exons_overlapping``, ``exons_merged`` and ``introns_strict``).

    Returns
    -------
    dict
        ``{chr: [(tag, strand, [start], [stop]), ...]}`` where ``tag`` is
        ``"exon"``, ``"intron"`` or ``"both"``, with ``"_undecidable"``
        appended when both strands contribute, and ``strand`` is 1 or -1.
    """
    from mbf_nested_intervals import IntervalSet

    exon_bit, intron_bit = 0b0100, 0b1000
    forward_bit, reverse_bit = 0b0001, 0b0010
    # regions made of intervals that all share one id map directly to a result
    single_flag = {
        exon_bit | forward_bit: ("exon", +1),
        exon_bit | reverse_bit: ("exon", -1),
        intron_bit | forward_bit: ("intron", +1),
        intron_bit | reverse_bit: ("intron", -1),
    }

    coll = {chr_name: [] for chr_name in genome.get_chromosome_lengths()}
    for g in genome.genes.values():
        strand_bit = forward_bit if g.strand == 1 else reverse_bit
        exons = g.exons_overlapping
        if len(exons[0]) == 0:  # pragma: no cover
            exons = g.exons_merged
        for start, stop in zip(*exons):
            coll[g.chr].append((start, stop, exon_bit | strand_bit))
        for start, stop in zip(*g.introns_strict):
            coll[g.chr].append((start, stop, intron_bit | strand_bit))

    result = {}
    for chr_name, tups in coll.items():
        iset = IntervalSet.from_tuples_with_id(tups).merge_hull()
        if iset.any_overlapping():
            raise NotImplementedError("Should not be reached")
        result[chr_name] = []
        for start, stop, ids in iset.to_tuples_with_id():
            ids = set(ids)
            if len(ids) == 1:
                (only_flag,) = ids
                known = single_flag.get(only_flag)
                if known is None:  # pragma: no cover
                    raise NotImplementedError("Should not be reached")
                tag, strand = known
            else:
                down = 0
                for flag in ids:
                    down |= flag
                if (down & 0b1100) == 0b1100:
                    tag = "both"
                elif (down & exon_bit) == exon_bit:
                    tag = "exon"
                else:  # pragma: no cover haven't observed this case in the wild yet.
                    tag = "intron"
                if (down & 0b11) == 0b11:
                    tag += "_undecidable"
                    strand = 1  # doesn't matter, but must be one or the other
                elif down & forward_bit:
                    strand = 1
                else:
                    # only the reverse bit is set; assign (not decrement) so
                    # the value never depends on the previous interval
                    strand = -1
            result[chr_name].append((tag, strand, [start], [stop]))
    return result