def main(args):
    """Write bigwig tracks for the treatment files and, if given, the control files.

    Parameters
    ----------
    args : dict
        Options mapping. ``"treatment"`` and ``"chromsizes_"`` are required.
        ``"control"`` is optional. The output options ``"bigwig"``,
        ``"individual_log2fc_bigwigs"``, ``"log2fc_bigwig"``,
        ``"chip_bigwig"`` and ``"input_bigwig"`` are each either a falsy
        value (output not requested) or an output path.

    Raises
    ------
    Exception
        If an output that needs control data is requested but no control
        files were given.
    """
    # TODO: need to create coverage of file if raw
    #       else cluster Scores of binned
    control_dependent = ("individual_log2fc_bigwigs", "input_bigwig",
                         "log2fc_bigwig")
    # Look at the option *values*: the keys are normally always present (they
    # are read below), so a plain membership test would demand control data
    # even when none of these outputs was requested.
    requires_control = any(args.get(name) for name in control_dependent)
    has_control = args.get("control")

    if requires_control and not has_control:
        raise Exception("Missing control data!")

    treatment_ranges = files_to_coverage(args["treatment"], args)

    control_ranges = {}
    control_sum = None
    if has_control:
        control_ranges = files_to_coverage(args["control"], args)
        control_sum = pr.concat(control_ranges.values())

    treatment_sum = pr.concat(treatment_ranges.values())
    chromsizes = args["chromsizes_"]

    if args.get("bigwig"):
        path = args["bigwig"]
        _create_path(path)

        # One bigwig per input file: treatment first, then control (if any).
        per_file = list(treatment_ranges.items())
        if has_control:
            per_file += list(control_ranges.items())

        for name, ranges in per_file:
            _basename = splitext(basename(name))[0]
            bw_name = join(path, _basename + ".bw")
            ranges.to_bigwig(bw_name, chromsizes)

    if args.get("individual_log2fc_bigwigs"):
        path = args["individual_log2fc_bigwigs"]
        _create_path(path)

        for name, ranges in treatment_ranges.items():
            _basename = splitext(basename(name))[0]
            bw_name = join(path, _basename + "_log2fc.bw")
            ranges.to_bigwig(bw_name, chromsizes, divide_by=control_sum)

    if args.get("log2fc_bigwig"):
        path = args["log2fc_bigwig"]
        _create_path(dirname(path))
        treatment_sum.to_bigwig(path, chromsizes, divide_by=control_sum)

    if args.get("chip_bigwig"):
        path = args["chip_bigwig"]
        _create_path(dirname(path))
        treatment_sum.to_bigwig(path, chromsizes)

    if args.get("input_bigwig"):
        path = args["input_bigwig"]
        _create_path(dirname(path))
        control_sum.to_bigwig(path, chromsizes)
def main(args):
    """Build coverage for the treatment (and optional control) files and print it.

    ``args`` is read for the keys ``"treatment"`` and ``"control"`` and is
    also handed on unchanged to ``files_to_coverage``.
    """
    # TODO: need to create coverage of file if raw
    #       else cluster Scores of binned
    treatment_ranges = files_to_coverage(args["treatment"], args)
    print(treatment_ranges)

    control_files = args.get("control")
    if control_files:
        control_ranges = files_to_coverage(args["control"], args)
        control_sum = pr.concat(control_ranges.values())

    treatment_sum = pr.concat(treatment_ranges.values())
    print(treatment_sum)
def test_cluster_by(gr, strand):
    """Compare ``cluster(by="ID")`` with clustering every ID group on its own.

    The expected frame is built by grouping on chromosome (and strand, when
    ``strand`` is truthy) plus ID, clustering each group separately and then
    overwriting ``Cluster`` with a running number starting at 1.
    """
    result = gr.cluster(by="ID", strand=strand).df
    print(result)

    df = gr.df
    groupby = ["Chromosome", "Strand", "ID"] if strand else ["Chromosome", "ID"]

    grs = [pr.PyRanges(gdf) for _, gdf in natsorted(df.groupby(groupby))]
    clusters = [group.cluster(strand=strand) for group in grs]

    new_clusters = []
    for cluster_number, c in enumerate(clusters, start=1):
        print("c")
        print(c)
        c.Cluster = cluster_number
        new_clusters.append(c)

    expected = pr.concat(new_clusters).df
    expected.loc[:, "Cluster"] = expected.Cluster.astype(np.int32)

    print(expected)
    print(result)

    assert_df_equal(result, expected)
def set_union(self, other, **kwargs):
    """Concatenate ``self`` and ``other`` and merge the combined intervals.

    The merge is strand-aware exactly when the (filled-in) ``strandedness``
    keyword is truthy; all filled-in keywords are forwarded to ``merge``.
    """
    kwargs = fill_kwargs(kwargs)
    strand = bool(kwargs["strandedness"])

    combined = pr.concat([self, other], strand)
    return combined.merge(strand=strand, **kwargs)
def unstrand(self):
    """Return the ranges without their ``Strand`` column.

    An object that is not stranded is returned as is.
    """
    if self.stranded:
        both_strands = pr.concat([self["+"], self["-"]])
        return both_strands.apply(lambda df: df.drop("Strand", axis=1))

    return self
def unstrand(self):
    """Return the ranges with the ``Strand`` column dropped.

    An object that is not stranded is returned unchanged.
    """
    if self.stranded:
        plus_and_minus = [self["+"], self["-"]]
        return pr.concat(plus_and_minus).drop("Strand", drop_strand=True)

    return self
def unstrand(self):
    """Return a new PyRanges built from the strand-less frames of ``self``.

    An object that is not stranded is returned unchanged.
    """
    if self.stranded:

        def strip_strand(df):
            # Drop the column and renumber the rows from zero.
            return df.drop("Strand", axis=1).reset_index(drop=True)

        stripped = pr.concat([self["+"], self["-"]]).apply(strip_strand)
        return pr.PyRanges(stripped.dfs)

    return self
def assert_equal_length_before_after(gr1, gr2):
    """Assert that concatenation keeps every row of both inputs.

    Also asserts that the concatenation is unstranded whenever at least one
    of the inputs is unstranded.
    """
    print("in test")

    expected_length = len(gr1) + len(gr2)
    c = pr.concat([gr1, gr2])

    both_stranded = gr1.stranded and gr2.stranded
    if not both_stranded:
        assert not c.stranded

    assert expected_length == len(c)
def update_pr(changed_id, removed_id):
    """Rebuild the global ``ERVs`` after ``removed_id`` was merged into ``changed_id``.

    Rows whose ID is either of the two are removed from ``ERVs`` and a fresh
    row built from ``elements[changed_id]`` is appended.
    """
    global ERVs

    element = elements[changed_id]
    new_elem = element.span().pr()
    new_elem.ID = changed_id
    new_elem.Struct = element.meta_str()

    print("Merging {} into {}".format(removed_id, changed_id))

    erv_table = ERVs.df
    is_stale = erv_table['ID'].isin([changed_id, removed_id])
    untouched = pr.PyRanges(erv_table.loc[~is_stale])

    ERVs = pr.concat([untouched, new_elem])
def to_example(self, nrows=10):
    """Return a ``{column: list of values}`` dict for a small example of ``self``.

    When ``self`` has more than ``nrows`` rows, the example is the first and
    the last ``int(nrows / 2)`` rows; otherwise it is all of ``self``.
    """
    total = len(self)
    nrows_half = int(min(nrows, total) / 2)

    example = self
    if nrows < total:
        head_rows = self.head(nrows_half)
        tail_rows = self.tail(nrows_half)
        example = pr.concat([head_rows, tail_rows])

    return {column: list(getattr(example, column)) for column in example.columns}
def unstrand(self):
    """Return the ranges with the ``Strand`` column removed.

    Nothing is done for an object that is not stranded: it is returned as is.
    """
    if self.stranded:
        combined = pr.concat([self["+"], self["-"]])
        combined = combined.drop("Strand", drop_strand=True)
        return combined

    return self
def get_target_proximal_ranges(self):
    """Return the flanking regions on both sides of every target range.

    For each target range, one region of ``get_target_proximity() + 1``
    positions is placed directly before its Start and one directly after its
    End. The combined regions are clipped to the reference ranges.
    """
    downstream = self.get_target_ranges()
    upstream = self.get_target_ranges()
    flank = self.get_target_proximity() + 1

    # Region just before each target.
    # there may be edge exceptions where the Start coordinate < 0?
    upstream.End = upstream.Start - 1
    upstream.Start = upstream.Start - flank

    # Region just after each target.
    # there may be edge exceptions where End coordinate drops off chromo.
    downstream.Start = downstream.End + 1
    downstream.End = downstream.End + flank

    flanks = pr.concat([upstream, downstream])
    reference = self.ref.get_reference_ranges()
    return pr.gf.genome_bounds(flanks, reference, clip=True)
def test_merge_by(gr, strand):
    """Compare ``merge(by="ID")`` with merging every ID group on its own."""
    print(gr)

    result = gr.merge(by="ID").df.drop("ID", axis=1)

    per_id = [pr.PyRanges(gdf) for _, gdf in gr.df.groupby("ID")]
    expected = pr.concat([group.merge() for group in per_id]).df

    print(expected)
    print(result)

    assert_df_equal(result, expected)
def lojs_overlap(feature_files, compare_pr):
    """Flag which regions of ``compare_pr`` overlap a consensus of the feature files.

    Every feature file is read as BED and merged; the merged files are then
    concatenated and merged again with counts. Only intervals supported by
    at least ``n`` files are kept, where ``n`` is 1 for up to two files,
    2 for three to seven files and a quarter of the file count otherwise.

    Args:
        :param feature_files: list of paths to the BED files to intersect
            with the regions of interest
        :param compare_pr: pyranges object containing all regions of
            interest. Must have a column 'idx', which is used to order the
            result.

    :return arr: array with one entry per region of ``compare_pr`` (ordered
        by 'idx'); 1 where the region overlaps a consensus interval, else 0.
        All zeros when ``feature_files`` is empty.
    """
    n_files = len(feature_files)

    if n_files == 0:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("WARN: lojs_overlap failed for all files %s with 0 lines",
                       ','.join(feature_files))
        return np.zeros(len(compare_pr))

    #### Number of files that must share a consensus ####
    if n_files <= 2:
        n = 1  # if there are 1-2 files just include all
    elif n_files <= 7:
        n = 2
    else:
        n = n_files // 4  # in 25% of files

    # Very slow: concatenate all bed files and only take regions with n overlap
    group_pr = pr.concat([pr.read_bed(i).merge() for i in feature_files])
    counted = group_pr.merge(count=True).df

    # Filter, then drop the helper column without mutating a filtered view.
    consensus = counted[counted['Count'] >= n].drop('Count', axis=1)

    # Build the consensus ranges with the same integer width as compare_pr.
    type_ = (compare_pr.Start.dtype == 'int64')
    pr1 = pr.PyRanges(consensus, int64=type_)

    intersected = compare_pr.count_overlaps(pr1)
    arr = intersected.df.sort_values(by='idx')['NumberOverlaps'].values
    arr[arr > 0] = 1  # binarise: any overlap counts as a hit
    return arr
def set_union(self, other, **kwargs):
    """Concatenate ``self`` and ``other`` and merge the combined intervals.

    When the (filled-in) ``strandedness`` keyword is falsy, both inputs are
    unstranded first and the merge ignores strand; otherwise the merge is
    strand-aware. All filled-in keywords are forwarded to ``merge``.
    """
    kwargs = fill_kwargs(kwargs)
    strand = bool(kwargs["strandedness"])

    if strand:
        left, right = self, other
    else:
        left = self.unstrand()
        right = other.unstrand()

    combined = pr.concat([left, right], strand)
    return combined.merge(strand=strand, **kwargs)
def count_overlaps(grs, features=None, how=None, nb_cpu=1, strandedness=None):
    """Add one overlap-count column per entry of ``grs`` to ``features``.

    ``grs`` is a dict mapping a column name to a PyRanges. When ``features``
    is None, it is built by concatenating all values of ``grs`` and calling
    ``split()`` on the result. Missing counts are filled with 0. ``how`` is
    accepted but not used by this implementation.
    """
    if features is None:
        features = pr.concat(grs.values()).split()

    from pyranges.methods.intersection import _count_overlaps

    for name, gr in grs.items():
        counts = features.apply_pair(
            gr.drop(),
            _count_overlaps,
            as_pyranges=False,
            nb_cpu=nb_cpu,
            strandedness=strandedness)

        setattr(features, name, counts)
        setattr(features, name, getattr(features, name).fillna(0))

    return features
def parse_bed_files(bed_files):
    """Load the BED files into one unstranded PyRanges holding only ``Name``.

    Returns None when ``bed_files`` is empty. Asserts that every file has a
    ``Name`` column and that the names are unique across all files.
    """
    # Nothing to do without BED files
    if len(bed_files) == 0:
        return None

    # Read everything first, then validate
    beds = list(map(pr.read_bed, bed_files))
    for bed_file, bed in zip(bed_files, beds):
        assert "Name" in bed.columns, f"Name (column 4) missing from {bed_file}."

    # Combine, discard strand information and keep only the Name column
    combined = pr.concat(beds).unstrand()
    names_only = combined[["Name"]]

    # Names must not repeat
    assert names_only.Name.is_unique, "Names (column 4) not unique across BED files."

    return names_only
def mcc(grs, genome, labels=None, strand=False, verbose=False):
    """Compute confusion counts and MCC for every pair of interval sets.

    Parameters
    ----------
    grs : list of PyRanges
        The interval sets to compare. Each one is merged before use.
    genome : int or PyRanges
        Either the genome length, or an object whose ``End`` column is
        summed to obtain it.
    labels : list, default None
        One label per entry of ``grs``; defaults to ``0 .. len(grs) - 1``.
    strand : bool, default False
        Whether to compute the statistics per strand ("+" and "-").
    verbose : bool, default False
        Print progress (and chromosomes that are only in the genome) to
        stderr.

    Returns
    -------
    pandas.DataFrame
        Columns T, F, (Strand,) TP, FP, TN, FN, MCC, sorted by T and F.
        A set compared with itself gets MCC 1; every other pair is reported
        in both directions. Empty (columns only) when ``grs`` is empty.
    """
    import sys
    from itertools import chain, combinations_with_replacement

    try:
        genome_length = int(genome)
    except (TypeError, ValueError):
        genome_length = int(genome.End.sum())

    if labels is None:
        _labels = combinations_with_replacement(range(len(grs)), r=2)
    else:
        assert len(labels) == len(grs)
        _labels = combinations_with_replacement(labels, r=2)

    # The chromosome sanity check only makes sense when the genome is given
    # as ranges; an integer genome length has no chromosomes to compare.
    if verbose and hasattr(genome, "chromosomes"):
        # check that genome definition does not have many more
        # chromosomes than datafiles
        gr_cs = set(chain(*[gr.chromosomes for gr in grs]))
        g_cs = set(genome.chromosomes)
        surplus = g_cs - gr_cs
        if len(surplus):
            print(
                "The following chromosomes are in the genome, but not the PyRanges:",
                ", ".join(surplus),
                file=sys.stderr)

    # remove all non-loc columns before computation
    grs = [gr.merge(strand=strand) for gr in grs]

    # NOTE: every count below is taken against ``genome_length``; the genome
    # object itself is not needed any more (and may be a plain integer).
    strandedness = "same" if strand else None

    def _row(t_label, f_label, tp, fp, tn, fn, score, strand_label=None):
        # Build one output row; "Strand" is only present in stranded mode.
        row = {"T": t_label, "F": f_label}
        if strand_label is not None:
            row["Strand"] = strand_label
        row.update({"TP": tp, "FP": fp, "TN": tn, "FN": fn, "MCC": score})
        return row

    rowdicts = []
    pairs = zip(_labels, combinations_with_replacement(grs, r=2))
    for (lt, lf), (t, f) in pairs:
        if verbose:
            print(lt, lf, file=sys.stderr)

        if lt == lf:
            # A set compared with itself: everything it covers is a true
            # positive and the score is perfect by definition.
            if not strand:
                tp = t.length
                rowdicts.append(_row(lt, lf, tp, 0, genome_length - tp, 0, 1))
            else:
                # ``s`` (not ``strand``) so the parameter is never clobbered.
                for s in ("+", "-"):
                    tp = t[s].length
                    rowdicts.append(
                        _row(lt, lf, tp, 0, genome_length - tp, 0, 1, s))
            continue

        c = pr.concat([t, f]).merge(strand=strand)
        j = t.join(f, strandedness=strandedness)
        tp_gr = j.new_position("intersection").merge(strand=strand)

        if strand:
            for s in ("+", "-"):
                tp = tp_gr[s].length
                fp = f[s].length - tp
                fn = t[s].length - tp
                tn = genome_length - c[s].length
                score = _mcc(tp, fp, tn, fn)
                rowdicts.append(_row(lt, lf, tp, fp, tn, fn, score, s))
                # Mirrored pair: false positives and negatives swap roles.
                rowdicts.append(_row(lf, lt, tp, fn, tn, fp, score, s))
        else:
            tp = tp_gr.length
            fp = f.length - tp
            fn = t.length - tp
            tn = genome_length - c.length
            score = _mcc(tp, fp, tn, fn)
            rowdicts.append(_row(lt, lf, tp, fp, tn, fn, score))
            rowdicts.append(_row(lf, lt, tp, fn, tn, fp, score))

    if not rowdicts:
        # Nothing to compare: return the expected columns instead of failing
        # on a sort over columns that do not exist.
        columns = ["T", "F"] + (["Strand"] if strand else [])
        columns += ["TP", "FP", "TN", "FN", "MCC"]
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(rowdicts).sort_values(["T", "F"])
# Per-panel covered sizes: whole panel, CDS part and exon part.
# NOTE(review): every interval length is incremented by one before summing --
# presumably because the End coordinates are inclusive; confirm.
total_sizes = []
cds_sizes = []
exon_sizes = []
panel_prs = []

for panel in panels:
    print(panel)
    panel_pr = pr.PyRanges(
        genie.loc[
            (genie['SEQ_ASSAY_ID'] == panel) & genie['Chromosome'].isin(chromosomes),
            'Chromosome':'End_Position',
        ].rename(columns={'Start_Position': 'Start', 'End_Position': 'End'})
    ).merge()

    total_sizes.append(sum(i + 1 for i in panel_pr.lengths()))
    cds_sizes.append(sum(i + 1 for i in panel_pr.intersect(gff_cds_pr).lengths()))
    exon_sizes.append(sum(i + 1 for i in panel_pr.intersect(gff_exon_pr).lengths()))
    panel_prs.append(panel_pr)

# One overlap-count column per region set, computed for every MAF interval.
grs = dict(zip(['CDS', 'exon'] + list(panels),
               [gff_cds_pr, gff_exon_pr] + panel_prs))
result = pr.count_overlaps(grs, pr.concat([maf_pr]))
result = result.df

# Attach the count columns (everything after the first three columns).
tcga_maf = pd.merge(tcga_maf, result.iloc[:, 3:], how='left', on='index')

panel_df['total'] = total_sizes
panel_df['cds'] = cds_sizes
panel_df['exon'] = exon_sizes

## get assumed size of the most common kit: https://bitbucket.org/cghub/cghub-capture-kit-info/src/master/BI/vendor/Agilent/whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed
agilent_df = pd.read_csv(
    file_path / 'whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed',
    sep='\t',
    low_memory=False,
    header=None)
kit_pr = pr.PyRanges(
    agilent_df.rename(columns={0: 'Chromosome', 1: 'Start', 2: 'End'})
).merge()

kit_total = sum(i + 1 for i in kit_pr.lengths())
kit_cds = sum(i + 1 for i in kit_pr.intersect(gff_cds_pr).merge().lengths())
kit_exon = sum(i + 1 for i in kit_pr.intersect(gff_exon_pr).merge().lengths())
def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
    """Count overlaps in multiple pyranges.

    Parameters
    ----------
    grs : dict of PyRanges

        The PyRanges to use as queries.

    features : PyRanges, default None

        The PyRanges to use as subject in the query. If None, the PyRanges themselves are used as a query.
        When given, the count columns are set on this object, which is then used to build the result.

    strandedness : {None, "same", "opposite", False}, default None, i.e. auto

        Whether to compare PyRanges on the same strand, the opposite or ignore strand
        information. The default, None, means use "same" if both PyRanges are stranded,
        otherwise ignore the strand information.

    how : {None, "all", "containment"}, default None, i.e. all

        What intervals to report. By default reports all overlapping intervals. "containment"
        reports intervals where the overlapping is contained within it.

    nb_cpu : int, default 1

        How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
        Will only lead to speedups on large datasets.

    Examples
    --------

    >>> a = '''Chromosome Start End
    ... chr1 6 12
    ... chr1 10 20
    ... chr1 22 27
    ... chr1 24 30'''
    >>> b = '''Chromosome Start End
    ... chr1 12 32
    ... chr1 14 30'''
    >>> c = '''Chromosome Start End
    ... chr1 8 15
    ... chr1 10 14
    ... chr1 32 34'''

    >>> grs = {n: pr.from_string(s) for n, s in zip(["a", "b", "c"], [a, b, c])}
    >>> for k, v in grs.items():
    ...     print("Name: " + k)
    ...     print(v)
    Name: a
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         6 |        12 |
    | chr1         |        10 |        20 |
    | chr1         |        22 |        27 |
    | chr1         |        24 |        30 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    Name: b
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |        12 |        32 |
    | chr1         |        14 |        30 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 2 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    Name: c
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         8 |        15 |
    | chr1         |        10 |        14 |
    | chr1         |        32 |        34 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 3 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.count_overlaps(grs)
    +--------------+-----------+-----------+-----------+-----------+-----------+
    | Chromosome   | Start     | End       | a         | b         | c         |
    | (object)     | (int32)   | (int32)   | (int32)   | (int32)   | (int32)   |
    |--------------+-----------+-----------+-----------+-----------+-----------|
    | chr1         | 6         | 8         | 1         | 0         | 0         |
    | chr1         | 8         | 10        | 1         | 0         | 1         |
    | chr1         | 10        | 12        | 2         | 0         | 2         |
    | chr1         | 12        | 14        | 1         | 1         | 2         |
    | ...          | ...       | ...       | ...       | ...       | ...       |
    | chr1         | 24        | 27        | 2         | 2         | 0         |
    | chr1         | 27        | 30        | 1         | 2         | 0         |
    | chr1         | 30        | 32        | 0         | 1         | 0         |
    | chr1         | 32        | 34        | 0         | 0         | 1         |
    +--------------+-----------+-----------+-----------+-----------+-----------+
    Unstranded PyRanges object has 12 rows and 6 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> gr = pr.PyRanges(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40])
    >>> gr
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         0 |        10 |
    | chr1         |        10 |        20 |
    | chr1         |        20 |        30 |
    | chr1         |        30 |        40 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.count_overlaps(grs, gr)
    +--------------+-----------+-----------+-----------+-----------+-----------+
    | Chromosome   |     Start |       End |         a |         b |         c |
    | (category)   |   (int32) |   (int32) |   (int32) |   (int32) |   (int32) |
    |--------------+-----------+-----------+-----------+-----------+-----------|
    | chr1         |         0 |        10 |         1 |         0 |         1 |
    | chr1         |        10 |        20 |         2 |         2 |         2 |
    | chr1         |        20 |        30 |         2 |         2 |         0 |
    | chr1         |        30 |        40 |         0 |         1 |         1 |
    +--------------+-----------+-----------+-----------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 6 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    """

    # Options forwarded to every pairwise count (each key listed once).
    kwargs = {
        "as_pyranges": False,
        "nb_cpu": nb_cpu,
        "strandedness": strandedness,
        "how": how,
    }

    names = list(grs.keys())

    if features is None:
        features = pr.concat(grs.values()).split(between=True)

    from pyranges.methods.intersection import _count_overlaps

    for name, gr in grs.items():
        gr = gr.drop()

        res = features.apply_pair(gr, _count_overlaps, **kwargs)

        setattr(features, name, res)
        # Intervals without a count get 0 so the column can be made integer.
        setattr(features, name, getattr(features, name).fillna(0))

    def to_int(df):
        # Replace the count columns with int32 versions. Assigning through
        # ``df.loc[:, names]`` writes into the existing columns on recent
        # pandas and would leave their dtype untouched.
        return df.astype({name: np.int32 for name in names})

    features = features.apply(to_int)

    return features
def k_nearest(self, other, k=1, **kwargs):
    """Find the ``k`` nearest intervals in ``other`` for every interval in ``self``.

    Parameters
    ----------
    other : PyRanges
        The ranges to search in.
    k : int or array-like, default 1
        How many neighbours to report. An object with a ``.values``
        attribute (e.g. a Series) is unwrapped to that array, giving one
        ``k`` per interval of ``self``.
    **kwargs
        Passed through ``fill_kwargs``. The keys read here are ``overlap``
        (default True: overlapping intervals are reported too, with
        Distance 0), ``ties`` (False, "first", "last" or "different") and
        ``suffix`` (suffix of the columns coming from ``other``).

    Returns
    -------
    PyRanges
        The nearest intervals with a ``Distance`` column. On the "+" strand
        (or without strand) the distance is negated where the interval from
        ``other`` ends before the interval of ``self`` starts; on other
        strands it is negated in the opposite case. Empty when nothing was
        found.
    """
    from pyranges.methods.k_nearest import _nearest
    from sorted_nearest import get_all_ties, get_different_ties

    kwargs = fill_kwargs(kwargs)
    kwargs["stranded"] = self.stranded and other.stranded

    overlap = kwargs.get("overlap", True)
    ties = kwargs.get("ties", False)

    # Work on copies of the frames: helper columns are added below.
    self = pr.PyRanges({key: df.copy() for key, df in self.dfs.items()})

    try:
        # if k is an array-like with a .values attribute, unwrap it
        k = k.values
    except AttributeError:
        pass

    # Helper columns: the requested k and the original row order.
    self.__k__ = k
    self.__IX__ = np.arange(len(self))

    dfs = pyrange_apply(_nearest, self, other, **kwargs)
    nearest = PyRanges(dfs)

    if not overlap:
        result = nearest
    else:
        overlap_kwargs = dict(kwargs)
        # Only "first"/"last" are forwarded as the join's ``how``; any other
        # ties setting means how=None.
        overlap_kwargs["how"] = {"first": "first", "last": "last"}.get(
            kwargs.get("ties"))

        overlaps = self.join(other, **overlap_kwargs)
        overlaps.Distance = 0

        result = pr.concat([overlaps, nearest])

    if not len(result):
        return pr.PyRanges()

    new_result = {}
    if ties in ["first", "last"]:
        for c, df in result:
            df = df.sort_values(["__IX__", "Distance"])
            kept = []
            # Rows sharing the same k are handled together; within them the
            # closest k hits per original row are kept.
            for group_k, kdf in df.groupby("__k__", sort=False):
                kept.append(kdf.groupby("__IX__", sort=False).head(group_k))

            if kept:
                new_result[c] = pd.concat(kept)

    elif ties == "different" or not ties:
        # ties == "different" uses get_different_ties, no ties setting uses
        # get_all_ties; both return the index labels of the rows to keep.
        pick_rows = get_different_ties if ties else get_all_ties

        for c, df in result:
            if df.empty:
                continue

            kept = []
            df = df.sort_values(["__IX__", "Distance"])

            for group_k, kdf in df.groupby("__k__", sort=False):
                lx = pick_rows(kdf.index.values,
                               kdf["__IX__"].values,
                               kdf.Distance.astype(np.int64).values,
                               group_k)
                kept.append(kdf.reindex(lx))

            if kept:
                new_result[c] = pd.concat(kept)

    result = pr.PyRanges(new_result)

    # Restore the original row order if the grouping above disturbed it.
    # (Series.is_monotonic no longer exists in current pandas.)
    if not result.__IX__.is_monotonic_increasing:
        result = result.sort("__IX__")

    result = result.drop(like="__IX__|__k__")

    self = self.drop(like="__k__|__IX__")

    def prev_to_neg(df, kwargs):
        # Give the distance a sign that depends on which side the neighbour
        # lies on, flipped for strands other than "+".
        strand = df.Strand.iloc[0] if "Strand" in df else "+"

        suffix = kwargs["suffix"]

        bools = df["End" + suffix] < df.Start
        if not strand == "+":
            bools = ~bools

        df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
        return df

    result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

    return result