def expected_result_single_chromosome(chipseq_dataset):
    return pr.PyRanges(chipseq_dataset["chr2"].df.drop_duplicates())
def get_reference_ranges(self):
    chromosomes = self.get_chromosomes()
    starts = np.repeat(0, len(chromosomes))
    ends = self.get_chromosome_lengths(chromosomes)
    return pr.PyRanges(chromosomes=chromosomes, starts=starts, ends=ends)
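
# A minimal sketch of the constructor pattern used above, assuming only that
# pyranges and numpy are installed. The chromosome names and lengths here are
# illustrative; real code would read them from a .fai or chromsizes file.
import numpy as np
import pyranges as pr

sizes = {"chr1": 248956422, "chr2": 242193529}

chromosomes = list(sizes.keys())
starts = np.repeat(0, len(chromosomes))      # every reference range starts at 0
ends = [sizes[c] for c in chromosomes]       # and spans the whole chromosome

reference = pr.PyRanges(chromosomes=chromosomes, starts=starts, ends=ends)
print(reference)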
def pyrange_apply(function, self, other, **kwargs):

    nparams = get_n_args(function)
    nb_cpu = kwargs.get("nb_cpu", 1)

    if nb_cpu > 1:
        import ray
        with suppress_stdout_stderr():
            ray.init(num_cpus=nb_cpu, ignore_reinit_error=True)

    function, get, _merge_dfs = get_multithreaded_funcs(function, nb_cpu=nb_cpu)

    strandedness = kwargs["strandedness"]

    other_strand = {"+": "-", "-": "+"}
    same_strand = {"+": "+", "-": "-"}

    if strandedness == "opposite":
        strand_dict = other_strand
    else:
        strand_dict = same_strand

    assert strandedness in ["same", "opposite", False, None]

    if strandedness:
        assert self.stranded and other.stranded, \
            "Can only do stranded operations when both PyRanges contain strand info"

    results = []

    items = natsorted(self.dfs.items())
    keys = natsorted(self.dfs.keys())

    if strandedness:
        for (c, s), df in items:
            os = strand_dict[s]

            if (c, os) not in other.keys() or len(other[c, os].values()) == 0:
                odf = pd.DataFrame(columns="Chromosome Start End".split())
            else:
                odf = other[c, os].values()[0]

            df, odf = make_binary_sparse(kwargs, df, odf)
            result = call_f(function, nparams, df, odf, kwargs)
            results.append(result)
    else:
        if self.stranded and not other.stranded:
            for (c, s), df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)
        elif not self.stranded and other.stranded:
            for c, df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf1 = other[c, "+"].df
                    odf2 = other[c, "-"].df
                    odf = _merge_dfs.remote(odf1, odf2)

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)
        elif self.stranded and other.stranded:
            for (c, s), df in self.items():
                if c not in other.chromosomes:
                    odfs = pr.PyRanges(
                        pd.DataFrame(columns="Chromosome Start End".split()))
                else:
                    odfs = other[c].values()

                if len(odfs) == 2:
                    odf = _merge_dfs.remote(*odfs)
                elif len(odfs) == 1:
                    odf = odfs[0]
                else:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)
        else:
            for c, df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, nparams, df, odf, kwargs)
                results.append(result)

    results = get(results)
    results = process_results(results, keys)

    if nb_cpu > 1:
        ray.shutdown()

    return results
def introns(self, by="gene", nb_cpu=1): """Return the introns. Parameters ---------- by : str, {"gene", "transcript"}, default "gene" Whether to find introns per gene or transcript. nb_cpu: int, default 1 How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. Will only lead to speedups on large datasets. See Also -------- pyranges.genomicfeatures.GenomicFeaturesMethods.tss : return the transcription start sites Examples -------- >>> gr = pr.data.ensembl_gtf() >>> gr +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+ | Chromosome | Source | Feature | Start | End | Score | Strand | Frame | gene_biotype | +19 | | (category) | (object) | (category) | (int32) | (int32) | (object) | (category) | (object) | (object) | ... | |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------| | 1 | havana | gene | 11868 | 14409 | . | + | . | transcribed_unprocessed_pseudogene | ... | | 1 | havana | transcript | 11868 | 14409 | . | + | . | transcribed_unprocessed_pseudogene | ... | | 1 | havana | exon | 11868 | 12227 | . | + | . | transcribed_unprocessed_pseudogene | ... | | 1 | havana | exon | 12612 | 12721 | . | + | . | transcribed_unprocessed_pseudogene | ... | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | | 1 | havana | gene | 1173055 | 1179555 | . | - | . | lncRNA | ... | | 1 | havana | transcript | 1173055 | 1179555 | . | - | . | lncRNA | ... | | 1 | havana | exon | 1179364 | 1179555 | . | - | . | lncRNA | ... | | 1 | havana | exon | 1173055 | 1176396 | . | - | . | lncRNA | ... | +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+ Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. 19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.) >>> gr.features.introns(by="gene") +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+ | Chromosome | Source | Feature | Start | End | Score | Strand | Frame | +20 | | (object) | (object) | (object) | (int32) | (int32) | (object) | (category) | (object) | ... | |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------| | 1 | ensembl_havana | intron | 1173926 | 1174265 | . | + | . | ... | | 1 | ensembl_havana | intron | 1174321 | 1174423 | . | + | . | ... | | 1 | ensembl_havana | intron | 1174489 | 1174520 | . | + | . | ... | | 1 | ensembl_havana | intron | 1175034 | 1179188 | . | + | . | ... | | ... | ... | ... | ... | ... | ... | ... | ... | ... | | 1 | havana | intron | 874591 | 875046 | . | - | . | ... | | 1 | havana | intron | 875155 | 875525 | . | - | . | ... | | 1 | havana | intron | 875625 | 876526 | . | - | . | ... | | 1 | havana | intron | 876611 | 876754 | . | - | . | ... | +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+-------+ Stranded PyRanges object has 311 rows and 28 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. 
20 hidden columns: gene_biotype, gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, ... (+ 11 more.) >>> gr.features.introns(by="transcript") +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+ | Chromosome | Source | Feature | Start | End | Score | Strand | Frame | gene_biotype | +19 | | (object) | (object) | (object) | (int32) | (int32) | (object) | (category) | (object) | (object) | ... | |--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------| | 1 | havana | intron | 818202 | 818722 | . | + | . | lncRNA | ... | | 1 | ensembl_havana | intron | 960800 | 961292 | . | + | . | protein_coding | ... | | 1 | ensembl_havana | intron | 961552 | 961628 | . | + | . | protein_coding | ... | | 1 | ensembl_havana | intron | 961750 | 961825 | . | + | . | protein_coding | ... | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | | 1 | havana | intron | 732207 | 732980 | . | - | . | transcribed_processed_pseudogene | ... | | 1 | havana_tagene | intron | 168165 | 169048 | . | - | . | lncRNA | ... | | 1 | havana_tagene | intron | 165942 | 167958 | . | - | . | lncRNA | ... | | 1 | havana_tagene | intron | 168165 | 169048 | . | - | . | lncRNA | ... | +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+----------------------------------+-------+ Stranded PyRanges object has 1,043 rows and 28 columns from 1 chromosomes. For printing, the PyRanges was sorted on Chromosome and Strand. 19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.) """ kwargs = {"by": by, "nb_cpu": nb_cpu} kwargs = pr.pyranges.fill_kwargs(kwargs) assert by in ["gene", "transcript"] id_column = by_to_id[by] gr = self.pr.sort(id_column) if not len(gr): return pr.PyRanges() exons = gr.subset(lambda df: df.Feature == "exon") exons = exons.merge(by=id_column) by_gr = gr.subset(lambda df: df.Feature == by) result = pyrange_apply(_introns2, by_gr, exons, **kwargs) return pr.PyRanges(result)
chromosomes = {}
for i in list(range(1, 23)) + ['X', 'Y']:
    with open(file_path / 'chromosomes' / ('chr' + str(i) + '.txt')) as f:
        chromosomes[str(i)] = f.read()

## Use GFF3 to annotate variants
## ftp://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/
gff = pd.read_csv(file_path / 'Homo_sapiens.GRCh37.87.gff3',
                  sep='\t',
                  names=['chr', 'unknown', 'gene_part', 'start', 'end',
                         'unknown2', 'strand', 'unknown3', 'gene_info'],
                  usecols=['chr', 'gene_part', 'start', 'end', 'gene_info'],
                  low_memory=False)

gff_cds_pr = pr.PyRanges(
    gff.loc[(gff['gene_part'] == 'CDS') & gff['chr'].isin(chromosomes),
            ['chr', 'start', 'end', 'gene_info']]
    .astype({'start': int, 'end': int})
    .rename(columns={'chr': 'Chromosome', 'start': 'Start', 'end': 'End'})).merge()

gff_exon_pr = pr.PyRanges(
    gff.loc[(gff['gene_part'] == 'exon') & gff['chr'].isin(chromosomes),
            ['chr', 'start', 'end', 'gene_info']]
    .astype({'start': int, 'end': int})
    .rename(columns={'chr': 'Chromosome', 'start': 'Start', 'end': 'End'})).merge()
del gff

## make index column for merging
tcga_maf['index'] = tcga_maf.index.values
maf_pr = pr.PyRanges(
    tcga_maf.loc[:, ['Chromosome', 'Start_Position', 'End_Position', 'index']]
    .rename(columns={'Start_Position': 'Start', 'End_Position': 'End'}))

## use the GENIE 7.0 panels: https://www.synapse.org/#!Synapse:syn21551261
genie = pd.read_csv(file_path / 'genomic_information.txt', sep='\t',
                    low_memory=False)
panels = genie.SEQ_ASSAY_ID.unique()
panel_df = pd.DataFrame(data=panels, columns=['Panel'])
total_sizes = []
def range_overlap(user_bed_path, guide_loc_path, output_name, upstream,
                  downstream, sort_by, de_novo, cloning_strategy):
    # note: if de_novo = True, then sort_by is automatically set to "mismatch_score"
    import sys

    import pandas as pd
    import pyranges as pr
    from targetsite_to_primers import (revcomp, startG, cloning_parameters,
                                       targetsite_to_primers)

    #######################
    # read in the bed file from the user (currently a file of TSSs for a
    # specific gene list from Mina's function)
    #######################
    user_bed = pd.read_csv(user_bed_path, sep='\t', header=None)
    # important: this assumes that the bed file has no column names.
    # The first three columns must be called 'Chromosome', 'Start', 'End' for
    # a PyRanges object, so rename the columns here.
    column_names_user = user_bed.columns.values
    column_names_user_list = list(column_names_user)
    column_names_user_list_str = [str(i) for i in column_names_user_list]
    column_names_user_list_str[0:6] = [
        'Chromosome', 'Start', 'End', 'Gene', '5', 'Strand'
    ]
    user_bed.columns = column_names_user_list_str

    # Iterate over the rows and shift the start and end positions in the user
    # bed file according to the upstream and downstream arguments. Also keep
    # the original start column as a list, to determine the distance from the
    # TSS later. Near the beginning of a chromosome the shift can produce a
    # negative location, so clamp those values to zero.
    user_bed_start = []
    for index, row in user_bed.iterrows():
        if user_bed.at[index, 'Strand'] == '-':
            user_bed_start.append(user_bed.at[index, 'Start'])
            if user_bed.at[index, 'Start'] < downstream:
                user_bed.at[index, 'Start'] = 0
                user_bed.at[index, 'End'] += upstream
            else:
                user_bed.at[index, 'End'] += upstream
                user_bed.at[index, 'Start'] -= downstream
        else:
            user_bed_start.append(user_bed.at[index, 'End'])
            if user_bed.at[index, 'Start'] < upstream:
                user_bed.at[index, 'Start'] = 0
                user_bed.at[index, 'End'] += downstream
            else:
                user_bed.at[index, 'Start'] -= upstream
                user_bed.at[index, 'End'] += downstream

    user_bed = user_bed.assign(Original_start=user_bed_start)
    # convert the pandas df to a PyRanges object, required for the overlap function
    user_bed_pyR = pr.PyRanges(user_bed)
    # user_bed_pyR_merge = user_bed_pyR.merge()
    # If we wanted to collapse overlapping ranges we could merge here, but we
    # removed it because merging loses the gene column.

    ##############
    # read in the guides already determined for the human genome
    ##############
    guide_locs = pd.read_csv(guide_loc_path, sep='\t')
    # important: this assumes that the guide table has column names; change
    # this if that isn't true!
    column_names_gloc = guide_locs.columns.values
    column_names_gloc_list = list(column_names_gloc)
    column_names_gloc_list_str = [str(i) for i in column_names_gloc_list]
    # The scores are not in a usable form, e.g. score(perc), so split them
    # into separate numeric columns for sorting.
    column_names_gloc_list_str[0:3] = ['Chromosome', 'Start', 'End']
    guide_locs.columns = column_names_gloc_list_str
    guide_locs_noNaN = guide_locs.fillna(0)

    if de_novo == False:
        guide_locs_noNaN[['Doench2016_perc', 'Doench2016_score']] = \
            guide_locs_noNaN.fusi.str.split('%', expand=True)
        guide_locs_noNaN['Doench2016_score'] = \
            guide_locs_noNaN['Doench2016_score'].str.replace(r"[\(\)]", "")
        guide_locs_noNaN[['Moreno_Matos_perc', 'Moreno_Matos_score']] = \
            guide_locs_noNaN.crisprScan.str.split('%', expand=True)
        guide_locs_noNaN['Moreno_Matos_score'] = \
            guide_locs_noNaN['Moreno_Matos_score'].str.replace(r"[\(\)]", "")
        guide_locs_noNaN[['MIT_specificity']] = guide_locs_noNaN[['scoreDesc']]
        guide_locs_noNaN['MIT_specificity'] = \
            guide_locs_noNaN['MIT_specificity'].str.replace(r"[A-Za-z\s\.\-]+", "0")
        # to sort the scores, we need to make the columns numeric
        score_cols = ["Doench2016_perc", "Doench2016_score",
                      "Moreno_Matos_perc", "Moreno_Matos_score",
                      "MIT_specificity"]
        guide_locs_noNaN[score_cols] = guide_locs_noNaN[score_cols].apply(pd.to_numeric)
        # remove unnecessary columns
        guide_locs_noNaN_select = guide_locs_noNaN.iloc[:, [
            0, 1, 2, 3, 4, 5, 6, 7, 11, 20, 21, 22, 23, 24
        ]]
        # If the user's sort_by entry is invalid, fall back to the default
        # and print a message saying what was done.
        if sort_by not in score_cols:
            print("The sort_by argument must be 'Doench2016_perc', "
                  "'Doench2016_score', 'Moreno_Matos_perc', "
                  "'Moreno_Matos_score' or 'MIT_specificity'; the default "
                  "('Doench2016_perc') will be used instead")
            sort_by = "Doench2016_perc"
        # convert the guide DataFrame to a PyRanges table
        guide_locs_pyR = pr.PyRanges(guide_locs_noNaN_select)
    else:
        # if de_novo is true, just sort by the custom mismatch-score column
        sort_by = "mismatch_score"
        guide_locs_pyR = pr.PyRanges(guide_locs_noNaN)

    class NoOverlapError(Exception):
        pass

    if len(guide_locs_pyR.overlap(user_bed_pyR)) == 0:
        raise NoOverlapError(
            "There are no overlaps between the user supplied ranges and the gRNAs!"
        )

    # run the PyRanges overlap function
    guide_locs_pyR_overlap = guide_locs_pyR.overlap(user_bed_pyR)

    # convert back to pandas with this loop to manipulate the df more easily
    for k, guide_locs_pyR_overlap_df in guide_locs_pyR_overlap:
        guide_locs_pyR_overlap_df

    # Iterate over the rows of the overlap table and get the start value of
    # each row. Compare that value with the start and end of each row in the
    # original user bed to determine whether it falls in that row's range, so
    # we can pull the gene and the original TSS start site for that gene.
    # These values are collected into lists used later to make new columns.
    gene_list = []
    user_bed_start_ol = []
    for index_ol, row_ol in guide_locs_pyR_overlap_df.iterrows():
        for index_ub, row_ub in user_bed.iterrows():
            if row_ol[0] == row_ub[0]:
                if row_ol[1] in range(row_ub[1], (row_ub[2] + 1)) or \
                        row_ol[2] in range(row_ub[1], (row_ub[2] + 1)):
                    gene_list.append(row_ub[3])
                    user_bed_start_ol.append(row_ub[6])

    # add the gene list and original start sites just compiled, then sort by
    # gene followed by score
    guide_locs_pyR_overlap_df = guide_locs_pyR_overlap_df.assign(Gene=gene_list)
    guide_locs_pyR_overlap_df = guide_locs_pyR_overlap_df.assign(
        Original_Start=user_bed_start_ol)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df.sort_values(
        ["Gene", sort_by], ascending=[True, False])

    # This loop calculates the distance of the end of each guide from the TSS
    # of that row's gene and also compiles the lists of each primer.
    primer1_list = []
    primer2_list = []
    distance_from_tss = []
    for index, row in guide_locs_pyR_overlap_df_sort.iterrows():
        # distance from the end of each guide to the original TSS
        distance_temp = guide_locs_pyR_overlap_df_sort.at[index, 'End'] - \
            guide_locs_pyR_overlap_df_sort.at[index, 'Original_Start']
        distance_from_tss.append(distance_temp)
        # use Mina's functions to get lists of each primer
        gRNA = guide_locs_pyR_overlap_df_sort.at[index, 'guideSeq']
        (Grequired, p1specs, p2specs) = cloning_parameters(cloning_strategy)
        (primer1, primer2) = targetsite_to_primers(gRNA, cloning_strategy,
                                                   Grequired, p1specs, p2specs)
        primer1_list.append(primer1)
        primer2_list.append(primer2)

    # add the columns populated in the loop above to the df
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        GuideEnd_to_TSS=distance_from_tss)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.drop(
        columns=["Original_Start"])
    if de_novo == False:
        # clean up some unnecessary columns
        guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.drop(
            columns=["name", "score", "thickStart", "thickEnd"])
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        Primer1=primer1_list)
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.assign(
        Primer2=primer2_list)

    # rename the primer columns so they record the cloning strategy used
    col_name_p1 = 'Primer1 - ' + cloning_strategy
    col_name_p2 = 'Primer2 - ' + cloning_strategy
    guide_locs_pyR_overlap_df_sort = guide_locs_pyR_overlap_df_sort.rename(
        columns={"Primer1": col_name_p1, "Primer2": col_name_p2})

    guide_locs_pyR_overlap_df_sort.to_csv((output_name + '.txt'), sep='\t',
                                          header=True, index=False)
    return guide_locs_pyR_overlap_df_sort
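
# A hedged invocation sketch for range_overlap; every path and argument value
# below is hypothetical, including the cloning strategy name, which is simply
# forwarded to cloning_parameters().
hits = range_overlap(
    user_bed_path="tss_regions.bed",          # headerless BED: Chromosome, Start, End, Gene, score, Strand
    guide_loc_path="ucsc_crispr_guides.tsv",  # guide table with a header row
    output_name="guides_near_tss",            # results written to guides_near_tss.txt
    upstream=500,
    downstream=100,
    sort_by="Doench2016_perc",
    de_novo=False,
    cloning_strategy="BsmBI",
)
print(hits.head())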
def k_nearest(self, other, k=1, **kwargs):

    from pyranges.methods.k_nearest import _nearest
    from sorted_nearest import get_all_ties, get_different_ties

    kwargs = fill_kwargs(kwargs)
    kwargs["stranded"] = self.stranded and other.stranded

    overlap = kwargs.get("overlap", True)
    ties = kwargs.get("ties", False)

    self = pr.PyRanges({k: v.copy() for k, v in self.dfs.items()})

    try:  # if k is an array
        k = k.values
    except:
        pass

    self.__k__ = k
    self.__IX__ = np.arange(len(self))

    dfs = pyrange_apply(_nearest, self, other, **kwargs)
    nearest = PyRanges(dfs)

    if not overlap:
        result = nearest
    else:
        from collections import defaultdict
        overlap_kwargs = {k: v for k, v in kwargs.items()}
        overlap_kwargs["how"] = defaultdict(lambda: None, {
            "first": "first",
            "last": "last"
        })[kwargs.get("ties")]
        overlaps = self.join(other, **overlap_kwargs)
        overlaps.Distance = 0
        result = pr.concat([overlaps, nearest])

    if not len(result):
        return pr.PyRanges()

    new_result = {}
    if ties in ["first", "last"]:
        for c, df in result:
            df = df.sort_values(["__IX__", "Distance"])
            grpby = df.groupby("__k__", sort=False)
            dfs = []
            for k, kdf in grpby:
                grpby2 = kdf.groupby("__IX__", sort=False)
                _df = grpby2.head(k)
                dfs.append(_df)
            if dfs:
                new_result[c] = pd.concat(dfs)
    elif ties == "different" or not ties:
        for c, df in result:
            if df.empty:
                continue
            dfs = []
            df = df.sort_values(["__IX__", "Distance"])
            grpby = df.groupby("__k__", sort=False)
            # for each index, keep rows until we have k,
            # then keep all rows with the same distance
            for k, kdf in grpby:
                if ties:
                    lx = get_different_ties(
                        kdf.index.values, kdf.__IX__.values,
                        kdf.Distance.astype(np.int64).values, k)
                else:
                    lx = get_all_ties(
                        kdf.index.values, kdf.__IX__.values,
                        kdf.Distance.astype(np.int64).values, k)
                _df = kdf.reindex(lx)
                dfs.append(_df)
            if dfs:
                new_result[c] = pd.concat(dfs)

    result = pr.PyRanges(new_result)

    if not result.__IX__.is_monotonic:
        result = result.sort("__IX__")

    result = result.drop(like="__IX__|__k__")
    self = self.drop(like="__k__|__IX__")

    def prev_to_neg(df, kwargs):
        strand = df.Strand.iloc[0] if "Strand" in df else "+"
        suffix = kwargs["suffix"]
        bools = df["End" + suffix] < df.Start
        if not strand == "+":
            bools = ~bools
        df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
        return df

    result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

    return result
    pyranges_to_intervals, intervals_to_pyranges, BaseVariantMatcher, \
    SingleVariantMatcher, MultiVariantsMatcher

intervals = [
    Interval('chr1', 1, 10, strand='+'),
    Interval('chr1', 23, 30, strand='-')
]

variants = [
    Variant('chr1', 4, 'T', 'C'),
    Variant('chr1', 5, 'A', 'GA'),
    Variant('chr1', 25, 'AACG', 'GA')
]

pr = pyranges.PyRanges(chromosomes='chr1',
                       starts=[1, 23, 5],
                       ends=[10, 30, 50],
                       strands=['+', '-', '.'])


def test_variants_to_pyranges():
    vcf = MultiSampleVCF(vcf_file)
    variants = list(vcf)
    df = variants_to_pyranges(variants).df
    assert df.shape[0] == len(variants)

    v = df.iloc[0]
    assert v.Chromosome == 'chr1'
    assert v.Start == 3
    assert v.End == 4
    assert v.variant.ref == 'T'
    assert v.variant.alt == 'C'
def aggregate_genes(adata: AnnData,
                    genes: PyRanges,
                    agg_layers: Iterable = None,
                    agg_var: Iterable = None) -> AnnData:
    """ Aggregate copy number by gene to create a gene CN matrix.

    Currently only does segment-width weighted mean aggregation.

    Parameters
    ----------
    adata : AnnData
        copy number data
    genes : PyRanges
        gene data
    agg_layers : List, optional
        list of layers to aggregate, by default None, all layers
    agg_var : List, optional
        list of obs columns to aggregate, by default None, all columns

    Returns
    -------
    AnnData
        aggregated gene copy number
    """
    if agg_layers is None:
        agg_layers = adata.layers.keys()
    agg_layers = set(agg_layers)

    if agg_var is None:
        agg_var = set(
            adata.var.select_dtypes(include=np.number).columns.to_list()) - \
            set(['chr', 'start', 'end'])
    agg_var = set(agg_var)

    bins = pr.PyRanges(adata.var.reset_index().rename(columns={
        'chr': 'Chromosome',
        'start': 'Start',
        'end': 'End',
    })[['Chromosome', 'Start', 'End', 'bin']])

    intersect_1 = genes.intersect(bins)
    intersect_2 = bins.intersect(genes)

    intersect = pd.merge(intersect_1.as_df(), intersect_2.as_df())
    intersect['segment_width'] = intersect['End'] - intersect['Start']

    X = _segment_width_weighted_mean_matrix(adata.to_df(), intersect)

    layer_data = {}
    for layer_name in agg_layers:
        layer_data[layer_name] = _segment_width_weighted_mean_matrix(
            adata.to_df(layer=layer_name), intersect)

    var = _segment_width_weighted_mean_var(adata.var[agg_var], intersect)

    gene_data = genes.as_df().drop(
        ['Chromosome', 'Start', 'End'],
        axis=1).drop_duplicates().set_index('gene_id')
    var = var.merge(gene_data, left_index=True, right_index=True, how='left')

    adata = ad.AnnData(
        X,
        obs=adata.obs,
        var=var,
        layers=layer_data,
    )

    return adata
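
# A usage sketch for aggregate_genes, assuming an AnnData `adata` of binned
# copy-number calls (with chr/start/end/bin in .var) is already in scope; the
# gene table and the layer/var names ('copy', 'gc') are hypothetical.
import pandas as pd
import pyranges as pr

genes = pr.PyRanges(pd.DataFrame({
    'Chromosome': ['1', '1'],
    'Start': [1000, 50000],
    'End': [9000, 60000],
    'gene_id': ['ENSG0001', 'ENSG0002'],
}))

gene_cn = aggregate_genes(adata, genes, agg_layers=['copy'], agg_var=['gc'])
print(gene_cn.var.head())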
# Load genome
genome = load_genome(args.genome, upper=False)

print(f"Loading annotation from {args.gff}")

# Load annotation
annotation = pr.read_gff3(args.gff)
genes = annotation[annotation.Feature == 'gene'].merge(strand=False)
exons = annotation[annotation.Feature == 'exon'].merge(strand=False)

tmp = index.index.to_frame(index=False)
tmp['Start'] = tmp.pos - args.radius
tmp['End'] = tmp.pos + args.radius + 2
tmp['Chromosome'] = tmp.seqid
windows = pr.PyRanges(tmp)

coverage = windows.coverage(exons)
tmp = coverage.df.set_index(['seqid', 'pos']).reindex(index.index)
index['exon_overlap'] = tmp.FractionOverlaps.round(2)

coverage = windows.coverage(genes)
tmp = coverage.df.set_index(['seqid', 'pos']).reindex(index.index)
index['gene_overlap'] = tmp.FractionOverlaps.round(2)

# Filter exonic sites
if args.exon:
    index = index[index.inexon > 0]
    print(f"{len(index)} exonic sites.")

# Filter chromosomal sites
def plot_coverage_fractions(self, max_depth=None, bins=100, tile_size=10,
                            boundaries_of_interest=[10, 100], **kwargs):
    """
    Prepare an ROC-like plot showing the distribution of coverage across the genome.

    This method prepares a plot that aims to visualise the fraction of the
    genome that is covered at different levels of coverage. The genome is
    split into windows of size tile_size and coverage is averaged across
    these windows. The maximum depth is calculated and the coverages are
    binned into `bins` bins. The data is then prepared so that 100% of the
    genome is covered at 0 depth, going to 0% of the genome at max_depth+1
    depth.

    Parameters
    ----------
    max_depth: int
        The maximum depth to render within the plot.
        # TODO: this looks borked
    bins: int
        The number of bins to present in the plot. This is really an
        aesthetic thing only.
    tile_size: int
        The tile size to summarise depths at. The default is 10.
    boundaries_of_interest: list of ints
        This will direct the plotting of lines to show the positions where
        specific coverages are reached. The default is [10, 100].
    **kwargs:
        **kwargs can provide a number of possible options to the Flounder
        class in the background that may be used to alter the plot
        dimensions, bokeh tools and plot rendering options.

    Returns
    -------
    bokeh image plot
    """
    (plot_width, plot_height, plot_type, plot_tools) = self.handle_kwargs(
        ["plot_width", "plot_height", "plot_type", "plot_tools"], **kwargs)

    coverage = self.get_coverage(tile_size=tile_size)
    if self.bam_b is not None:
        coverage = pd.merge(
            self.get_coverage(tile_size=tile_size).df,
            self.get_coverage(tile_size=tile_size, plot_bam_b=True).df,
            on=["Chromosome", "Start", "End"])
        coverage['MeanCoverage'] = coverage[
            ["MeanCoverage_x", "MeanCoverage_y"]].sum(axis=1)
        # and convert back to pyranges ...
        coverage = pr.PyRanges(coverage)

    if max_depth is None:
        max_depth = coverage.MeanCoverage.max() + 1
        print("max_depth set to {}".format(max_depth))

    boundaries = np.linspace(0, max_depth, num=bins, endpoint=True,
                             retstep=False)
    assignments = np.digitize(coverage.MeanCoverage, boundaries)
    cov_data = pd.DataFrame({"assignment": assignments,
                             "bases": (coverage.End - coverage.Start)})
    cov_data = cov_data.groupby("assignment").agg({"assignment": "first",
                                                   "bases": np.sum})
    # add the missing values
    cov_data = cov_data.reindex(
        pd.Index(pd.Series(boundaries).index)
    ).reset_index().drop(["assignment"], axis=1).fillna(0)
    # add the cumulative sum to the data
    cov_data['frac'] = cov_data.bases.sum() - np.cumsum(cov_data.bases)
    # prepare the cumsum as a percentage
    cov_data['perc'] = cov_data.frac / cov_data.bases.sum() * 100

    plot = figure(
        title='Plot showing % of genome covered at different depths',
        x_axis_label='Depth of coverage (X)',
        y_axis_label='Percentage of genome covered (%)',
        background_fill_color="lightgrey",
        plot_width=plot_width, plot_height=plot_height, tools=plot_tools)

    for b in boundaries_of_interest:
        # b is a coverage value - get the corresponding percentage
        bases = coverage[coverage.MeanCoverage >= b].lengths().sum()
        perc = bases / cov_data.bases.sum() * 100
        legend = "{}X".format(b)
        plot.line([0, b, b], [perc, perc, 0], line_width=2, line_color='red')
        plot.add_layout(Label(x=b, y=perc, text=legend, text_color='red'))

    plot.step(boundaries, cov_data.perc, line_width=2, mode="before")

    return self.handle_output(plot, plot_type)
]).size().to_frame().reset_index().pivot(index='var_str',
                                         columns='SEQ_ASSAY_ID',
                                         values=0)

# get counts of samples per assay
panel_counts = sample.groupby('SEQ_ASSAY_ID').size()

# get unique mutations
var_uniq = maf.loc[~idx_germline, [
    'var_str', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position',
    'Reference_Allele', 'Tumor_Seq_Allele2'
]].drop_duplicates().set_index('var_str')

# make a PyRanges for unique variants; include the index (var_str) for
# setting values in the original df
var_pr = pr.PyRanges(var_uniq.reset_index()[[
    'Chromosome', 'Start_Position', 'End_Position', 'var_str'
]].rename(columns={'Start_Position': 'Start', 'End_Position': 'End'}))

# find overlaps of variants in the various bed files and add them to the
# unique variant table
for bed_name in beds.keys():
    var_uniq[bed_name] = False
    var_uniq.loc[var_pr.overlap(
        pr.PyRanges(chromosomes=beds[bed_name]['Chromosome'],
                    starts=beds[bed_name]['Start_Position'],
                    ends=beds[bed_name]['End_Position'])).var_str,
                 bed_name] = True

# set NaNs to true zeros based on the panel's coverage of each specific variant
sample_counts.values[(sample_counts.isna() & var_uniq.iloc[:, 6:]).values] = 0
def gr2():
    return pr.PyRanges(chromosomes="chr1",
                       starts=[11, 11, 20, 20, 50],
                       ends=[16, 20, 21, 22, 100])
def gr():
    return pr.PyRanges(chromosomes="chr1",
                       starts=[1, 15, 200],
                       ends=[10, 20, 2000])
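
# These two functions look like pytest fixtures; a small sketch of how the
# intervals they build interact, using the real pyranges join API (the call
# itself is illustrative, not taken from the test suite).
import pyranges as pr

a = pr.PyRanges(chromosomes="chr1", starts=[1, 15, 200], ends=[10, 20, 2000])
b = pr.PyRanges(chromosomes="chr1", starts=[11, 11, 20, 20, 50],
                ends=[16, 20, 21, 22, 100])

# [15, 20) in `a` overlaps [11, 16) and [11, 20) in `b`; the other intervals
# in `a` overlap nothing, so join() reports exactly two pairs.
print(a.join(b))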
def createBigWigs(coverageDir, designDF, normMitoCov):
    # generate bedgraphs and bigwigs
    outDirRaw = Path(coverageDir / "raw-coverages")
    outDirNorm = Path(coverageDir / "norm-coverages")
    if not outDirRaw.is_dir():
        outDirRaw.mkdir()
    if not outDirNorm.is_dir():
        outDirNorm.mkdir()
    # "Human-Numt-Coverage-Raw.tsv"
    rawDesignList = []
    normDesignList = []
    for name, group in normMitoCov.groupby("Sample"):
        designEntry = designDF.query('Sample == @name')
        individual = designEntry['Individual ID'].to_list()[0]
        sampleName = designEntry['Sample'].to_list()[0]
        condition = str(designEntry['Condition'].to_list()[0])
        outGroup = group.copy().reset_index(drop=True)
        outGroup['Start'] = outGroup['Offset']
        outGroup['End'] = outGroup['Start'] + 1
        outGroupNorm = outGroup.copy()
        if not ((group['Depth'] == 0).all() or
                (group['Forward_Depth'] == 0).all() or
                (group['Reverse_Depth'] == 0).all()):
            outNameRaw = name + "-raw"
            outNameNorm = name + "-norm"
            # only outputting bigwigs for now.
            outGroupRawAll = outGroup.loc[:, [
                "Chromosome", "Start", "End", "Depth", "Forward_Depth",
                "Reverse_Depth", "Forward_Starts", "Forward_Ends",
                "Reverse_Starts", "Reverse_Ends"
            ]]
            # outGroupRawAll['Percent_Diff_Strand'] = ((outGroupRawAll['Forward_Depth'] - outGroupRawAll['Reverse_Depth']) / outGroupRawAll['Reverse_Depth']) * 100
            outGroupRawAll['LogFC_Diff_Strand'] = np.log2(
                (outGroupRawAll['Forward_Depth'] + 1) /
                (outGroupRawAll['Reverse_Depth'] + 1))
            if (outGroupRawAll['LogFC_Diff_Strand'] == 0).all():
                continue
            # outGroup.to_csv(outDirRaw / (outNameRaw + ".bg"), sep="\t", index=False, header=False)
            outGroupNorm = outGroupNorm.loc[:, [
                "Chromosome", "Start", "End", "Norm Depth"
            ]]
            # outGroupNorm.to_csv(outDirNorm / (outNameNorm + ".bg"), sep="\t", index=False, header=False)
            tempPyRangesRaw = pr.PyRanges(outGroupRawAll)
            outRawString = str(outDirRaw / (outNameRaw + ".bw"))
            outRawStringFor = str(outDirRaw / (outNameRaw + ".F1R2.bw"))
            outRawStringRev = str(outDirRaw / (outNameRaw + ".F2R1.bw"))
            outRawStringForStart = str(outDirRaw / (outNameRaw + ".F1R2-Start.bw"))
            outRawStringRevStart = str(outDirRaw / (outNameRaw + ".F2R1-Start.bw"))
            outRawStringForEnd = str(outDirRaw / (outNameRaw + ".F1R2-End.bw"))
            outRawStringRevEnd = str(outDirRaw / (outNameRaw + ".F2R1-End.bw"))
            outRawStringDiff = str(outDirRaw / (outNameRaw + ".StrandDiff.bw"))

            tempPyRangesRaw.to_bigwig(path=outRawString, value_col="Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Both",
                'Condition-Strand': 'NA'
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringFor,
                                      value_col="Forward_Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F1R2.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                'Condition-Strand': condition + "-Forward"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringRev,
                                      value_col="Reverse_Depth")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".F2R1.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Reverse"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringForStart,
                                      value_col="Forward_Starts")
            rawDesignList.append({
                'TRACK_ID': outRawStringForStart,
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                'Condition-Strand': condition + "-For-Starts"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringRevStart,
                                      value_col="Reverse_Starts")
            rawDesignList.append({
                'TRACK_ID': outRawStringRevStart,
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Rev-Starts"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringForEnd,
                                      value_col="Forward_Ends")
            rawDesignList.append({
                'TRACK_ID': outRawStringForEnd,
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Forward",
                'Condition-Strand': condition + "-For-Ends"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringRevEnd,
                                      value_col="Reverse_Ends")
            rawDesignList.append({
                'TRACK_ID': outRawStringRevEnd,
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Reverse",
                'Condition-Strand': condition + "-Rev-Ends"
            })
            tempPyRangesRaw.to_bigwig(path=outRawStringDiff,
                                      value_col="LogFC_Diff_Strand")
            rawDesignList.append({
                'TRACK_ID': outNameRaw + ".StrandDiff.bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "LogFC",
                'Condition-Strand': "NA"
            })
            tempPyRangesNorm = pr.PyRanges(outGroupNorm)
            outNormString = str(outDirNorm / (outNameNorm + ".bw"))
            tempPyRangesNorm.to_bigwig(path=outNormString,
                                       value_col="Norm Depth")
            normDesignList.append({
                'TRACK_ID': outNameNorm + ".bw",
                'INDIVIDUAL_ID': individual,
                'SAMPLE_ID': sampleName,
                'Condition': condition,
                'Strand': "Both"
            })

    outRawDesignDF = pd.DataFrame(rawDesignList)
    outRawDesignDF['DATA_Type'] = "Expression"
    outRawDesignDF.loc[:, ["TRACK_ID", "DATA_Type", "INDIVIDUAL_ID",
                           "SAMPLE_ID", "Condition", "Strand",
                           "Condition-Strand"]] \
        .to_csv(outDirRaw / "raw-attributes.txt", sep="\t", index=None)
    outNormDesignDF = pd.DataFrame(normDesignList)
    outNormDesignDF['DATA_Type'] = "Expression"
    outNormDesignDF.loc[:, ["TRACK_ID", "DATA_Type", "INDIVIDUAL_ID",
                            "SAMPLE_ID", "Condition", "Strand"]] \
        .to_csv(outDirNorm / "norm-attributes.txt", sep="\t", index=None)
    return
def read_bigwig(f, as_df=False):
    """Read bigwig files.

    Parameters
    ----------
    f : str
        Path to bw file.

    as_df : bool, default False
        Whether to return as pandas DataFrame instead of PyRanges.

    Examples
    --------
    >>> f = pr.get_example_path("bw.bw")
    >>> gr = pr.read_bigwig(f)
    >>> gr
    """
    try:
        import pyBigWig
    except ModuleNotFoundError:
        print(
            "pyBigWig must be installed to read bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pyBigWig` to install it."
        )
        import sys
        sys.exit(1)

    from natsort import natsorted

    bw = pyBigWig.open(f)

    size = int(1e5)
    chromosomes = bw.chroms()

    dfs = {}

    for chromosome in natsorted(chromosomes):
        outstarts = []
        outends = []
        outvalues = []

        length = chromosomes[chromosome]

        starts = list(range(0, length, size))
        ends = list(range(size, length + size, size))
        ends[-1] = length
        for start, end in zip(starts, ends):
            intervals = bw.intervals(chromosome, start, end)
            if intervals is not None:
                for s, e, v in intervals:
                    outstarts.append(s)
                    outends.append(e)
                    outvalues.append(v)

        outstarts = pd.Series(outstarts)
        outends = pd.Series(outends)
        outvalues = pd.Series(outvalues)

        dfs[chromosome] = pd.DataFrame({
            "Chromosome": chromosome,
            "Start": outstarts,
            "End": outends,
            "Value": outvalues
        })

    return pr.PyRanges(dfs)
def pyrange_apply(function, self, other, **kwargs):

    strandedness = kwargs["strandedness"]

    other_strand = {"+": "-", "-": "+"}
    same_strand = {"+": "+", "-": "-"}

    if strandedness == "opposite":
        strand_dict = other_strand
    else:
        strand_dict = same_strand

    assert strandedness in ["same", "opposite", False, None]

    if strandedness:
        assert self.stranded and other.stranded, \
            "Can only do stranded operations when both PyRanges contain strand info"

    results = []

    items = natsorted(self.dfs.items())
    keys = natsorted(self.dfs.keys())

    if strandedness:
        for (c, s), df in items:
            os = strand_dict[s]

            if (c, os) not in other.keys() or len(other[c, os].values()) == 0:
                odf = pd.DataFrame(columns="Chromosome Start End".split())
            else:
                odf = other[c, os].values()[0]

            df, odf = make_binary_sparse(kwargs, df, odf)
            result = call_f(function, df, odf, kwargs)
            results.append(result)
    else:
        if self.stranded and not other.stranded:
            for (c, s), df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, df, odf, kwargs)
                results.append(result)
        elif not self.stranded and other.stranded:
            for c, df in items:
                if c not in other.chromosomes:
                    odf = pr.PyRanges(
                        pd.DataFrame(columns="Chromosome Start End".split()))
                else:
                    odf1 = other[c, "+"]
                    odf2 = other[c, "-"]
                    odf = merge_dfs.remote(odf1, odf2)

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, df, odf, kwargs)
                results.append(result)
        elif self.stranded and other.stranded:
            for (c, s), df in self.items():
                if c not in other.chromosomes:
                    odfs = pr.PyRanges(
                        pd.DataFrame(columns="Chromosome Start End".split()))
                else:
                    odfs = other[c].values()

                if len(odfs) == 2:
                    odf = merge_dfs.remote(*odfs, kwargs)
                elif len(odfs) == 1:
                    odf = odfs[0]
                else:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, df, odf, kwargs)
                results.append(result)
        else:
            for c, df in items:
                if c not in other.chromosomes:
                    odf = pd.DataFrame(columns="Chromosome Start End".split())
                else:
                    odf = other.dfs[c]

                df, odf = make_binary_sparse(kwargs, df, odf)
                result = call_f(function, df, odf, kwargs)
                results.append(result)

    results = ray.get(results)
    results = process_results(results, keys)
    return results
                              (all_data.percentage_rep2 <= max_canonical_percentage))
all_data["confident_mod"] = ((all_data.coverage_rep1 >= min_coverage) &
                             (all_data.coverage_rep2 >= min_coverage) &
                             (all_data.percentage_rep1 >= min_mod_percentage) &
                             (all_data.percentage_rep2 >= min_mod_percentage))
all_data["confident_cpg_mod"] = all_data["confident_mod"] & all_data["cpg"]
all_data['Start'] = all_data['start'] - delta
all_data['End'] = all_data['start'] + delta
all_data['find'] = "C"

##########################################################################
# confident mC mods
##########################################################################
# get interval data for non-mod positions
excluded_intervals = pr.PyRanges(
    all_data[~all_data.confident_unmod & ~all_data.confident_mod]).merge(strand=True)
included_intervals = pr.PyRanges(all_data[all_data["confident_mod"]])
all_mod_intervals = included_intervals.subtract(excluded_intervals,
                                                nb_cpu=nb_cpu).df
# isolated mods
all_mod_intervals = all_mod_intervals[
    (all_mod_intervals.End - all_mod_intervals.Start) == (delta * 2)]
all_mod_intervals["midpoint"] = all_mod_intervals.Start + delta
# get the base at each position (all should be C, but this is worth a double check)
all_mod_intervals['find'] = np.vectorize(get_base)(
    all_mod_intervals['Chromosome'], all_mod_intervals['midpoint'],
    all_mod_intervals['Strand'])
all_mod_intervals['replace'] = "M"
all_mod_intervals['ambig'] = "Y"
all_mod_intervals = all_mod_intervals[all_mod_intervals["find"] != "N"]
def main():
    gc_resolution = 0.2
    options = get_options()
    window_size = options.window_size
    bin_size = options.bin_size
    red_coef = options.smoothing_coefficient
    # window_size = window_size >> red_coef

    adata = sc.read(options.input_file[0])
    # we are going to perform lots of column slicing, CSC may be better
    data_mat = sp.csc_matrix(adata.X)
    for f in options.input_file[1:]:
        adata = sc.read(f)
        data_mat = data_mat + sp.csc_matrix(adata.X)

    if not options.keep_bg:
        data_mat = data_mat[:-1]

    gcContent = pr.read_bed(options.gcContent)
    gcContent.gcCount = gcContent.Name
    gcContent = gcContent.drop('Score').drop('Name')

    bin_df = pd.DataFrame(
        [x.replace(':', '\t').replace('-', '\t').split() for x in adata.var.index],
        columns=['Chromosome', 'Start', 'End'])
    bin_df.loc[:, 'data_idx'] = np.arange(len(bin_df))
    bin_df = pr.PyRanges(bin_df)

    chrom_list = gcContent.Chromosome.cat.categories

    raw_gc = []
    nbin = 0
    for _chr in chrom_list:
        chrom_size = gcContent[_chr].End.max()
        for r_start in np.arange(0, chrom_size, window_size):
            r_end = r_start + window_size
            intv_len = window_size
            if r_end > chrom_size:
                intv_len = chrom_size - r_start
            try:
                _gc = gcContent[_chr, r_start:r_end].gcCount.sum() / intv_len
            except IndexError:
                _gc = 0.0
            raw_gc.append([_chr, r_start, r_end, _gc, nbin])
            nbin += 1
    raw_gc = pr.PyRanges(pd.DataFrame(
        raw_gc, columns=['Chromosome', 'Start', 'End', 'gcContent', 'binidx']))

    coords = [x.replace(':', '\t').replace('-', '\t').split()
              for x in adata.var_names]
    coords = pd.DataFrame(coords, columns=['Chromosome', 'Start', 'End'])
    coords.loc[:, 'binidx'] = coords.index
    coords = pr.PyRanges(coords)

    raw_cna = np.zeros((len(raw_gc), data_mat.shape[0]))
    for _chr, df in raw_gc:
        idxs = coords[_chr].binidx.values
        sidxs = raw_gc[_chr].binidx.values
        _data = data_mat[:, idxs].toarray()
        l_bins = window_size // bin_size
        pad_size = l_bins - (_data.shape[1] % l_bins)
        if pad_size < l_bins:
            _data = np.concatenate(
                [_data, np.zeros((_data.shape[0], pad_size))], axis=1)
        n_bins = _data.shape[1] // l_bins
        _data = _data.reshape((_data.shape[0], n_bins, l_bins))
        raw_cna[sidxs] = _data.sum(axis=2).T

    coverage = adata.obs.values.ravel()
    raw_cna = np.array(raw_cna) + 0.5  # add pseudocounts
    if options.no_gc:
        M_raw = np.mean(raw_cna, axis=0)
    raw_gc.gc_bin = np.digitize(raw_gc.gcContent.values,
                                bins=np.arange(0, 1, gc_resolution))
    cna_ratio = np.zeros_like(raw_cna)
    for idx in np.arange(len(raw_cna)):
        if options.no_gc:
            cna_ratio[idx] = raw_cna[idx] / M_raw
        else:
            # normalise each window against up to 100 random windows in the
            # same GC-content bin
            idxs = np.setdiff1d(
                np.where(raw_gc.gc_bin.values == raw_gc.gc_bin.values[idx])[0],
                [idx])
            np.random.shuffle(idxs)
            idxs = idxs[:100]
            cna_ratio[idx] = raw_cna[idx] / np.mean(raw_cna[idxs], axis=0)

    cna_size = np.sum(
        [gcContent[_chr].End.max() // window_size + 1 for _chr in chrom_list])
    if options.smooth:
        cna_calls = np.zeros((cna_size, data_mat.shape[0]))
        for _chr in chrom_list:
            chrom_size = gcContent[_chr].End.max()
            idxs = raw_gc[_chr].binidx.values
            D = cna_ratio[idxs]
            _odd = False
            if D.shape[0] % 2 == 1:
                # pad one 0 so the wavelet transform sees an even length
                D = np.concatenate([D, np.zeros(D.shape[1])[None]], axis=0)
                _odd = True
            D[D > options.trim_max] = options.trim_max
            cW = pywt.wavedec(D, 'haar', axis=0, mode='constant')
            for cX in range(1, min(len(cW) - 1, red_coef + 1)):
                cW[-cX] = np.zeros_like(cW[-cX])
            R = pywt.waverec(cW, 'haar', axis=0, mode='constant')
            if _odd:
                R = R[:-1]
            cna_calls[idxs] = R
    else:
        cna_calls = cna_ratio

    idx = ["%s:%d-%d" % (x[0], x[1], x[2])
           for x in raw_gc.df.sort_values('binidx').values]
    cols = adata.obs.index
    if not options.keep_bg:
        cols = cols[:-1]
    pd.DataFrame(cna_calls, index=idx, columns=cols).to_pickle(
        "%s_raw_calls.pickle" % options.prefix)

    # f_corr = 2 / np.median(cna_calls)  # assuming diploids
    # cna_calls = np.round(f_corr * cna_calls)
    # cna_calls[cna_calls > options.trim_max] = options.trim_max
    cna_calls = np.digitize(cna_calls,
                            bins=[X / 2 for X in range(options.trim_max)])
    pd.DataFrame(cna_calls, index=idx, columns=cols).to_pickle(
        "%s_cna_calls.pickle" % options.prefix)
    except Exception as e:
        print("Error on line {}".format(lineno), file=sys.stderr)
        print(e, file=sys.stderr)
        sys.exit(1)

ERV_elem_gp = [erv.span() for erv in elements.values()]
erv_df = pd.DataFrame({
    'Chromosome': [e.chrom for e in ERV_elem_gp],
    'Start': [e.start for e in ERV_elem_gp],
    'End': [e.end for e in ERV_elem_gp],
    'Strand': [e.strand for e in ERV_elem_gp],
    'ID': [e.id for e in elements.values()],
    'Struct': [e.meta_str() for e in elements.values()]
})
ERVs = pr.PyRanges(erv_df)


def update_pr(changed_id, removed_id):
    global ERVs
    new_elem = elements[changed_id].span().pr()
    new_elem.ID = changed_id
    new_elem.Struct = elements[changed_id].meta_str()
    print("Merging {} into {}".format(removed_id, changed_id))
    ERVs = pr.concat([
        pr.PyRanges(ERVs.df.loc[~ERVs.df['ID'].isin([changed_id, removed_id])]),
        new_elem
    ])
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):
    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])

    contact_df = dd.read_parquet(contact_table,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=columns,
                                 index=False)
    if query:
        contact_df = contact_df.query(query)

    chrom_dict = pd.read_csv(chromsizes,
                             sep="\t",
                             header=None,
                             names=["chrom", "size"],
                             index_col=["chrom"],
                             squeeze=True)
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(columns={
        "start": "Start",
        "end": "End",
        "chrom": "Chromosome"
    }))

    fragment_df = dd.read_parquet(fragment_table,
                                  engine=PQ_ENGINE,
                                  version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[["chrom", "start", "end", "fragment_id"]]
        .assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(int))
        .eval("end = start + 1")
        .rename(columns={
            "chrom": "Chromosome",
            "start": "Start",
            "end": "End"
        }))
    # use a pyranges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(
        bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index(
        "fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}"
            .format(fragment_to_bin[nulls].join(fragment_df)))
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (contact_df
                       .merge(fragment_to_bin,
                              how="inner",
                              right_index=True,
                              left_on="align1_fragment_id")
                       .merge(fragment_to_bin,
                              how="inner",
                              right_index=True,
                              left_on="align2_fragment_id",
                              suffixes=[None, "_2"])
                       .rename(columns={
                           "bin_id": "bin1_id",
                           "bin_id_2": "bin2_id"
                       }))

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin pair
        pixels = binned_contacts.groupby([
            "bin1_id", "bin2_id"
        ]).size().rename("count").astype(np.int32).reset_index()
        create_cooler(cooler_path,
                      bins_df,
                      pixels,
                      ordered=True,
                      symmetric_upper=True,
                      ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; the order of the
            # haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]]
                .apply(lambda y: "{}_{}".format(*sorted(y)).replace("-1", "nohap"),
                       axis=1,
                       meta="object"))
            .groupby(["hap_key", "bin1_id", "bin2_id"])
            .size()
            .rename("count")
            .astype(np.int32)
            .reset_index()
            .astype({"hap_key": "category"}))

        # save to a temporary parquet file; this might not be necessary,
        # but we want to avoid the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )

        pixels = dd.read_parquet(tmp_parquet,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=["hap_key"],
                                 index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path,
                          bins_df,
                          pixels,
                          ordered=True,
                          symmetric_upper=True,
                          ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)
        shutil.rmtree(tmp_parquet)
    return results
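
# A hedged invocation sketch for export_to_cooler; the parquet paths, the
# chromsizes file and the query string are all hypothetical stand-ins for the
# outputs of an upstream contact-calling pipeline.
coolers = export_to_cooler(
    contact_table="contacts.parquet",
    output_prefix="sample1",
    cooler_resolution=10000,                  # 10 kb bins
    fragment_table="fragments.parquet",
    chromsizes="hg38.chrom.sizes",            # two-column chrom<TAB>size file
    query="align1_fragment_id != align2_fragment_id",
    by_haplotype=False,
)
print(coolers)  # e.g. ['sample1.cool']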
def test_score_matrix_combines_indices(self):
    # issue where value_counts() was not sorting on the index,
    # causing predictions to be combined incorrectly and returning preds > 1

    # Create dummy data: make 500 regions that do not overlap the Dataset
    start = np.repeat(np.arange(0, 100), 5)
    start = np.concatenate([start, [200, 1100, 1700]])
    end = np.repeat(np.arange(20, 120), 5)
    end = np.concatenate([end, [900, 1500, 2100]])

    regions_dict = {
        'Chromosome': ['chr1'] * len(start),
        'Start': start,
        'End': end,
        'idx': np.arange(0, start.shape[0])
    }
    # only indices 500-502 have data
    regions_pr = pr.from_dict(regions_dict)
    # have to cast to int64
    regions = pr.PyRanges(regions_pr.df, int64=True)

    targets = ['CTCF']
    ds = EpitomeDataset(targets=targets,
                        cells=['PC-9', 'Panc1', 'IMR-90', 'H1'],
                        min_cells_per_target=2)

    # set predictions to 1s so means could be greater than 1 if done wrong
    preds = np.ones((1, 10, 1))

    conversionObject = RegionConversion(ds.regions, regions)
    results = conversionObject.merge(preds, axis=1)
    masked = np.ma.array(results, mask=np.isnan(results))
    assert (np.all(masked <= 1))

    # Error case where there are nans before true values: the 1st region on
    # chr1 has no overlap with the dataset, while the second region on chr2
    # has multiple (2) overlaps
    start = [30000, 200]
    end = [30100, 900]
    regions_dict = {
        'Chromosome': ['chr1', 'chr2'],
        'Start': start,
        'End': end,
        'idx': [0, 1]
    }
    regions_pr = pr.from_dict(regions_dict)
    # have to cast to int64
    regions = pr.PyRanges(regions_pr.df, int64=True)

    conversionObject = RegionConversion(ds.regions, regions)
    preds = np.ones((1, 4, 1))
    results = conversionObject.merge(preds, axis=1)
    masked = np.ma.array(results, mask=np.isnan(results))
    assert (np.all(masked <= 1))
def tile_genome(genome, tile_size, tile_last=False):
    """Create a tiled genome.

    Parameters
    ----------
    genome : dict or PyRanges
        Dict or PyRanges describing the lengths of the chromosomes.

    tile_size : int
        Length of the tiles.

    tile_last : bool, default False
        Use genome length as end of last tile.

    See Also
    --------
    pyranges.PyRanges.tile : split intervals into adjacent non-overlapping tiles

    Examples
    --------
    >>> chromsizes = pr.data.chromsizes()
    >>> chromsizes
    +--------------+-----------+-----------+
    | Chromosome   | Start     | End       |
    | (category)   | (int32)   | (int32)   |
    |--------------+-----------+-----------|
    | chr1         | 0         | 249250621 |
    | chr2         | 0         | 243199373 |
    | chr3         | 0         | 198022430 |
    | chr4         | 0         | 191154276 |
    | ...          | ...       | ...       |
    | chr22        | 0         | 51304566  |
    | chrM         | 0         | 16571     |
    | chrX         | 0         | 155270560 |
    | chrY         | 0         | 59373566  |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 25 rows and 3 columns from 25 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.gf.tile_genome(chromsizes, int(1e6))
    +--------------+-----------+-----------+
    | Chromosome   | Start     | End       |
    | (category)   | (int32)   | (int32)   |
    |--------------+-----------+-----------|
    | chr1         | 0         | 1000000   |
    | chr1         | 1000000   | 2000000   |
    | chr1         | 2000000   | 3000000   |
    | chr1         | 3000000   | 4000000   |
    | ...          | ...       | ...       |
    | chrY         | 56000000  | 57000000  |
    | chrY         | 57000000  | 58000000  |
    | chrY         | 58000000  | 59000000  |
    | chrY         | 59000000  | 59373566  |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 3,114 rows and 3 columns from 25 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    """
    if isinstance(genome, dict):
        chromosomes, ends = list(genome.keys()), list(genome.values())
        df = pd.DataFrame({"Chromosome": chromosomes, "Start": 0, "End": ends})
        genome = pr.PyRanges(df)

    gr = genome.tile(tile_size)

    if not tile_last:
        gr = gr.apply(_last_tile, sizes=genome)

    return gr
def __getitem__(self, val):
    # note: np.long was removed from numpy; np.int64 is the equivalent dtype
    if isinstance(val, int):
        values = getlocs(self.runs, self.values,
                         np.array([val], dtype=np.int64))
        return values[0]
    elif isinstance(val, slice):
        end = val.stop or np.sum(self.runs)
        start = val.start or 0
        runs, values = getitem(self.runs, self.values, start, end)
        return Rle(runs, values)
    elif isinstance(val, pd.DataFrame):
        intype = val.dtypes["Start"]
        val = val["Start End".split()].astype(np.int64)
        ids, starts, ends, runs, values = getitems(self.runs, self.values,
                                                   val.Start.values,
                                                   val.End.values)
        df = pd.DataFrame({
            "Start": starts,
            "End": ends,
            "ID": ids,
            "Run": runs,
            "Value": values
        }).astype({
            "Start": intype,
            "End": intype
        })
        return df
    elif "PyRanges" in str(type(val)):
        # hack to avoid isinstance(val, pr.PyRanges) so that we do not need
        # a dependency on PyRanges in this library
        import pyranges as pr

        val = val.drop().df
        if val.empty:
            return pd.DataFrame(
                columns="Chromosome Start End ID Run Value".split())

        chromosome = val.Chromosome.iloc[0]
        intype = val.dtypes["Start"]

        if "Strand" in val:
            strand = val.Strand.iloc[0]
        else:
            strand = None

        val = val["Start End".split()].astype(np.int64)
        ids, starts, ends, runs, values = getitems(self.runs, self.values,
                                                   val.Start.values,
                                                   val.End.values)
        df = pd.DataFrame({
            "Chromosome": chromosome,
            "Start": starts,
            "End": ends,
            "ID": ids,
            "Run": runs,
            "Value": values
        }).astype({
            "Start": intype,
            "End": intype
        })

        if strand:
            df.insert(3, "Strand", strand)

        return pr.PyRanges(df)
    else:
        locs = np.sort(np.array(val, dtype=np.int64))
        values = getlocs(self.runs, self.values, locs)
        return values
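
# A small sketch of the indexing behaviour this method implements, assuming
# the pyrle package (where Rle lives) is installed.
from pyrle import Rle

# A run-length vector: value 1.0 for 5 positions, then 2.5 for 3 positions.
r = Rle([5, 3], [1.0, 2.5])

print(r[4])    # integer index -> the scalar value at that position (1.0)
print(r[4:7])  # slice -> a new, shorter Rle covering positions 4..6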
def augment_annotation(bam, ranges):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        def extract_annot(row):
            # TODO: There is something FUBAR in the start_data calculation
            bam_data = bam.get_sam_annotation(row.Chromosome, row.Start,
                                              row.End)

            if bam_data is None:
                return 0, 0, 0, 0, \
                    0, 0, 0, 0, 0, 0, 0, 0, \
                    0, 0

            start_data = bam_data.loc[(
                (bam_data.reference_start + bam_data.reference_length <= row.End) &
                (bam_data.strand == "+") |
                ((bam_data.strand == "-") &
                 (bam_data.reference_start >= row.Start)))]

            # rstart - the number of reads that start within the given interval
            rstart = len(start_data)
            # bases_start - the number of bases contained within rstart
            bases_start = start_data.reference_length.sum()
            # mean_read_len - mean read length for any reads within this interval
            mean_read_len = bam_data.reference_length.mean()
            # start_read_len - mean read length for reads that start within the interval
            start_read_len = start_data.reference_length.mean()
            # strand_p / strand_n - read counts per strand
            strand_p = (bam_data.strand == '+').sum()
            strand_n = (bam_data.strand == '-').sum()
            # mapq - mapq for reads starting in the segment
            mapq = (-10 * log10((10**(start_data.mapping_quality / -10)).mean()))
            # map0 - mapq for reads overlapping the segment
            map0 = (-10 * log10((10**(bam_data.mapping_quality / -10)).mean()))
            # readq - per-read q score for reads starting in the segment
            readq = (-10 * log10((10**(start_data.mapped_read_q / -10)).mean()))
            # read0 - per-read q score for reads overlapping the segment
            read0 = (-10 * log10((10**(bam_data.mapped_read_q / -10)).mean()))
            # nm - the #NM mismatch count for reads starting in the segment
            nm = start_data.nm.sum()
            # cigar operation counts for reads starting in the segment
            cigar_d = start_data.cigar_d.sum()
            cigar_i = start_data.cigar_i.sum()
            cigar_m = start_data.cigar_m.sum()

            # ... some local sequence context annotations (gccount, ncount)
            # could be added here

            return rstart, bases_start, mean_read_len, start_read_len, \
                strand_p, strand_n, mapq, map0, readq, read0, nm, cigar_m, \
                cigar_i, cigar_d

        tqdm.pandas()
        df_data = ranges.df
        df_data[[
            'rstart', 'bases_start', 'mean_read_len', 'start_read_len',
            'strand_p', 'strand_n', 'mapq', 'map0', 'readq', 'read0', 'nm',
            'cigar_m', 'cigar_i', 'cigar_d'
        ]] = df_data.progress_apply(extract_annot, axis=1, result_type='expand')
        return pr.PyRanges(df_data)
def __init__(self,
             vcf_filename=None,
             ref_build=None,
             patient_id=None,
             has_tabix=False,
             conv_region_filename=None,
             conv_region_dict=None,
             region_studied_filename=None,
             nocall_filename=None,
             ratio_ad_dp=0.99):
    super(Converter, self).__init__()
    if not vcf_filename:
        raise Exception('You must provide vcf_filename')
    if not ref_build or ref_build not in ["GRCh37", "GRCh38"]:
        raise Exception('You must provide build number ("GRCh37" or "GRCh38")')
    if nocall_filename and not region_studied_filename:
        raise Exception(
            "Please also provide region_studied_filename when nocall_filename is provided")
    self.vcf_filename = vcf_filename
    try:
        self._vcf_reader = vcf.Reader(filename=vcf_filename)
    except FileNotFoundError:
        raise
    except Exception:
        self._generate_exception("Please provide a valid 'vcf_filename'")
    if not patient_id:
        patient_id = self._vcf_reader.samples[0]
    if nocall_filename:
        try:
            self.nocall_region = pyranges.read_bed(nocall_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception("Please provide a valid 'nocall_filename'")
    else:
        self.nocall_region = pyranges.PyRanges()
    if conv_region_filename:
        try:
            self.conversion_region = pyranges.read_bed(conv_region_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception("Please provide a valid 'conv_region_filename'")
    elif conv_region_dict:
        try:
            self._fix_conv_region_zero_based(conv_region_dict)
            self.conversion_region = pyranges.from_dict(conv_region_dict)
        except Exception:
            self._generate_exception("Please provide a valid 'conv_region_dict'")
    else:
        self.conversion_region = None
    if region_studied_filename:
        try:
            self.region_studied = pyranges.read_bed(region_studied_filename)
        except FileNotFoundError:
            raise
        except Exception:
            self._generate_exception("Please provide a valid 'region_studied_filename'")
    else:
        self.region_studied = None
    if not _Utilities.validate_has_tabix(has_tabix):
        raise Exception("Please provide a valid 'has_tabix'")
    if not _Utilities.validate_ratio_ad_dp(ratio_ad_dp):
        raise Exception("Please provide a valid 'ratio_ad_dp'")
    self.ratio_ad_dp = ratio_ad_dp
    self.has_tabix = has_tabix
    self.patient_id = patient_id
    self.ref_build = ref_build
    self.nocall_filename = nocall_filename
    self.conv_region_filename = conv_region_filename
    general_logger.info("Converter class instantiated successfully")
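A hypothetical instantiation of the Converter above; the file paths are placeholders, and only vcf_filename plus a valid ref_build are strictly required.

# Hypothetical usage; paths and IDs are placeholders, not from the source.
converter = Converter(
    vcf_filename="sample.vcf",
    ref_build="GRCh38",
    patient_id="PATIENT_1",               # defaults to the first VCF sample
    conv_region_filename="regions.bed",   # optional BED restricting conversion
)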
def cluster2pyranges(clusters, cluster_size_min, cluster_size_max, normalise=True):
    '''Convert clusters to a PyRanges (BED-like) object.

    Columns: Chromosome, Start, End, Strand, Name, Score, exon, intron,
    repeat, read_type.

    Args:
        clusters: A collection of Cluster objects, each holding positions (reads).
        cluster_size_min: Smallest cluster size to keep.
        cluster_size_max: Largest cluster size to keep.
        normalise: If True, score each read as 2 / n, where n is the number
            of DPM reads in the cluster.
    '''
    chrom, start, end, strand = [], [], [], []
    scores, barcodes = [], []
    exon, intron, repeat, read_type = [], [], [], []
    unq_barcodes = set()
    for br, cluster in clusters.get_items():
        # Make sure barcodes are unique by appending a counter suffix.
        count = 0
        while br in unq_barcodes:
            br = br + '_' + str(count)
            count += 1
        unq_barcodes.add(br)
        cs = cluster.size()
        csd = cluster.size('DPM')
        score = 2.0 / len(csd) if normalise else 1
        if cluster_size_min <= cs <= cluster_size_max:
            for position in cluster:
                chromosome = (position._chromosome
                              if position._chromosome.startswith('chr')
                              else 'custom')
                if chromosome == 'custom':
                    assert position._type == 'RPM', 'DPM is not aligned to custom'
                chrom.append(chromosome)
                start.append(position._start_coordinate)
                end.append(position._end_coordinate)
                strand.append(position._strand)
                barcodes.append(br)
                scores.append(score)
                read_type.append(position._type)
                # Annotation features.
                features = position._feature.split(';')
                fs = classify_feature(features)
                exon.append(fs.get('exon', 'NA'))
                intron.append(fs.get('intron', 'NA'))
                repeat.append(fs.get('repeat', 'NA'))
    # Convert to PyRanges (use the per-read scores list, not the last scalar).
    r_df = pd.DataFrame({'Chromosome': chrom, 'Start': start, 'End': end,
                         'Strand': strand, 'Name': barcodes, 'Score': scores,
                         'exon': exon, 'intron': intron, 'repeat': repeat,
                         'read_type': read_type})
    return pr.PyRanges(r_df)
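To make the normalised scoring explicit: with normalise=True each read in a cluster with n DPM reads contributes 2/n, so the DPM reads of any one cluster sum to a constant weight of 2. A toy check (values illustrative):

# Illustrative check of the normalised scoring used above.
n_dpm = 4              # number of DPM reads in a cluster
score = 2.0 / n_dpm    # per-read score: 0.5
assert score * n_dpm == 2.0  # each cluster contributes the same total weight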
def find_genes(adata,
               gtf_file_name,
               path='',
               extension=5000,
               key_added='gene_name',
               feature_coordinates=None,
               copy=True):
    """
    Given a GTF file, match the features of the AnnData object (stored in
    adata.var_names or in a var annotation) to genes. The feature/variable
    annotation has to be written as chr1:20000-20500 or chr1_20000_20500.
    The corresponding gene (if any) will be stored in a var annotation.
    The search is extended to a window of +/- `extension` bases around each
    feature (5 kb by default).

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    gtf_file_name : str
        Path to the GTF annotation file.
    extension : int, default 5000
        Window (in bases) added on each side of a feature when matching genes.
    key_added : str, default 'gene_name'
        Name of the var column that will hold the matched annotations.
    feature_coordinates : str, optional
        var column holding the coordinates; defaults to adata.var_names.

    Returns
    -------
    if copy == True:
        adata : :class:`~anndata.AnnData`
            Annotated data matrix.
    """
    # Load the GTF file, skipping header/comment lines.
    gtf_file = []
    with open(gtf_file_name) as f:
        for line in f:
            if line[0] != '#':
                gtf_file.append(line)
    gtf_file = pd.DataFrame([l.split('\t') for l in gtf_file])
    gtf_file.columns = [
        'Chromosome', 'source', 'gene_type', 'Start', 'End', 'NA', 'Strand',
        'NA2', 'extra_info'
    ]
    del gtf_file['NA'], gtf_file['NA2']

    # Extract the variable names.
    feature_annot = adata.var
    if feature_coordinates is None:
        feature_names = adata.var_names.tolist()
    else:
        feature_names = adata.var[feature_coordinates]

    # Parse the feature names into chromosome/start/end.
    start_feature = []
    end_feature = []
    chrom_feature = []
    for feature in feature_names:
        if ':' in feature:
            # Coordinate written as chr1:20000-20500.
            feature2 = feature.split(':')
            w = [int(x) for x in feature2[1].split('-')]
        else:
            # Coordinate written as chr1_20000_20500.
            feature2 = feature.split('_')
            w = [int(x) for x in feature2[1:]]
        chrom_feature.append(feature2[0][3:])  # strip the "chr" prefix
        start_feature.append(w[0])
        end_feature.append(w[1])

    adata.var['Index'] = range(0, len(chrom_feature))
    adata.var['Chromosome'] = chrom_feature
    adata.var['Start'] = start_feature
    adata.var['End'] = end_feature
    adata.var['name_feature'] = adata.var_names.tolist()
    adata.var['start_ext'] = [x - extension for x in start_feature]
    adata.var['end_ext'] = [x + extension for x in end_feature]

    # Match the features against the GTF annotation.
    gtf = pr.PyRanges(gtf_file)
    del gtf_file
    adata_var = pr.PyRanges(chromosomes=adata.var.loc[:, 'Chromosome'],
                            starts=adata.var.loc[:, 'start_ext'],
                            ends=adata.var.loc[:, 'end_ext'])
    merge = gtf.join(adata_var, suffix="_ext")
    merge = merge.dfs
    overlap3 = pd.concat([merge[key] for key in merge.keys()])
    overlap3['Index'] = overlap3.index
    overlap4 = overlap3.sort_values(['Chromosome', 'Start_ext', 'End_ext', 'Index'])

    adata.var = adata.var.sort_values(['Chromosome', 'start_ext', 'end_ext'])
    adata_var = pr.PyRanges(adata.var)
    tot_gene_annot = []
    for chrom in list(set(adata.var['Chromosome'])):
        index_gtf = 0
        overlap3 = pd.concat([merge[key] for key in [(chrom, '+'), (chrom, '-')]])
        overlap3['Index'] = overlap3.index
        overlap3 = overlap3.sort_values(['Chromosome', 'Start_ext', 'End_ext', 'Index'])
        overlap_chrom = overlap3['Start_ext'].tolist()
        for line_adata in adata_var[chrom].df[['start_ext']].iterrows():
            gene_annot = []
            # Walk the sorted overlaps, collecting every row whose extended
            # start matches this feature's extended start.
            for start_gtf in overlap_chrom[index_gtf:]:
                if start_gtf == line_adata[1]['start_ext']:
                    gene_annot.append(overlap3.iloc[index_gtf])
                    index_gtf += 1
                    continue
                else:
                    break
            if gene_annot == []:
                tot_gene_annot.append('NA')
            else:
                tot_gene_annot.append(tuple(gene_annot))

    adata.var[key_added] = tot_gene_annot
    adata.var = adata.var.sort_values(['Index'])
    adata.var['gene_infos'] = adata.var['gene_name']

    # Extract gene names from the GTF extra_info field.
    all_gene_names = []
    for line in tot_gene_annot:
        if line == 'NA':
            all_gene_names.append(['NA'])
        else:
            curr_gene_name = []
            for element in line:
                info_gene = element['extra_info'][:-1].split(';')
                for n in info_gene:
                    if 'gene_name' in n:
                        n = n[:-1].split(' "')
                        curr_gene_name.append(n[-1])
            all_gene_names.append(list(set(curr_gene_name)))
    return (tot_gene_annot, overlap4)
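A hypothetical call to find_genes; assumes an AnnData whose var_names look like "chr1:20000-20500" and a GENCODE-style GTF on disk (both placeholders, not from the source).

# Hypothetical usage; the GTF path and adata object are placeholders.
annotations, overlaps = find_genes(adata,
                                   "gencode.v38.annotation.gtf",
                                   extension=5000,
                                   key_added="gene_name")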
# analyzed intervals
ANALYZED_INTERVALS = "./test_files/analyzed_intervals.interval_list"

# gCNV test callset resources
GCNV_CALLSET_TEST_VCF = "./test_files/GCNV_SAMPLE_1.vcf"
GCNV_CALLSET_TEST_VALUES = [('1', 1001, 3000, EventType.DEL, 2, 60),
                            ('1', 3001, 10000, EventType.REF, 4, 100),
                            ('2', 4001, 5000, EventType.DUP, 1, 100),
                            ('2', 6001, 7000, EventType.REF, 1, 20)]
GCNV_CALLSET_SAMPLE_NAME = "SAMPLE_1"
GCNV_CALLSET_TEST_DF = pd.DataFrame(GCNV_CALLSET_TEST_VALUES,
                                    columns=Callset.CALLSET_COLUMNS)
GCNV_CALLSET_TEST_DF = GCNV_CALLSET_TEST_DF.astype(Callset.CALLSET_COLUMN_TYPES)
GCNV_CALLSET_TEST_PYRANGE_EXPECTED = pr.PyRanges(GCNV_CALLSET_TEST_DF)

# Truth test callset resources
TRUTH_CALLSET_TEST_BED = "./test_files/truth.bed"
TRUTH_CALLSET_VALUES = [('1', 501, 4500, 'DEL_chr1_1', EventType.DEL,
                         frozenset(['SAMPLE_0', 'SAMPLE_1', 'SAMPLE_2']), 1.0, 3),
                        ('1', 7001, 10000, 'DEL_chr1_2', EventType.DEL,
                         frozenset(['SAMPLE_0']), 1. / 3, 2),
                        ('2', 1001, 3000, 'DUP_chr2_1', EventType.DUP,
                         frozenset(['SAMPLE_0']), 1. / 3, 0),
                        ('2', 4001, 7000, 'DUP_chr2_2', EventType.DUP,
                         frozenset(['SAMPLE_0', 'SAMPLE_1']), 2. / 3, 2),
                        ('2', 11001, 12000, 'DUP_chr_2_3', EventType.DUP,
                         frozenset(['SAMPLE_0', 'SAMPLE_2']), 2. / 3, 1),
                        ('2', 13001, 16000, 'DEL_chr_2_4', EventType.DEL,
def simple_m1():
    m1 = pr.PyRanges(chromosomes=[1, 1, 1, 1],
                     starts=[0, 20, 30, 50],
                     ends=[5, 25, 35, 55])
    m1.calls = [1, 2, 3, 1]
    m1.methylated = [0, 2, 0, 1]
    return m1
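One way a fixture like simple_m1 might be consumed in a test; the methylation-rate assertion is illustrative, not from the source.

# Illustrative test built on the fixture above.
import numpy as np

def test_simple_m1_rates():
    m1 = simple_m1()
    rates = np.array(m1.methylated) / np.array(m1.calls)
    assert rates.tolist() == [0.0, 1.0, 0.0, 1.0]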