def checkParentsOverlapTransloInv(filtered_sample_frame, sample_start,
                                  parent_start, sample_end, parent_end, args,
                                  inheritance):
    """Annotate calls with whether both breakends overlap a parent's calls.

    The result is written to the column named by ``inheritance``:

    * ``'None'`` when ``args.type`` is ``'singleton'``, or when it is
      ``'duo'`` and the flag for the *other* parent (``args.mother_duo`` for
      ``'Found_in_Father'``, ``args.father_duo`` for ``'Found_in_Mother'``)
      is truthy.  In that case the input frame is modified in place.
    * ``"False"`` for every row when no entry overlaps the parent at both
      the start and the end breakend (also modifies the input in place).
    * ``'True'`` / ``'False'`` (strings) per row otherwise, in a new,
      de-duplicated frame with a fresh index.

    Returns the annotated frame.
    """
    is_duo = args.type == 'duo'
    parent_unavailable = (
        args.type == 'singleton'
        or (is_duo and inheritance == 'Found_in_Father' and args.mother_duo)
        or (is_duo and inheritance == 'Found_in_Mother' and args.father_duo)
    )
    if parent_unavailable:
        filtered_sample_frame[inheritance] = 'None'
        return filtered_sample_frame

    # Breakend-based comparison: start and end are overlapped separately and
    # an entry counts only when its SmapEntryID shows up in both results.
    start_hits = PyRanges(sample_start).overlap(PyRanges(parent_start))
    end_hits = PyRanges(sample_end).overlap(PyRanges(parent_end))
    both_ends = pd.merge(
        start_hits.df,
        end_hits.df['SmapEntryID'],
        on=['SmapEntryID'],
    ).drop_duplicates()

    if both_ends.empty:
        filtered_sample_frame[inheritance] = "False"
        return filtered_sample_frame

    # No explicit key: pandas joins on every column the two frames share.
    # The merge indicator is stored under the inheritance column name and
    # then collapsed to 'True' (row matched) / 'False'.
    annotated = pd.merge(filtered_sample_frame, both_ends, how='left',
                         indicator=inheritance)
    annotated[inheritance] = np.where(annotated[inheritance] == 'both',
                                      'True', 'False')
    return annotated.drop_duplicates().reset_index(drop=True)
def geneOverlapTransloInv(args, sample_start, sample_end, sample_frame):
    """Attach gene annotations for the start and end breakends of each call.

    Genes are read from the BED file ``args.genes``.  ``sample_start`` and
    ``sample_end`` are joined against its ``Name``/``Score`` columns.
    Start-breakend hits are reported in ``Name``/``Score`` and end-breakend
    hits in ``Name2``/``Score2``; a side with no hits at all is filled with
    the string ``'None'``.

    Returns ``sample_frame`` with the four annotation columns.  When neither
    side has hits the input frame is modified in place; otherwise a merged
    frame (joined on ``SmapEntryID``) is returned.
    """
    start_columns = ['SmapEntryID', 'Name', 'Score']
    end_columns = ['SmapEntryID', 'Name2', 'Score2']
    end_rename = {'Name': 'Name2', 'Score': 'Score2'}

    gene_frame = pr.read_bed(args.genes)
    gene_start = PyRanges(sample_start).join(
        gene_frame[["Name", "Score"]]).drop(like="_b")
    gene_end = PyRanges(sample_end).join(
        gene_frame[["Name", "Score"]]).drop(like="_b")
    start_df = gene_start.df
    end_df = gene_end.df

    if start_df.empty and end_df.empty:
        sample_frame['Name'] = sample_frame['Name2'] = sample_frame[
            'Score'] = sample_frame['Score2'] = 'None'
    elif start_df.empty:
        # The columns are renamed *before* filtering, so the filter has to
        # ask for the renamed names; filtering on 'Name' here used to drop
        # the end-breakend annotation entirely.
        sample_frame = end_df.rename(columns=end_rename).filter(
            items=end_columns).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name'] = sample_frame['Score'] = 'None'
    elif end_df.empty:
        sample_frame = start_df.filter(
            items=start_columns).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        sample_frame['Name2'] = sample_frame['Score2'] = 'None'
    else:
        sample_frame = start_df.filter(
            items=start_columns).drop_duplicates().merge(
                sample_frame, on=['SmapEntryID'], how='right')
        # NOTE(review): unlike the other branches the end-breakend hits are
        # not de-duplicated before this merge -- confirm that is intended.
        sample_frame = sample_frame.merge(
            end_df.rename(columns=end_rename).filter(items=end_columns),
            on=['SmapEntryID'], how='left')
    return sample_frame
def chip_10_plus_one(names):
    """Load ``tests/chip_10_plus_one.bed`` as a stranded, row-shuffled PyRanges.

    ``names`` supplies the column names for the header-less file.  The rows
    of the underlying frame are put into a random order using NumPy's
    global random state.
    """
    table = pd.read_table("tests/chip_10_plus_one.bed", header=None,
                          names=names)
    ranges = PyRanges(table)
    assert ranges.stranded
    shuffled_index = np.random.permutation(ranges.df.index)
    ranges.df = ranges.df.reindex(shuffled_index)
    return ranges
def checkRefOverlap(sample_copy, ref_copy, sample_frame):
    """Drop rows of ``sample_frame`` that reciprocally overlap the reference.

    ``reciprocal_overlap`` is run on the two copies; every ``SmapEntryID``
    it reports is removed from ``sample_frame``.  When there is no overlap
    the input frame is returned unchanged.
    """
    hits = reciprocal_overlap(PyRanges(sample_copy), PyRanges(ref_copy))
    if hits.empty:
        return sample_frame

    matched = sample_copy.merge(hits, on=['SmapEntryID'])
    seen_in_reference = sample_frame.SmapEntryID.isin(matched.SmapEntryID)
    return sample_frame[~seen_in_reference]
def _getitem(self, val):
    """Subset ``self`` by a string, tuple or slice and wrap it in a PyRanges.

    Raises ``Exception`` for any other subsetter type.  When the selected
    helper returns ``None`` an empty PyRanges is returned instead.
    """
    handlers = (
        (str, get_string),
        (tuple, get_tuple),
        (slice, get_slice),
    )
    for accepted_type, handler in handlers:
        if isinstance(val, accepted_type):
            selected = handler(self, val)
            break
    else:
        raise Exception("Not valid subsetter: {}".format(str(val)))

    if selected is None:
        return PyRanges({})
    return PyRanges(selected)
def checkRefOverlapINVBND(sample_start, sample_end, ref_start, ref_end,
                          sample_frame):
    """Drop rows whose start *and* end breakends both overlap the reference.

    Start and end ranges are overlapped separately; an ``ID`` is removed from
    ``sample_frame`` only when it occurs in both overlap results.  If nothing
    qualifies, ``sample_frame`` is returned unchanged.
    """
    start_hits = PyRanges(sample_start).overlap(PyRanges(ref_start))
    end_hits = PyRanges(sample_end).overlap(PyRanges(ref_end))

    # Keep this guard: the merge below is only attempted when at least one
    # side produced hits.
    if start_hits.df.empty and end_hits.df.empty:
        return sample_frame

    both_ends = start_hits.df.merge(end_hits.df, on=['ID'])
    if both_ends.empty:
        return sample_frame

    matched = sample_frame.merge(both_ends, on=['ID'])
    seen_in_reference = sample_frame.ID.isin(matched.ID)
    return sample_frame[~seen_in_reference]
def dfs_min(draw):
    """Draw a frame from ``better_dfs_min`` and return it as a shuffled PyRanges.

    ``End`` is turned into ``Start + End``, constant ``Name``/``Score``
    columns are inserted at positions 3 and 4, and the rows of the
    underlying frame are permuted with a drawn seed.
    """
    frame = draw(better_dfs_min)
    frame.loc[:, "End"] += frame.Start
    frame.insert(3, "Name", "a")
    frame.insert(4, "Score", 0)
    ranges = PyRanges(frame)

    # Users may order their underlying dataframes however they like and the
    # PyRanges functionality must still work, so shuffle the rows with a
    # seed taken from the strategy.
    seed = draw(st.integers(min_value=0, max_value=int(1e6)))
    np.random.seed(seed)
    row_order = np.random.permutation(frame.index.values)
    ranges.df = frame.reindex(row_order)
    return ranges
def to_ranges(grles, nb_cpu=1):
    """Convert every item of ``grles`` to a dataframe and build a PyRanges.

    Arguments:
        grles: object exposing ``stranded`` and ``items()``; each
            ``(key, value)`` pair is converted with ``to_ranges_df_strand``
            or ``to_ranges_df_no_strand`` depending on ``grles.stranded``.
        nb_cpu: when greater than 1 the conversions are dispatched through
            ``ray`` with that many CPUs; otherwise they run serially in
            this process.

    Returns:
        A ``PyRanges`` built from the ``{key: dataframe}`` mapping.
    """
    from pyranges import PyRanges

    func = to_ranges_df_strand if grles.stranded else to_ranges_df_no_strand

    if nb_cpu > 1:
        import ray

        ray.init(num_cpus=nb_cpu)
        try:
            remote_func = ray.remote(func)
            keys, pending = [], []
            for k, v in grles.items():
                keys.append(k)
                pending.append(remote_func.remote(v, k))
            results = ray.get(pending)
        finally:
            # Always release the ray runtime, even when a task fails.
            ray.shutdown()
    else:
        # Call the converter directly instead of attaching a ``remote``
        # attribute to the shared module-level function.
        keys, results = [], []
        for k, v in grles.items():
            keys.append(k)
            results.append(func(v, k))

    return PyRanges(dict(zip(keys, results)))
def _getitem(self, val):
    """Subset ``self`` and return the selection as a new PyRanges.

    Accepted subsetters are a list (handled by ``_keep``), a string, a
    tuple, a slice, a dict (handled by ``get_booldict``), or a boolean
    ``pd.Series`` / ``np.ndarray`` whose length equals ``len(self)``.
    Anything else raises ``Exception``.
    """
    is_bool_vector = (isinstance(val, (pd.Series, np.ndarray))
                      and val.dtype == "bool")

    if isinstance(val, list):
        dfs = _keep(self, keep=val).dfs
    elif isinstance(val, str):
        dfs = get_string(self, val)
    elif isinstance(val, tuple):
        dfs = get_tuple(self, val)
    elif isinstance(val, slice):
        dfs = get_slice(self, val)
    elif isinstance(val, dict):
        dfs = get_booldict(self, val)
    elif is_bool_vector:
        assert len(val) == len(
            self), "Boolean indexer must be same length as pyrange!"
        flags = val.values if isinstance(val, pd.Series) else val

        # Walk the per-key frames in iteration order, consuming one
        # consecutive chunk of the flat boolean vector for each of them.
        dfs = {}
        offset = 0
        for key, frame in self:
            stop = offset + len(frame)
            dfs[key] = frame[flags[offset:stop]]
            offset = stop
    else:
        raise Exception("Not valid subsetter: {}".format(str(val)))

    return PyRanges(dfs)
def load(self, fp: str) -> None:
    """Load PAM protection variants from the VCF at ``fp``.

    Records whose ``SGRNA`` INFO value (stripped) is a key of
    ``self._variants`` are added to that key's collection; other records
    are ignored.  Afterwards the totals are logged and ``self._ranges`` is
    rebuilt as a PyRanges with one row per stored variant.
    """
    with get_vcf(fp) as variant_file:
        for record in variant_file.fetch():
            sgrna_id: str = record.info['SGRNA'].strip()
            if sgrna_id not in self._variants:
                continue
            self._variants[sgrna_id].add(
                PamVariant.from_variant_record(record))

    # Report what was collected, including sgRNAs left without variants.
    logging.debug("Collected %d PAM protection variants.", self.count)
    for sgrna_id in self.sgrna_ids:
        if not self._variants[sgrna_id]:
            logging.info("No PAM protection variants for sgRNA %s.", sgrna_id)

    # Build the genomic range table: one record per (sgRNA, variant) pair.
    records = [
        (*variant.get_pyrange_record(), sgrna_id)
        for sgrna_id, variants in self._variants.items()
        for variant in variants
    ]
    df: pd.DataFrame = pd.DataFrame.from_records(
        records, columns=['Chromosome', 'Start', 'End', 'sgrna_id'])
    df.sgrna_id = df.sgrna_id.astype('category')
    df['variant_id'] = get_id_column(df.shape[0])
    self._ranges = PyRanges(df)
def dfs_min(draw):  # nosec
    """Draw a frame from ``better_dfs_min`` and return it as a PyRanges.

    ``End`` is turned into ``Start + End`` and constant ``Name``/``Score``
    columns are inserted at positions 3 and 4.  The PyRanges is built with
    ``int64=True``.
    """
    frame = draw(better_dfs_min)
    frame.loc[:, "End"] += frame.Start
    frame.insert(3, "Name", "a")
    frame.insert(4, "Score", 0)

    # Row order is intentionally left alone: do not shuffle the underlying
    # frame by hand here, use the PyRanges sort instead.
    return PyRanges(frame, int64=True)
def chip_chr1():
    """Return ``coverage`` of a two-interval stranded chr1 ChIP fixture."""
    table_text = """Chromosome Start End Strand
chr1 5 7 +
chr1 3 10 -"""
    frame = pd.read_table(StringIO(table_text), sep=r"\s+")
    return coverage(PyRanges(frame))
def background_chr1():
    """Return ``coverage`` of a two-interval stranded chr1 background fixture."""
    table_text = """Chromosome Start End Strand
chr1 1 4 +
chr1 2 5 -"""
    frame = pd.read_table(StringIO(table_text), sep=r"\s+")
    return coverage(PyRanges(frame))
def test_subtraction(gr, gr2, strandedness):
    """Check ``gr.subtraction(gr2)`` against the output of bedtools.

    Both inputs are written to temporary BED files, the module-level
    ``subtraction_command`` template is run through bash, and its output is
    parsed into a six-column frame.  When bedtools reports rows the two
    results must compare equal via ``assert_df_equal``; otherwise both
    results must be empty.
    """
    print("gr\n", gr)
    print("gr2\n", gr2)

    bedtools_strand = {False: "", "same": "-s", "opposite": "-S"}[strandedness]
    bed_columns = "Chromosome Start End Name Score Strand".split()

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.bed".format(temp_dir)
        f2 = "{}/f2.bed".format(temp_dir)
        gr.df.to_csv(f1, sep="\t", header=False, index=False)
        gr2.df.to_csv(f2, sep="\t", header=False, index=False)

        cmd = subtraction_command.format(bedtools_strand, f1, f2)
        # The command template is executed by bash; it is built only from
        # the strand flag above and the two temporary file paths.
        result = subprocess.check_output(cmd, shell=True,
                                         executable="/bin/bash").decode()

        # ``squeeze=True`` was dropped: with six named columns it never had
        # an effect, and the keyword no longer exists in pandas 2.x.
        bedtools_df = pd.read_table(StringIO(result), header=None,
                                    names=bed_columns)

    result = gr.subtraction(gr2, strandedness=strandedness)

    print("result\n", result)
    print("bedtools_df\n", PyRanges(bedtools_df))

    if not bedtools_df.empty:
        assert_df_equal(result.df, bedtools_df)
    else:
        assert bedtools_df.empty == result.df.empty
def methylation_pyranges_from_csv(inputfile):
    """Read a tab-separated methylation table into a PyRanges.

    The first line of ``inputfile`` is treated as a header and replaced by
    the names below; only file columns 0, 1, 2, 4 and 5 are loaded.
    """
    column_names = ["Chromosome", "Start", "End", "calls", "methylated"]
    kept_columns = [0, 1, 2, 4, 5]
    table = pd.read_csv(
        inputfile,
        sep="\t",
        names=column_names,
        header=0,
        usecols=kept_columns,
    )
    return PyRanges(table)
def exonOverlap(args, df):
    """Return exon-overlapping calls annotated with gene-list scores.

    ``df`` is joined against the exon BED file ``args.exons``.  The
    coordinate columns are dropped, ``Name``/``Score`` are renamed to
    ``gene``/``OMIM_syndrome``, duplicates are removed, and the result is
    left-merged with ``args.genelist`` on ``gene``.  Missing ``score`` and
    ``normalized_score`` values become 0 and rows are sorted by ``score``
    in descending order.

    When nothing overlaps an exon an empty ``DataFrame`` is returned.
    """
    exon_frame = pr.read_bed(args.exons)
    exon_overlap = PyRanges(df).join(exon_frame).drop(like="_b")

    if exon_overlap.df.empty:
        # A column-less frame cannot be merged on 'gene' (pandas raises
        # KeyError), so hand the empty result back right away.
        return pd.DataFrame()

    exon_calls = exon_overlap.df.drop(
        columns=['Chromosome', 'Start', 'End']).rename(columns={
            'Name': 'gene',
            'Score': 'OMIM_syndrome'
        }).drop_duplicates()

    # NOTE(review): args.genelist is used directly as a merge operand, so it
    # is presumably an already-loaded DataFrame with a 'gene' column --
    # confirm against the caller.
    exon_calls = exon_calls.merge(args.genelist, on=['gene'], how='left')
    exon_calls = exon_calls.fillna(value={'score': 0, 'normalized_score': 0})
    return exon_calls.sort_values(by='score', ascending=False)
def f1(names):
    """Load ``tests/f1.bed`` as a PyRanges with the six standard BED columns.

    ``names`` is accepted but not used; the column names are fixed here.
    """
    bed_columns = "Chromosome Start End Name Score Strand".split()
    table = pd.read_csv("tests/f1.bed", sep="\t", header=None,
                        names=bed_columns)
    return PyRanges(table)
def test_cds_context_repository_get_cds_genomic_ranges(strand, len5p, len3p,
                                                       exp_cds_pre,
                                                       exp_cds_suf):
    """``get_cds_genomic_ranges`` returns the expected prefix/suffix ranges.

    The lookup uses transcript ``TID``, exon index 1, target range ``GR``,
    zero for the two arguments that follow it, and the given 5'/3' lengths.
    """
    repository = CDSContextRepository(PyRanges(df=CDS_RANGES_DF))
    transcript_id = TID
    exon_index = 1

    observed = repository.get_cds_genomic_ranges(
        transcript_id, strand, exon_index, GR, 0, 0, len5p, len3p)
    cds_pre, cds_suf = observed

    assert cds_pre == exp_cds_pre
    assert cds_suf == exp_cds_suf
def genomic_ranges_to_unstranded_pyranges(
        genomic_ranges: Iterable[GenomicRange]) -> PyRanges:
    """Build a de-duplicated, unstranded PyRanges from genomic ranges.

    Each range contributes the record returned by its
    ``as_unstranded_pyrange()``; the records are loaded into
    ``Chromosome``/``Start``/``End`` columns and duplicate rows are dropped.
    """
    records = [
        genomic_range.as_unstranded_pyrange()
        for genomic_range in genomic_ranges
    ]
    frame = pd.DataFrame.from_records(
        chain(records), columns=['Chromosome', 'Start', 'End'])
    return PyRanges(df=frame.drop_duplicates())
def chip_10(names):
    """Load ``tests/chip_10.bed`` as a PyRanges and require it to be stranded.

    ``names`` supplies the column names for the header-less file.
    """
    table = pd.read_csv("tests/chip_10.bed", header=None, names=names,
                        sep="\t")
    ranges = PyRanges(table)
    assert ranges.stranded
    return ranges
def input_10(names):
    """Load ``tests/input_10.bed`` as a PyRanges and require it to be stranded.

    ``names`` supplies the column names for the header-less file.
    """
    table = pd.read_table("tests/input_10.bed", header=None, names=names)
    ranges = PyRanges(table)
    assert ranges.stranded
    return ranges
def introns():
    """Load ``tests/intron.txt`` as a PyRanges.

    The first three columns are renamed to ``Chromosome``, ``Start`` and
    ``End``; any further columns keep their positional labels.  The head,
    shape and final column labels are printed along the way.
    """
    table = pd.read_table("tests/intron.txt", sep="\t", header=None)
    print(table.head())
    print(table.shape)

    trailing_labels = list(table.columns[3:])
    table.columns = ["Chromosome", "Start", "End"] + trailing_labels
    print(table.columns)

    return PyRanges(table)
def _getitem(self, val):
    """Subset ``self`` and return the selection as a new PyRanges.

    A list is handled by ``_drop(self, keep=val)``; strings, tuples, slices
    and dicts go to their ``get_*`` helpers.  Any other subsetter raises
    ``Exception``.  A ``None`` selection yields an empty PyRanges.
    """
    if isinstance(val, list):
        selected = _drop(self, keep=val).dfs
    else:
        handlers = (
            (str, get_string),
            (tuple, get_tuple),
            (slice, get_slice),
            (dict, get_booldict),
        )
        for accepted_type, handler in handlers:
            if isinstance(val, accepted_type):
                selected = handler(self, val)
                break
        else:
            raise Exception("Not valid subsetter: {}".format(str(val)))

    if selected is None:
        return PyRanges({})
    return PyRanges(selected)
def test_cds_context_repository_compute_cds_contexts(start, end, exp_ext_5,
                                                     exp_ext_3):
    """CDS contexts computed for one registered target match expectations.

    A single '+'-strand target on chromosome 'X' is registered; the test
    then checks the stored exon information, both CDS extensions and the
    repository's retrieval helpers.
    """
    chromosome = 'X'
    strand = '+'

    def check_extension(actual, expected_bounds):
        # ``None`` bounds mean no extension is expected on that side.
        if expected_bounds is not None:
            assert actual == GenomicRange(chromosome, *expected_bounds, strand)
        else:
            assert actual is None

    cds_ranges = PyRanges(df=CDS_RANGES_DF)
    gr = GenomicRange(chromosome, start, end, strand)
    target_frame = pd.DataFrame.from_records([gr.as_pyrange()],
                                             columns=PYRANGES_FIELDS)
    target_ranges = PyRanges(df=target_frame)
    target_ranges.is_const = False

    # Initialise the repository and compute the CDS contexts
    ccr = CDSContextRepository(cds_ranges)
    ccr.register_target_ranges(target_ranges)
    ccr.compute_cds_contexts()

    # Exactly one context, keyed by the target range
    assert len(ccr._target_cds_contexts) == 1
    exon_info, (ext_5, ext_3) = ccr._target_cds_contexts[gr]

    # Exon information
    assert isinstance(exon_info, ExonInfo)
    assert exon_info.gene_id == GID
    assert exon_info.transcript_id == TID

    # CDS extensions
    check_extension(ext_5, exp_ext_5)
    check_extension(ext_3, exp_ext_3)

    # Information retrieval
    assert ccr.get_cds_extensions(gr) == (ext_5, ext_3)
    assert ccr.get_exon_info(gr) == exon_info
    assert ccr.get_transcript_info(gr) == exon_info.transcript_info
def expected_result_previous_bed_unstranded():
    """Return the expected three-row result table as a PyRanges.

    The table is parsed from an inline, single-space separated literal whose
    data rows carry a leading row label; the parsed frame is printed before
    it is wrapped.
    """
    table_text = """Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance
0 chr1 3 6 h 0 + 1 2 f 0 + 2
1 chr1 5 7 h 0 - 6 7 f 0 - 0
2 chr1 8 9 h 0 + 6 7 f 0 - 2"""
    frame = pd.read_table(StringIO(table_text), sep=" ", header=0)
    print(frame)
    return PyRanges(frame)
def test_cds_context_repository_get_cds_by_index():
    """``get_cds_by_index`` returns the first CDS and rejects a bad index."""
    (chromosome, strand, start, end, _unused_a, transcript_id, _unused_b,
     exon_index) = CDS_RANGES[0]
    ccr = CDSContextRepository(PyRanges(df=CDS_RANGES_DF))

    gr = ccr.get_cds_by_index(transcript_id, exon_index)
    expected_fields = (
        ('chromosome', chromosome),
        ('strand', strand),
        ('start', start),
        ('end', end),
    )
    for field_name, expected_value in expected_fields:
        assert getattr(gr, field_name) == expected_value

    # An exon index that does not exist must raise
    with pytest.raises(Exception):
        ccr.get_cds_by_index(transcript_id, 999)
def read_bam_bin_counts(bins: PyRanges, bams: Dict[str, str], excluded: PyRanges = None, **kwargs) -> AnnData:
    """Count reads in bins for every BAM file.

    Parameters
    ----------
    bins : pyranges.PyRanges
        bins in which to count reads
    bams : Dict[str, str]
        BAM filenames keyed by cell id
    excluded : pyranges.PyRanges, optional
        genomic regions; reads intersecting them are removed before counting
    **kwargs
        forwarded to ``pr.read_bam``

    Returns
    -------
    ad.AnnData
        binned read counts with one observation per cell and one variable
        per bin
    """
    def _indexed_frame(ranges):
        # Convert to a frame and attach the bin index in one step.
        return _add_bin_index(_convert_pyranges(ranges))

    bin_data = _indexed_frame(bins)

    reads_per_cell = {}
    for cell_id, cell_bam in bams.items():
        logging.info(f"reading {cell_bam}")
        reads = pr.read_bam(cell_bam, **kwargs)

        if excluded is not None:
            logging.info("excluding reads")
            reads = reads.intersect(excluded, invert=True)

        logging.info("count overlaps")
        reads = reads.intersect(bins, how='containment')
        counts = _indexed_frame(
            bins.count_overlaps(reads, overlap_col='reads'))
        reads_per_cell[cell_id] = counts['reads']

    cn_matrix = pd.DataFrame(reads_per_cell)
    cell_data = pd.DataFrame(
        {'cell_id': cn_matrix.columns.values}).set_index('cell_id')

    return ad.AnnData(cn_matrix.T, obs=cell_data, var=bin_data)
def calc_windowed_seg_sites(self, chrom=0, L=1e3, filt_rec=True, mask=None):
    """Calculate windowed estimates of segregating sites.

    Arguments:
        * chrom: identifier for the chromosome
        * L: length of independent locus (the window size used for binning
          the physical positions)
        * filt_rec: if true, keep only sites whose recombination position
          differs from that of the following site
        * mask: bed file for the underlying mask (optional)

    The result is stored in ``self.chrom_total_dict[chrom]`` as a stacked
    array with four rows: weighted site counts per window, right window
    edges, interpolated recombination positions at the window midpoints,
    and mask weights (all ones when no mask is given).
    """
    assert self.chrom_pos_dict is not None
    phys_pos = self.chrom_physpos_dict[chrom]
    rec_pos = self.chrom_pos_dict[chrom]
    weights = self.chrom_weight_dict[chrom]

    if filt_rec:
        diff = np.abs(rec_pos[:-1] - rec_pos[1:])
        idx = np.where(diff != 0)[0]
        phys_pos = phys_pos[idx]
        rec_pos = rec_pos[idx]
        weights = weights[idx]

    df_mask = None
    if mask is not None:
        # Mask must be a bedfile; it is read once and reused further down.
        df_mask = pyranges.read_bed(mask)
        # Work on a float copy so masked sites can be flagged with NaN.
        phys_pos = phys_pos.astype(np.float64)
        df_pos = PyRanges(chromosomes=chrom, starts=phys_pos,
                          ends=(phys_pos + 1))
        cov_sites = df_pos.coverage(df_mask)
        sites_idx = np.array(cov_sites.FractionOverlaps.astype(np.float32))
        idx = np.where(sites_idx > 0.0)[0]
        phys_pos[idx] = np.nan

    valid = ~np.isnan(phys_pos)

    # 1. Setup the bins for the analysis
    bins = np.arange(np.nanmin(phys_pos), np.nanmax(phys_pos), L)
    windowed_vars, bin_edges = np.histogram(
        phys_pos[valid],
        bins=bins,
        weights=weights[valid],
    )
    bin_edges = bin_edges.astype(np.uint32)

    # Interpolate the midpoints of the recombination bins.  Only unmasked
    # sites are used: interp1d's behaviour is undefined when NaNs are
    # present in its inputs.
    f = interpolate.interp1d(phys_pos[valid], rec_pos[valid])
    midpts = bin_edges[:-1] + (bin_edges[1:] - bin_edges[:-1]) / 2
    rec_midpts = f(midpts)

    # Calculate the weightings from the mask as needed ...
    mask_weights = np.ones(rec_midpts.size)
    if df_mask is not None:
        df_windows = PyRanges(
            chromosomes=chrom, starts=bin_edges[:-1], ends=bin_edges[1:]
        )
        cov = df_windows.coverage(df_mask)
        mask_weights = np.array(cov.FractionOverlaps.astype(np.float32))
        # Set the mask weights to scale up the fraction that may be missing!
        # Fully masked windows divide by zero on purpose and become NaN.
        with np.errstate(divide="ignore"):
            mask_weights = 1.0 / (1.0 - mask_weights)
        mask_weights[np.isinf(mask_weights)] = np.nan

    # Stacking all of the data to make sure that we can use it later on
    tot_data = np.vstack([windowed_vars, bin_edges[1:], rec_midpts,
                          mask_weights])
    self.chrom_total_dict[chrom] = tot_data
def expected_result_previous_bed_opposite_stranded(names):
    """Return the expected two-row opposite-strand result as a PyRanges.

    ``names`` is accepted but not used; the twelve column names are fixed
    here.  The parsed frame is printed before it is wrapped.
    """
    table_text = """chr1 8 9 h 0 + 6 7 f 0 - 2
chr1 5 7 h 0 - 1 2 f 0 + 4"""
    column_names = (
        "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance"
        .split())
    frame = pd.read_table(StringIO(table_text), sep=" ", header=None,
                          names=column_names)
    print(frame)
    return PyRanges(frame)
def checkParentsOverlap(sample_copy, parent_copy, filtered_sample_frame, args,
                        inheritance):
    """Annotate calls with whether they reciprocally overlap a parent's calls.

    The result is written to the column named by ``inheritance``:

    * ``'None'`` when ``args.type`` is ``'singleton'``, or when it is
      ``'duo'`` and the flag for the *other* parent (``args.mother_duo`` for
      ``'Found_in_Father'``, ``args.father_duo`` for ``'Found_in_Mother'``)
      is truthy.  The input frame is modified in place.
    * the string ``"False"`` for every row when ``reciprocal_overlap``
      reports nothing (also modifies the input in place).
    * boolean ``True`` / ``False`` per row otherwise, in a new,
      de-duplicated frame with a fresh index.

    Returns the annotated frame.
    """
    is_duo = args.type == 'duo'
    parent_unavailable = (
        args.type == 'singleton'
        or (is_duo and inheritance == 'Found_in_Father' and args.mother_duo)
        or (is_duo and inheritance == 'Found_in_Mother' and args.father_duo)
    )
    if parent_unavailable:
        filtered_sample_frame[inheritance] = 'None'
        return filtered_sample_frame

    colnames = [
        'SmapEntryID', 'RefcontigID1', 'RefcontigID2', 'RefStartPos',
        'RefEndPos', 'QryStartPos', 'QryEndPos', 'Confidence', 'Type',
        'Zygosity', 'Genotype'
    ]
    overlaps = reciprocal_overlap(PyRanges(sample_copy),
                                  PyRanges(parent_copy))
    found_in_parent = overlaps[colnames]

    if found_in_parent.empty:
        # NOTE(review): this branch stores the string "False" while the
        # branch below stores booleans -- confirm downstream code copes.
        filtered_sample_frame[inheritance] = "False"
        return filtered_sample_frame

    # No explicit key: pandas joins on every column the two frames share.
    # The merge indicator is stored under the inheritance column name and
    # then collapsed to a boolean (row matched or not).
    annotated = pd.merge(filtered_sample_frame, found_in_parent, how='left',
                         indicator=inheritance)
    annotated[inheritance] = np.where(annotated[inheritance] == 'both',
                                      True, False)
    return annotated.drop_duplicates().reset_index(drop=True)